Using the ESP32-CAM for smart image recognition has always been an interesting topic, and there are currently three main ways to go about it.
My hands-on ESP32 book: I published one! ESP32 物聯網專題 (ESP32 IoT Projects)
Books.com.tw link: https://www.books.com.tw/products/0010901195
I have previously shown how to pull video from several ESP32-CAMs into a .NET environment for analysis (ESP32-CAM mpeg Video Stream 用.net接收並錄製影片檔), but the smart-recognition back end was always Microsoft Azure's Cognitive Services AI API. This time, drawing on articles found online, I finally managed to feed the ESP32-CAM stream into Python and analyze it with the currently popular YOLO v3 object detection model.
Before starting, please read the following articles first:

Now for the key point: how to receive the ESP32-CAM video in Python. For an ordinary webcam, a single line of OpenCV, cap = cv.VideoCapture(0), does the job, but the ESP32-CAM serves an MJPEG video stream, so that method does not work here. It turns out you have to rely on the JPEG format itself: every JPEG begins with the bytes FF D8 and ends with FF D9, so you can search the stream buffer for those two markers, cut out one complete JPEG, and hand it to YOLO for analysis. A quick sketch of that marker trick follows; after it, the article is split into two topics.
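To make the marker idea concrete before the full program, here is a minimal sketch of cutting one complete JPEG out of a raw byte buffer. The buffer contents are made up for illustration only:

# Minimal sketch: extracting one JPEG from a raw byte buffer.
# The payload bytes here are fake; a real stream carries actual JPEG data.
buf = b'\x00\x01' + b'\xff\xd8' + b'<jpeg payload>' + b'\xff\xd9' + b'\x02'

head = buf.find(b'\xff\xd8')   # SOI (start of image) marker
end = buf.find(b'\xff\xd9')    # EOI (end of image) marker
if head > -1 and end > -1:
    jpg = buf[head:end + 2]    # keep the two EOI bytes
    buf = buf[end + 2:]        # leftover bytes belong to the next frame
    print(jpg)                 # b'\xff\xd8<jpeg payload>\xff\xd9'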
The first part only receives the ESP32-CAM video in Python, with no other processing. Sample program:
import cv2 as cv
import numpy as np
from urllib.request import urlopen

# Change to your ESP32-CAM IP
url = "http://192.168.1.149:9601/stream"
CAMERA_BUFFER_SIZE = 4096

stream = urlopen(url)
bts = b''
i = 0

while True:
    try:
        bts += stream.read(CAMERA_BUFFER_SIZE)
        jpghead = bts.find(b'\xff\xd8')  # JPEG start-of-image marker
        jpgend = bts.find(b'\xff\xd9')   # JPEG end-of-image marker
        if jpghead > -1 and jpgend > -1:
            jpg = bts[jpghead:jpgend + 2]   # one complete JPEG frame
            bts = bts[jpgend + 2:]          # keep the leftover bytes for the next frame
            img = cv.imdecode(np.frombuffer(jpg, dtype=np.uint8), cv.IMREAD_UNCHANGED)
            # img = cv.flip(img, 0)  # 0: flip vertically, >0: flip horizontally, <0: flip both
            # h, w = img.shape[:2]
            # print('Image size height:' + str(h) + ' width:' + str(w))
            img = cv.resize(img, (640, 480))
            cv.imshow("a", img)
    except Exception as e:
        print("Error:" + str(e))
        bts = b''
        stream = urlopen(url)  # reconnect and start over with an empty buffer
        continue
    k = cv.waitKey(1)
    # Press 'a' to save a snapshot
    if k & 0xFF == ord('a'):
        cv.imwrite(str(i) + ".jpg", img)
        i = i + 1
    # Press 'q' to quit
    if k & 0xFF == ord('q'):
        break

cv.destroyAllWindows()
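Incidentally, if you want to reuse this capture loop in other scripts, the marker-hunting logic can be folded into a small generator so it lives in one place. This is just a refactoring sketch of the code above, not part of the original program:

import cv2 as cv
import numpy as np
from urllib.request import urlopen

def mjpeg_frames(url, buffer_size=4096):
    """Yield decoded frames from an MJPEG stream, one complete JPEG at a time."""
    stream = urlopen(url)
    bts = b''
    while True:
        bts += stream.read(buffer_size)
        head = bts.find(b'\xff\xd8')
        end = bts.find(b'\xff\xd9')
        if head > -1 and end > -1:
            jpg, bts = bts[head:end + 2], bts[end + 2:]
            yield cv.imdecode(np.frombuffer(jpg, dtype=np.uint8), cv.IMREAD_UNCHANGED)

for frame in mjpeg_frames("http://192.168.1.149:9601/stream"):
    cv.imshow("a", frame)
    if cv.waitKey(1) & 0xFF == ord('q'):
        break
cv.destroyAllWindows()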

The second part feeds the ESP32-CAM video above into the well-known YOLO model in Python for object detection. Sample program:
# This code is written at BigVision LLC. It is based on the OpenCV project. It is subject to the license terms in the LICENSE file found in this distribution and at http://opencv.org/license.html
# Usage example: python3 object_detection_yolo.py --video=run.mp4
#                python3 object_detection_yolo.py --image=bird.jpg
import cv2 as cv
import argparse
import numpy as np
from urllib.request import urlopen

# Initialize the parameters
confThreshold = 0.5  # Confidence threshold
nmsThreshold = 0.4   # Non-maximum suppression threshold
inpWidth = 320       # Width of network's input image; 320*320 is faster
inpHeight = 320      # Height of network's input image; 608*608 is more accurate

parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV')
parser.add_argument('--image', help='Path to image file.')
parser.add_argument('--video', help='Path to video file.')
args = parser.parse_args()

# Load names of classes
classesFile = "YOLO\\coco.names"
classes = None
with open(classesFile, 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')

# Give the configuration and weight files for the model and load the network using them.
modelConfiguration = "YOLO\\yolov3.cfg"
modelWeights = "YOLO\\yolov3.weights"
net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
# cv.dnn.DNN_TARGET_OPENCL switches to the GPU (Intel GPUs only);
# without one, OpenCV falls back to the CPU automatically.
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

# Get the names of the output layers
def getOutputsNames(net):
    # Get the names of all the layers in the network
    layersNames = net.getLayerNames()
    # Get the names of the output layers, i.e. the layers with unconnected outputs
    return [layersNames[i - 1] for i in net.getUnconnectedOutLayers().flatten()]

# Draw the predicted bounding box (on the global frame)
def drawPred(classId, conf, left, top, right, bottom):
    # Draw a bounding box.
    cv.rectangle(frame, (left, top), (right, bottom), (255, 178, 50), 3)
    label = '%.2f' % conf
    # Get the label for the class name and its confidence
    if classes:
        assert (classId < len(classes))
        label = '%s:%s' % (classes[classId], label)
    # Display the label at the top of the bounding box
    labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
    top = max(top, labelSize[1])
    cv.rectangle(frame, (left, top - round(1.5 * labelSize[1])), (left + round(1.5 * labelSize[0]), top + baseLine), (255, 255, 255), cv.FILLED)
    cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 0), 1)

# Remove the bounding boxes with low confidence using non-maxima suppression
def postprocess(frame, outs):
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]
    # Scan through all the bounding boxes output from the network and keep only the
    # ones with high confidence scores. Assign the box's class label as the class with the highest score.
    classIds = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            classId = np.argmax(scores)
            confidence = scores[classId]
            if confidence > confThreshold:
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                classIds.append(classId)
                confidences.append(float(confidence))
                boxes.append([left, top, width, height])
    # Perform non maximum suppression to eliminate redundant overlapping boxes with
    # lower confidences.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    for i in np.array(indices).flatten():
        box = boxes[i]
        left = box[0]
        top = box[1]
        width = box[2]
        height = box[3]
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)

# Process inputs
winName = 'Deep learning object detection in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)
outputFile = "yolo_out_py.avi"

# Webcam input
url = "http://192.168.1.149:9601/stream"
CAMERA_BUFFER_SIZE = 4096
stream = urlopen(url)
bts = b''

# Get the video writer initialized to save the output video
# if (not args.image):
#     vid_writer = cv.VideoWriter(outputFile, cv.VideoWriter_fourcc('M','J','P','G'), 30, (round(cap.get(cv.CAP_PROP_FRAME_WIDTH)), round(cap.get(cv.CAP_PROP_FRAME_HEIGHT))))

while cv.waitKey(1) < 0:
    bts += stream.read(CAMERA_BUFFER_SIZE)
    jpghead = bts.find(b'\xff\xd8')
    jpgend = bts.find(b'\xff\xd9')
    if jpghead > -1 and jpgend > -1:
        jpg = bts[jpghead:jpgend + 2]
        bts = bts[jpgend + 2:]
        img = cv.imdecode(np.frombuffer(jpg, dtype=np.uint8), cv.IMREAD_UNCHANGED)
        frame = cv.flip(img, -1)  # flip both axes (rotate 180 degrees)
        frame = cv.resize(frame, (1024, 768))
        # Create a 4D blob from the frame.
        blob = cv.dnn.blobFromImage(frame, 1 / 255, (inpWidth, inpHeight), [0, 0, 0], 1, crop=False)
        net.setInput(blob)
        # Runs the forward pass to get output of the output layers
        outs = net.forward(getOutputsNames(net))
        # Remove the bounding boxes with low confidence
        postprocess(frame, outs)
        # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
        t, _ = net.getPerfProfile()
        label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
        cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
        cv.imshow(winName, frame)
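Note that the script above expects a YOLO folder next to it holding coco.names, yolov3.cfg, and yolov3.weights; these are not shipped with OpenCV. A minimal fetch sketch follows, assuming the files are still hosted at the official Darknet repository and site (the URLs and the YOLO folder name are my assumptions, so verify them before relying on this):

import os
import urllib.request

# Files the detection script expects; the "YOLO" folder name is just this article's convention.
# URLs assumed from the official Darknet repository/site; verify before use.
files = {
    "coco.names": "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names",
    "yolov3.cfg": "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg",
    "yolov3.weights": "https://pjreddie.com/media/files/yolov3.weights",  # large download (~240 MB)
}
os.makedirs("YOLO", exist_ok=True)
for name, src in files.items():
    dst = os.path.join("YOLO", name)
    if not os.path.exists(dst):
        urllib.request.urlretrieve(src, dst)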

References:
1. http://hk.voidcc.com/question/p-nagarvzd-b.html
2. http://t.ly/pvLyM