ESP32-CAM Object Detection in Python with YOLOv3

Using the ESP32-CAM for intelligent image recognition has long been an interesting topic. There are currently three main ways to do it, described below.

My hands-on ESP32 book: I have published "ESP32 物聯網專題" (ESP32 IoT Projects).
Books.com.tw page: https://www.books.com.tw/products/0010901195


Approaches to intelligent recognition with the ESP32-CAM

  1. On the ESP32-CAM itself: this is constrained by the ESP32's memory and CPU speed, so high-speed image-recognition workloads are hard to achieve.
  2. In the cloud: upload the images captured by the ESP32-CAM to a cloud AI platform for recognition and send the results back. Recognition is more accurate this way, but network transmission becomes an extra concern.
  3. On a nearby local host: build the recognition platform on the local network and collect the images the ESP32-CAM sends, so the images never have to leave the LAN. This is a compromise form of edge computing; the well-known trick by 法蘭斯 of plugging tensorflow.js into the ESP32-CAM's web page for pose recognition also falls into this category, since the computation runs on the device viewing the stream, such as a phone or computer.

Preparation

I have previously shown how to receive and analyze streams from multiple ESP32-CAMs in a .NET environment (see "ESP32-CAM mpeg Video Stream 用.net接收並錄製影片檔"), but there the intelligent-recognition backend was Microsoft's Azure Cognitive Services AI API.

This time, following some articles I found online, I finally managed to connect the ESP32-CAM stream to Python and analyze it with the currently popular YOLOv3 object-detection model.

To follow this walkthrough, please read the articles listed under References at the end of this post first.

Object detection results

Now for the key part: how to receive the ESP32-CAM stream in Python. For an ordinary webcam, a single line of OpenCV, "cap = cv.VideoCapture(0)", is enough, but the ESP32-CAM serves an MJPEG stream, so that method does not work here. The trick is that every JPEG frame begins with the byte marker "FF D8" and ends with "FF D9", so you can scan the stream for those markers, cut out one complete JPEG, and then feed that frame into YOLO for analysis (a minimal sketch of this idea follows). The rest of the post is split into the two topics below.
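Before the full program, here is a minimal sketch of that frame-extraction idea (the helper name extract_jpeg is mine, for illustration only):

def extract_jpeg(buf):
    # Cut one complete JPEG out of a byte buffer using the FF D8 (start of
    # image) and FF D9 (end of image) markers.
    head = buf.find(b'\xff\xd8')
    end = buf.find(b'\xff\xd9')
    if head > -1 and end > -1 and end > head:
        return buf[head:end + 2], buf[end + 2:]  # (frame, leftover bytes)
    return None, buf  # no complete frame buffered yet

Feeding it b'junk\xff\xd8frame\xff\xd9rest' returns the bytes from FF D8 through FF D9 plus the leftover b'rest'; the loop below applies exactly this scan to the network buffer.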

Receiving the ESP32-CAM stream in Python

This part contains only the Python code for receiving the ESP32-CAM stream, with no other processing. Sample program:

import cv2 as cv
import numpy as np
from urllib.request import urlopen

# Change to your ESP32-CAM stream URL
url = "http://192.168.1.149:9601/stream"
CAMERA_BUFFER_SIZE = 4096
stream = urlopen(url)
bts = b''
img = None
i = 0
while True:
    try:
        # Accumulate bytes until a complete JPEG (FF D8 ... FF D9) appears
        bts += stream.read(CAMERA_BUFFER_SIZE)
        jpghead = bts.find(b'\xff\xd8')
        jpgend = bts.find(b'\xff\xd9')
        if jpghead > -1 and jpgend > -1:
            jpg = bts[jpghead:jpgend + 2]
            bts = bts[jpgend + 2:]
            img = cv.imdecode(np.frombuffer(jpg, dtype=np.uint8), cv.IMREAD_UNCHANGED)
            #img = cv.flip(img, 0)  # flipCode 0: vertical, 1: horizontal, -1: both
            #h, w = img.shape[:2]
            #print('Frame height: ' + str(h) + ' width: ' + str(w))
            img = cv.resize(img, (640, 480))
            cv.imshow("a", img)
    except Exception as e:
        print("Error: " + str(e))
        bts = b''
        stream = urlopen(url)  # reconnect and resynchronize on error
        continue

    k = cv.waitKey(1)
    # Press 'a' to save a snapshot
    if k & 0xFF == ord('a') and img is not None:
        cv.imwrite(str(i) + ".jpg", img)
        i = i + 1
    # Press 'q' to quit
    if k & 0xFF == ord('q'):
        break
cv.destroyAllWindows()
Result of running the Python program
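As a side note, some OpenCV builds (those compiled with FFmpeg or GStreamer support) can open an MJPEG URL directly with cv.VideoCapture, so the shorter loop below may also work in your environment; because this is build-dependent, the marker-scanning approach above is the safer choice:

import cv2 as cv

cap = cv.VideoCapture("http://192.168.1.149:9601/stream")  # works only on some builds
while True:
    ok, img = cap.read()  # ok is False when a frame cannot be read
    if not ok:
        break
    cv.imshow("a", img)
    if cv.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv.destroyAllWindows()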

Object detection with YOLO

This part takes the ESP32-CAM stream above and runs it through the well-known YOLO model in Python for object detection. Sample program:

# This code is based on the BigVision LLC OpenCV sample (subject to the license
# terms in the LICENSE file of that distribution and at http://opencv.org/license.html),
# adapted here to read an ESP32-CAM MJPEG stream instead of the original
# --image/--video file inputs.
import cv2 as cv
import argparse
import numpy as np
from urllib.request import urlopen

# Initialize the parameters
confThreshold = 0.5  #Confidence threshold
nmsThreshold = 0.4   #Non-maximum suppression threshold
inpWidth = 320       #Width of network's input image (320 is faster; 608 is more accurate)
inpHeight = 320      #Height of network's input image

parser = argparse.ArgumentParser(description='Object Detection using YOLO in OPENCV')
parser.add_argument('--image', help='Path to image file.')
parser.add_argument('--video', help='Path to video file.')
args = parser.parse_args()

# Load names of classes
classesFile = "YOLO\\coco.names"
classes = None
with open(classesFile, 'rt') as f:
    classes = f.read().rstrip('\n').split('\n')

# Give the configuration and weight files for the model and load the network using them.
modelConfiguration = "YOLO\\yolov3.cfg"
modelWeights = "YOLO\\yolov3.weights"

net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU) #switch to cv.dnn.DNN_TARGET_OPENCL for GPU
# (only Intel GPUs are supported; otherwise OpenCV falls back to the CPU automatically)

# Get the names of the output layers
def getOutputsNames(net):
    # Get the names of all the layers in the network
    layersNames = net.getLayerNames()
    # Get the names of the output layers, i.e. the layers with unconnected outputs.
    # flatten() copes with both the old (Nx1) and new (1-D) index formats.
    return [layersNames[i - 1] for i in net.getUnconnectedOutLayers().flatten()]

# Draw the predicted bounding box
def drawPred(classId, conf, left, top, right, bottom):
    # Draw a bounding box.
    cv.rectangle(frame, (left, top), (right, bottom), (255, 178, 50), 3)

    label = '%.2f' % conf

    # Get the label for the class name and its confidence
    if classes:
        assert(classId < len(classes))
        label = '%s:%s' % (classes[classId], label)

    #Display the label at the top of the bounding box
    labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
    top = max(top, labelSize[1])
    cv.rectangle(frame, (left, top - round(1.5*labelSize[1])), (left + round(1.5*labelSize[0]), top + baseLine), (255, 255, 255), cv.FILLED)
    cv.putText(frame, label, (left, top), cv.FONT_HERSHEY_SIMPLEX, 0.75, (0,0,0), 1)

# Remove the bounding boxes with low confidence using non-maxima suppression
def postprocess(frame, outs):
    frameHeight = frame.shape[0]
    frameWidth = frame.shape[1]

    # Scan through all the bounding boxes output from the network and keep only
    # the ones with high confidence scores. Assign the box's class label as the
    # class with the highest score.
    classIds = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            classId = np.argmax(scores)
            confidence = scores[classId]
            if confidence > confThreshold:
                center_x = int(detection[0] * frameWidth)
                center_y = int(detection[1] * frameHeight)
                width = int(detection[2] * frameWidth)
                height = int(detection[3] * frameHeight)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                classIds.append(classId)
                confidences.append(float(confidence))
                boxes.append([left, top, width, height])

    # Perform non maximum suppression to eliminate redundant overlapping boxes with
    # lower confidences.
    indices = cv.dnn.NMSBoxes(boxes, confidences, confThreshold, nmsThreshold)
    # flatten() copes with both old and new NMSBoxes return formats
    for i in np.array(indices).flatten():
        box = boxes[i]
        left = box[0]
        top = box[1]
        width = box[2]
        height = box[3]
        drawPred(classIds[i], confidences[i], left, top, left + width, top + height)

# Process inputs
winName = 'Deep learning object detection in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)

outputFile = "yolo_out_py.avi"
# ESP32-CAM MJPEG stream input
url="http://192.168.1.149:9601/stream"
CAMERA_BUFFER_SIZE=4096
stream=urlopen(url)
bts=b''

# A video writer for outputFile could be initialized here to save the results;
# it is left disabled because the MJPEG stream is read manually, so there is no
# cv.VideoCapture handle to query the frame size from.

while cv.waitKey(1) < 0:  # loop until any key is pressed
    bts+=stream.read(CAMERA_BUFFER_SIZE)
    jpghead=bts.find(b'\xff\xd8')
    jpgend=bts.find(b'\xff\xd9')
    if jpghead>-1 and jpgend>-1:
        jpg=bts[jpghead:jpgend+2]
        bts=bts[jpgend+2:]
        img=cv.imdecode(np.frombuffer(jpg,dtype=np.uint8),cv.IMREAD_UNCHANGED)
        # Flip both axes (rotate 180 degrees) to correct the camera mounting orientation
        frame=cv.flip(img,-1)
        frame=cv.resize(frame,(1024,768))
        # Create a 4D blob: scale pixel values by 1/255, resize to the network
        # input size, no mean subtraction, swapRB=1 to convert BGR to RGB
        blob = cv.dnn.blobFromImage(frame, 1/255, (inpWidth, inpHeight), [0,0,0], 1, crop=False)
        net.setInput(blob)
        # Runs the forward pass to get output of the output layers
        outs = net.forward(getOutputsNames(net))
        # Remove the bounding boxes with low confidence
        postprocess(frame, outs)
        # Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
        t, _ = net.getPerfProfile()
        label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
        cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
        cv.imshow(winName, frame)
The finished object-detection result
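If you do not yet have the model files, the class list and network definition live in the official Darknet repository and the pre-trained weights on the Darknet site. The snippet below is a convenience sketch for fetching them into the YOLO folder the script expects; the URLs were correct at the time of writing but may move:

import os
import urllib.request

# Fetch the YOLOv3 model files into the YOLO folder used by the script above.
FILES = {
    "coco.names": "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names",
    "yolov3.cfg": "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg",
    "yolov3.weights": "https://pjreddie.com/media/files/yolov3.weights",  # large file (~237 MB)
}
os.makedirs("YOLO", exist_ok=True)
for name, src in FILES.items():
    dest = os.path.join("YOLO", name)
    if not os.path.exists(dest):
        print("Downloading " + name + " ...")
        urllib.request.urlretrieve(src, dest)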

References:
1.http://hk.voidcc.com/question/p-nagarvzd-b.html
2.http://t.ly/pvLyM
