Computer Vision Code Notes

This is a personal notebook of computer vision code snippets and experiments from 2022. Topics include object detection, tracking, segmentation, and more.

Object Tracking with DeepSORT

Combines YOLOv5 for object detection with DeepSORT for multi-object tracking, which associates detections across frames using Kalman-filter motion prediction plus appearance (re-identification) embeddings.
import cv2
import torch

class YoloDetector:
    def __init__(self, model_name):
        # Pick the device first so the model can be moved to it once here,
        # rather than on every frame.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Using Device:", self.device)
        self.model = self.load_model(model_name).to(self.device)
        self.classes = self.model.names

    def load_model(self, model_name):
        if model_name:
            model = torch.hub.load("ultralytics/yolov5", "custom", path=model_name, force_reload=True)
        else:
            model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
        return model

    def score_frame(self, frame):
        # Downscale before inference to speed it up; coordinates come back
        # normalized (xyxyn), so they rescale cleanly to the original frame.
        downscale_factor = 2
        width = int(frame.shape[1] / downscale_factor)
        height = int(frame.shape[0] / downscale_factor)
        frame = cv2.resize(frame, (width, height))
        results = self.model(frame)
        labels, cord = results.xyxyn[0][:, -1], results.xyxyn[0][:, :-1]
        return labels, cord

    def class_to_label(self, x):
        return self.classes[int(x)]

    def plot_boxes(self, results, frame, height, width, confidence=0.3):
        # Despite the name, this converts YOLO output into DeepSORT's input
        # format: ([left, top, width, height], confidence, class_name).
        labels, cord = results
        detections = []

        n = len(labels)
        x_shape, y_shape = width, height

        for i in range(n):
            row = cord[i]

            if row[4] >= confidence:
                # Rescale normalized coordinates to the original frame size.
                x1, y1, x2, y2 = int(row[0] * x_shape), int(row[1] * y_shape), int(row[2] * x_shape), int(row[3] * y_shape)

                if self.class_to_label(labels[i]) == "person":
                    detections.append(([x1, y1, int(x2-x1), int(y2-y1)], row[4].item(), "person"))

        return frame, detections

cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
detector = YoloDetector(model_name=None)

from deep_sort_realtime.deepsort_tracker import DeepSort

object_tracker = DeepSort(max_age=5,                # drop a track after 5 missed frames
                max_cosine_distance=0.3,            # appearance-matching threshold
                n_init=2, nms_max_overlap=1.0, nn_budget=None,
                embedder="mobilenet",               # appearance embedding network
                half=True, bgr=True,
                embedder_gpu=True)

while cap.isOpened():
    success, img = cap.read()
    if not success:
        break
    results = detector.score_frame(img)
    img, detections = detector.plot_boxes(results, img, img.shape[0], img.shape[1], confidence=0.5)
    tracks = object_tracker.update_tracks(detections, frame=img)

    for track in tracks:
        if not track.is_confirmed():
            continue
        track_id = track.track_id
        bbox = track.to_ltrb()  # left, top, right, bottom
        cv2.rectangle(img, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 2)
        cv2.putText(img, "ID: " + str(track_id), (int(bbox[0]), int(bbox[1] - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    cv2.imshow('img', img)
    if cv2.waitKey(1) == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()
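
To save the annotated stream to disk instead of only displaying it, each frame can be written with cv2.VideoWriter. A minimal sketch; the 1280x720 size, 30 FPS, and mp4v codec are assumptions, not part of the original setup:

fourcc = cv2.VideoWriter_fourcc(*"mp4v")            # assumed codec
writer = cv2.VideoWriter("tracked.mp4", fourcc, 30.0, (1280, 720))  # must match frame size

# ...inside the while loop, after drawing the boxes:
#     writer.write(img)

writer.release()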

Vehicle Detection and Counting

Counts vehicles as their bounding boxes reach a horizontal line in the video, using YOLOv5. The count is per-detection with no tracking, so a slow-moving vehicle can be counted on more than one frame; the sketch after the code shows a tracking-based fix.
import cv2
import numpy as np
import torch

cap = cv2.VideoCapture("Cars.mp4")
model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
cnt = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break
    results = model(frame)
    height = frame.shape[0]
    line_y = int(0.6 * height)
    for result in results.xyxy[0]:
        # Count a detection when its bottom edge falls in a narrow band
        # around the line. Without tracking, the same vehicle can still be
        # counted on more than one frame.
        if line_y - 2 <= int(result[3]) <= line_y + 2:
            cv2.rectangle(frame, (int(result[0]), int(result[1])), (int(result[2]), int(result[3])), (0, 255, 0), 2)
            cnt += 1
    cv2.line(frame, (0, line_y), (frame.shape[1], line_y), (0, 255, 0), 2)
    cv2.putText(frame, "count:"+str(cnt), (30, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 1, cv2.LINE_AA)
    cv2.imshow("video", frame)
    if cv2.waitKey(1) == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()
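
A more reliable count tracks each vehicle across frames and increments only when its centroid crosses the line. A minimal sketch reusing the deep_sort_realtime tracker from the first section (the max_age value and downward-crossing rule are assumptions):

import cv2
import torch
from deep_sort_realtime.deepsort_tracker import DeepSort

cap = cv2.VideoCapture("Cars.mp4")
model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
tracker = DeepSort(max_age=10)
counted_ids = set()   # track ids that have already crossed the line
last_y = {}           # track id -> centroid y on the previous frame

while True:
    ret, frame = cap.read()
    if not ret:
        break
    line_y = int(0.6 * frame.shape[0])
    detections = []
    for x1, y1, x2, y2, conf, cls in model(frame).xyxy[0].tolist():
        detections.append(([x1, y1, x2 - x1, y2 - y1], conf, model.names[int(cls)]))
    for track in tracker.update_tracks(detections, frame=frame):
        if not track.is_confirmed():
            continue
        l, t, r, b = track.to_ltrb()
        cy = (t + b) / 2
        tid = track.track_id
        # Count each track once, when its centroid moves from above the
        # line to on or below it.
        if tid in last_y and last_y[tid] < line_y <= cy and tid not in counted_ids:
            counted_ids.add(tid)
        last_y[tid] = cy
    cv2.line(frame, (0, line_y), (frame.shape[1], line_y), (0, 255, 0), 2)
    cv2.putText(frame, "count:" + str(len(counted_ids)), (30, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 1)
    cv2.imshow("video", frame)
    if cv2.waitKey(1) == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()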

Semantic Segmentation with DeepLabV3

Trains a DeepLabV3 segmentation model on the Cityscapes image-pairs dataset, where each file stores the input image and its color-coded mask side by side. The images and masks are split apart, transformed, and fed to a model whose classifier head is replaced for 3 output classes.
import os
import cv2
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision.models.segmentation import deeplabv3_resnet50
from torchvision.models.segmentation.deeplabv3 import DeepLabHead

pretrained_model = deeplabv3_resnet50(pretrained=True)

model = pretrained_model

# Replace the classification head so the model predicts 3 classes.
# DeepLabHead keeps the ASPP module; a bare 1x1 Conv2d would discard it.
model.classifier = DeepLabHead(2048, 3)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def split_input_mask(path):
    # Each dataset file is a 512x256 image: the left half is the input
    # photo and the right half is the color-coded segmentation mask.
    inputs = []
    masks = []
    for file in os.listdir(path):
        img = cv2.imread(os.path.join(path, file))
        inputs.append(img[:, :256])   # left half: input image
        masks.append(img[:, 256:])    # right half: mask

    return inputs, masks

path = "/kaggle/input/cityscapes-image-pairs/cityscapes_data/train"
inputs, masks = split_input_mask(path)
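
# Quick sanity check (optional): display one image/mask pair side by side
# to confirm the split is correct. cv2 loads BGR, so convert for matplotlib.
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.imshow(cv2.cvtColor(inputs[0], cv2.COLOR_BGR2RGB))
ax2.imshow(cv2.cvtColor(masks[0], cv2.COLOR_BGR2RGB))
plt.show()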


class ImgDataset(torch.utils.data.Dataset):
    def __init__(self, x, y=None, transform=None, target_transform=None):
        self.x = x
        self.y = y

        self.transform = transform
        # Masks need their own transform: they must not be normalized,
        # since their colors encode class labels.
        self.target_transform = target_transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        X = self.x[idx]
        if self.transform is not None:
            X = self.transform(X)

        if self.y is not None:
            Y = self.y[idx]
            if self.target_transform is not None:
                Y = self.target_transform(Y)
            return X, Y

        return X
        

import torchvision.transforms as transforms

# Random augmentations are left commented out: applied independently to
# the image and the mask (as __getitem__ does), random flips and
# rotations would desynchronize the pair.
#train_transform = transforms.Compose([
#    transforms.ToPILImage(),
#    transforms.RandomHorizontalFlip(),
#    transforms.RandomRotation(15),
#    transforms.Resize((256, 256)),
#    transforms.ToTensor(),
#])

train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Masks keep their raw colors: nearest-neighbor resize, no normalization.
mask_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.NEAREST),
])

batch_size = 128
train_set = ImgDataset(inputs, masks, train_transform, mask_transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
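
# Optional sanity check: one batch should be images of shape (B, 3, 224, 224)
# and RGB masks of the same shape; the masks are reduced to class indices
# inside the training loop below.
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)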

num_epochs = 10
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    # Loop variable renamed so it does not shadow the `inputs` list above.
    for inputs_batch, labels in train_loader:
        # Crude reduction of the RGB mask to 3 classes: the dominant color
        # channel becomes the class index. A real pipeline would map each
        # mask color to its Cityscapes class id.
        labels = labels.argmax(dim=1)

        outputs = model(inputs_batch)
        loss = criterion(outputs['out'], labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, running_loss))



# Smoke test: run the trained model on one of the training inputs.
model.eval()

eval_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

input_tensor = eval_transform(inputs[2])

with torch.no_grad():
    output = model(input_tensor.unsqueeze(0))['out'][0]

output_predictions = output.argmax(0)
plt.imshow(output_predictions)
plt.show()
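
To read the class map more easily, it can be colorized and blended over the input image. A small sketch; the 3-color palette is an arbitrary assumption:

import numpy as np

palette = np.array([[0, 0, 0], [0, 255, 0], [255, 0, 0]], dtype=np.uint8)  # assumed class colors
color_mask = palette[output_predictions.numpy()]  # (224, 224, 3)

rgb = cv2.cvtColor(cv2.resize(inputs[2], (224, 224)), cv2.COLOR_BGR2RGB)
overlay = cv2.addWeighted(rgb, 0.6, color_mask, 0.4, 0)
plt.imshow(overlay)
plt.show()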

Faster R-CNN Object Detection

Detects objects using a pretrained Faster R-CNN model from TorchVision with a ResNet-50-FPN backbone.
import torch
import torchvision
from torchvision.transforms import functional as F
from PIL import Image

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

image_path = "C:/Users/user/Desktop/data/06.jpg"
image = Image.open(image_path).convert("RGB")  # ensure 3 channels

image_tensor = F.to_tensor(image).unsqueeze(0)

with torch.no_grad():
    prediction = model(image_tensor)

boxes = prediction[0]['boxes']
scores = prediction[0]['scores']
labels = prediction[0]['labels']

for box, score, label in zip(boxes, scores, labels):
    if score < 0.5:  # skip low-confidence detections
        continue
    # label is a COCO category id (1 = person, 3 = car, ...)
    print(f"Label: {label}, Score: {score:.2f}, Box: {box}")
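
To visualize the result, the boxes can be drawn on the PIL image directly; a short sketch reusing the same 0.5 score threshold (an assumption, as above):

from PIL import ImageDraw

draw = ImageDraw.Draw(image)
for box, score, label in zip(boxes, scores, labels):
    if score < 0.5:
        continue
    x1, y1, x2, y2 = box.tolist()
    draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
    draw.text((x1, y1 - 10), f"{int(label)}: {score:.2f}", fill="red")
image.show()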

YOLOv4-tiny Object Detection with OpenCV

Performs real-time object detection with OpenCV using YOLOv4-tiny.
import cv2

net = cv2.dnn.readNet("C:/Users/user/Desktop/dnn_model/yolov4-tiny.weights", "C:/Users/user/Desktop/dnn_model/yolov4-tiny.cfg")
model = cv2.dnn_DetectionModel(net)
# Darknet YOLO expects RGB input, so swap OpenCV's BGR channel order.
model.setInputParams(size=(320, 320), scale=1/255, swapRB=True)

classes = []
with open("C:/Users/user/Desktop/dnn_model/classes.txt", "r") as file_object:
    for class_name in file_object.readlines():
        class_name = class_name.strip()
        classes.append(class_name)
print("Objects list")
print(classes)

cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # detect() returns parallel arrays of class ids, confidences, and
    # (x, y, w, h) boxes, already filtered by the thresholds below.
    (class_ids, scores, bboxes) = model.detect(frame, confThreshold=0.5, nmsThreshold=0.4)
    for class_id, score, bbox in zip(class_ids, scores, bboxes):
        (x, y, w, h) = bbox
        class_name = classes[class_id]
        cv2.putText(frame, str(class_name), (x, y - 5), cv2.FONT_HERSHEY_PLAIN, 1, (200, 0, 50), 2)
        cv2.rectangle(frame, (x, y), (x + w, y + h), (200, 0, 50), 3)

    cv2.imshow("Frame", frame)
    if cv2.waitKey(1) == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()
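
To gauge throughput, a simple frames-per-second counter can wrap the detection call; a minimal sketch reusing the cap and model defined above:

import time

frame_count = 0
start = time.time()
while True:
    ret, frame = cap.read()
    if not ret:
        break
    model.detect(frame)
    frame_count += 1
    if frame_count % 30 == 0:  # report every 30 frames
        print(f"{frame_count / (time.time() - start):.1f} FPS")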