This is a personal notebook of computer vision code snippets and experiments from 2022. Topics include object detection, tracking, segmentation, and more.
# Snippet 1: person detection with YOLOv5 plus DeepSORT tracking on a webcam feed.
import cv2
import torch
from deep_sort_realtime.deepsort_tracker import DeepSort
class YoloDetector:
    """Thin wrapper around a YOLOv5 model loaded from torch.hub."""

    def __init__(self, model_name):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print("Using Device:", self.device)
        self.model = self.load_model(model_name)
        self.classes = self.model.names

    def load_model(self, model_name):
        # Load custom weights if a path is given, otherwise the pretrained yolov5s.
        if model_name:
            model = torch.hub.load("ultralytics/yolov5", "custom", path=model_name, force_reload=True)
        else:
            model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
        return model

    def score_frame(self, frame):
        self.model.to(self.device)
        # Downscale for speed; the coordinates stay usable because xyxyn is normalized.
        downscale_factor = 2
        width = int(frame.shape[1] / downscale_factor)
        height = int(frame.shape[0] / downscale_factor)
        frame = cv2.resize(frame, (width, height))
        results = self.model(frame)
        labels, cord = results.xyxyn[0][:, -1], results.xyxyn[0][:, :-1]
        return labels, cord

    def class_to_label(self, x):
        return self.classes[int(x)]

    def plot_boxes(self, results, frame, height, width, confidence=0.3):
        # Despite the name, this only collects "person" detections in the
        # ([left, top, w, h], confidence, class) format DeepSORT expects;
        # the drawing happens on the tracker output in the main loop.
        labels, cord = results
        detections = []
        x_shape, y_shape = width, height
        for i in range(len(labels)):
            row = cord[i]
            if row[4] >= confidence:
                x1, y1 = int(row[0] * x_shape), int(row[1] * y_shape)
                x2, y2 = int(row[2] * x_shape), int(row[3] * y_shape)
                if self.class_to_label(labels[i]) == "person":
                    detections.append(([x1, y1, x2 - x1, y2 - y1], row[4].item(), "person"))
        return frame, detections
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

detector = YoloDetector(model_name=None)
object_tracker = DeepSort(max_age=5, n_init=2, nms_max_overlap=1.0,
                          max_cosine_distance=0.3, nn_budget=None,
                          embedder="mobilenet", half=True, bgr=True,
                          embedder_gpu=True)
while cap.isOpened():
    success, img = cap.read()
    if not success:
        break
    results = detector.score_frame(img)
    img, detections = detector.plot_boxes(results, img, img.shape[0], img.shape[1], confidence=0.5)
    tracks = object_tracker.update_tracks(detections, frame=img)
    for track in tracks:
        if not track.is_confirmed():
            continue
        track_id = track.track_id
        bbox = track.to_ltrb()
        cv2.rectangle(img, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 2)
        cv2.putText(img, "ID: " + str(track_id), (int(bbox[0]), int(bbox[1] - 10)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    cv2.imshow("img", img)
    if cv2.waitKey(1) == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()
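
To keep the annotated footage instead of just displaying it, a cv2.VideoWriter can be wrapped around the loop. A minimal sketch; the output filename, codec, and frame rate are my choices, not from the original notebook, and the frame size must match what the webcam actually delivers:

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter("tracked.mp4", fourcc, 30.0, (1280, 720))
# inside the while loop, after the boxes are drawn:
#     writer.write(img)
# after the loop, alongside cap.release():
#     writer.release()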

# Snippet 2: counting cars that cross a line in a video with YOLOv5.
import cv2
import torch

cap = cv2.VideoCapture("Cars.mp4")
model = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=True)
cnt = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break
    results = model(frame)
    height = frame.shape[0]
    # Count boxes whose bottom edge (y2) has reached the counting line.
    # Note: this increments on every frame a box sits past the line, so the
    # same car is counted many times; see the crossing sketch after this snippet.
    for result in results.xyxy[0]:
        if int(result[3]) >= int(0.6 * height - 2):
            cv2.rectangle(frame, (int(result[0]), int(result[1])),
                          (int(result[2]), int(result[3])), (0, 255, 0), 2)
            cnt += 1
    cv2.line(frame, (0, int(0.6 * height)), (frame.shape[1], int(0.6 * height)), (0, 255, 0), 2)
    cv2.putText(frame, "count:" + str(cnt), (30, 20), cv2.FONT_HERSHEY_SIMPLEX,
                0.8, (255, 0, 0), 1, cv2.LINE_AA)
    cv2.imshow("video", frame)
    if cv2.waitKey(1) == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()
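
Since the counter above increments on every frame a box sits past the line, each car is counted many times. A rough once-per-crossing sketch, matching each detection to the nearest centroid from the previous frame as a crude stand-in for a real tracker like the DeepSORT setup above; the matching distance and downward-motion assumption are mine:

prev_centroids = []          # centroids seen in the previous frame
count = 0

def update_count(boxes, line_y, max_match_dist=50):
    """boxes: iterable of (x1, y1, x2, y2). Counts centroids that moved from
    above line_y to below it since the previous frame."""
    global prev_centroids, count
    centroids = [((x1 + x2) / 2, (y1 + y2) / 2) for x1, y1, x2, y2 in boxes]
    for cx, cy in centroids:
        if cy < line_y:
            continue                      # still above the line
        for px, py in prev_centroids:
            close = abs(cx - px) < max_match_dist and abs(cy - py) < max_match_dist
            if close and py < line_y:     # was above last frame, below now
                count += 1
                break
    prev_centroids = centroids
    return count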

# Snippet 3: semantic segmentation on the Cityscapes image-pairs dataset with DeepLabV3.
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models.segmentation import deeplabv3_resnet50
# Start from a pretrained DeepLabV3 and swap the classifier head for 3 output
# channels (the masks in this dataset are stored as RGB images).
model = deeplabv3_resnet50(pretrained=True)
model.classifier = nn.Conv2d(2048, 3, kernel_size=1)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
def split_input_mask(path):
    # Each file in the dataset is a 256x512 image: the photo on the left half,
    # the label mask on the right half.
    inputs = []
    masks = []
    for file in os.listdir(path):
        img = cv2.imread(os.path.join(path, file))
        inputs.append(img[:, :256])
        masks.append(img[:, 256:])
    return inputs, masks

path = "/kaggle/input/cityscapes-image-pairs/cityscapes_data/train"
inputs, masks = split_input_mask(path)
class ImgDataset(torch.utils.data.Dataset):
    # A separate target_transform keeps Normalize off the masks, which would
    # otherwise corrupt the training targets.
    def __init__(self, x, y=None, transform=None, target_transform=None):
        self.x = x
        self.y = y
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        X = self.x[idx]
        if self.transform is not None:
            X = self.transform(X)
        if self.y is not None:
            Y = self.y[idx]
            if self.target_transform is not None:
                Y = self.target_transform(Y)
            return X, Y
        return X
#train_transform = transforms.Compose([
#    transforms.ToPILImage(),
#    transforms.RandomHorizontalFlip(),
#    transforms.RandomRotation(15),
#    transforms.Resize((256, 256)),
#    transforms.ToTensor(),
#])
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Masks stay as unnormalized RGB float tensors (as in the original approach);
# nearest-neighbour resizing keeps label colors crisp. A proper class-index
# conversion is sketched after this snippet.
mask_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.NEAREST),
])

batch_size = 128
train_set = ImgDataset(inputs, masks, train_transform, mask_transform)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    # Loop variables renamed so they do not shadow the `inputs` list,
    # which is reused for inference below.
    for batch_x, batch_y in train_loader:
        outputs = model(batch_x)
        loss = criterion(outputs['out'], batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, running_loss))
# Quick qualitative check on one training image: transform it the same way as
# the training inputs, run the model, and show the per-pixel argmax.
model.eval()
input_tensor = train_transform(inputs[2])
with torch.no_grad():
    output = model(input_tensor.unsqueeze(0))['out'][0]
output_predictions = output.argmax(0)
plt.imshow(output_predictions)
plt.show()
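
For a proper segmentation setup, the RGB masks should be converted to per-pixel class indices so nn.CrossEntropyLoss receives integer targets instead of RGB float tensors. A minimal sketch of that conversion, assuming a fixed color palette; the three colors below are illustrative placeholders, not the full Cityscapes palette:

import numpy as np
import torch

# Hypothetical palette mapping mask colors to class indices; the real
# Cityscapes label set has around 30 classes.
PALETTE = {
    (128, 64, 128): 0,   # road (illustrative)
    (70, 70, 70): 1,     # building (illustrative)
    (0, 0, 142): 2,      # car (illustrative)
}

def rgb_mask_to_index(mask_rgb):
    """mask_rgb: HxWx3 uint8 array -> HxW LongTensor of class indices."""
    h, w, _ = mask_rgb.shape
    index_mask = np.zeros((h, w), dtype=np.int64)
    for color, idx in PALETTE.items():
        matches = np.all(mask_rgb == np.array(color, dtype=np.uint8), axis=-1)
        index_mask[matches] = idx
    return torch.from_numpy(index_mask)

With targets like this, the classifier head would need len(PALETTE) output channels, and criterion(outputs['out'], index_masks) becomes the standard per-pixel cross-entropy.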

# Snippet 4: object detection on a single image with Faster R-CNN.
import torch
import torchvision
from torchvision.transforms import functional as F
from PIL import Image

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

image_path = "C:/Users/user/Desktop/data/06.jpg"
image = Image.open(image_path).convert("RGB")
image_tensor = F.to_tensor(image).unsqueeze(0)

with torch.no_grad():
    prediction = model(image_tensor)

boxes = prediction[0]['boxes']
scores = prediction[0]['scores']
labels = prediction[0]['labels']
for box, score, label in zip(boxes, scores, labels):
    print(f"Label: {label}, Score: {score:.2f}, Box: {box}")

# Snippet 5: real-time detection with YOLOv4-tiny via OpenCV's DNN module.
import cv2

net = cv2.dnn.readNet("C:/Users/user/Desktop/dnn_model/yolov4-tiny.weights",
                      "C:/Users/user/Desktop/dnn_model/yolov4-tiny.cfg")
model = cv2.dnn_DetectionModel(net)
model.setInputParams(size=(320, 320), scale=1/255)

classes = []
with open("C:/Users/user/Desktop/dnn_model/classes.txt", "r") as file_object:
    for class_name in file_object.readlines():
        classes.append(class_name.strip())
print("Objects list")
print(classes)
cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

while True:
    ret, frame = cap.read()
    if not ret:
        break
    (class_ids, scores, bboxes) = model.detect(frame)
    for class_id, score, bbox in zip(class_ids, scores, bboxes):
        (x, y, w, h) = bbox
        class_name = classes[int(class_id)]
        cv2.putText(frame, str(class_name), (x, y - 5), cv2.FONT_HERSHEY_PLAIN, 1, (200, 0, 50), 2)
        cv2.rectangle(frame, (x, y), (x + w, y + h), (200, 0, 50), 3)
    print("class ids", class_ids)
    print("scores", scores)
    print("bboxes", bboxes)
    cv2.imshow("Frame", frame)
    if cv2.waitKey(1) == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()
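
detect() also accepts confidence and NMS thresholds, which cuts down on overlapping and spurious boxes; a one-line variant, with threshold values that are illustrative rather than from the original notebook:

class_ids, scores, bboxes = model.detect(frame, confThreshold=0.5, nmsThreshold=0.4)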