# Detection logic using OpenVINO models (YOLO, etc.)
import os
import sys
import time
import cv2
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from red_light_violation_pipeline import RedLightViolationPipeline
# --- Install required packages if missing ---
try:
    import openvino as ov
except ImportError:
    print("Installing openvino...")
    os.system('pip install --quiet "openvino>=2024.0.0"')
    import openvino as ov

try:
    from ultralytics import YOLO
except ImportError:
    print("Installing ultralytics...")
    os.system('pip install --quiet "ultralytics==8.3.0"')
    from ultralytics import YOLO

try:
    import nncf
except ImportError:
    print("Installing nncf...")
    os.system('pip install --quiet "nncf>=2.9.0"')
    import nncf
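
# Note (assumption, not part of the original flow): shelling out to pip via
# os.system is fragile when several Python installs coexist; a more robust
# variant pins the install to the running interpreter:
#   import subprocess, sys
#   subprocess.check_call([sys.executable, "-m", "pip", "install",
#                          "--quiet", "openvino>=2024.0.0"])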
# --- COCO dataset class names ---
COCO_CLASSES = {
    0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
    6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
    11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
    16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
    22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
    27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
    32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
    36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
    40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
    46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
    51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
    57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
    62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
    67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
    72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
    77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
}
# Keep the full COCO label map by default (standard COCO indices); a
# traffic-only subset for the vehicle detector is defined further below
TRAFFIC_CLASS_NAMES = COCO_CLASSES
# --- Model Conversion and Quantization ---
def convert_yolo_to_openvino(model_name: str = "yolo11x", half: bool = True) -> Path:
    """Convert a YOLO11 PyTorch checkpoint to OpenVINO IR format."""
    pt_path = Path(f"{model_name}.pt")
    ov_dir = Path(f"{model_name}_openvino_model")
    ov_xml = ov_dir / f"{model_name}.xml"
    if not ov_xml.exists():
        print(f"Exporting {pt_path} to OpenVINO IR...")
        model = YOLO(str(pt_path))
        model.export(format="openvino", dynamic=True, half=half)
    else:
        print(f"OpenVINO IR already exists: {ov_xml}")
    return ov_xml


def quantize_openvino_model(ov_xml: Path, model_name: str = "yolo11x") -> Path:
    """Return an INT8 IR if one already exists; otherwise fall back to the input IR."""
    int8_dir = Path(f"{model_name}_openvino_int8_model")
    int8_xml = int8_dir / f"{model_name}.xml"
    if int8_xml.exists():
        print(f"INT8 model already exists: {int8_xml}")
        return int8_xml
    print("Quantization requires a calibration dataset. Skipping actual quantization in this demo.")
    return ov_xml  # Fall back to the FP16/FP32 IR if no INT8 model is available
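

# A minimal NNCF post-training quantization sketch for reference. Assumptions:
# the helper name and the "calib_images/*.jpg" calibration glob are
# illustrative, not part of the pipeline above; any representative set of
# intersection frames would do.
def quantize_with_nncf(ov_xml: Path, calib_glob: str = "calib_images/*.jpg",
                       model_name: str = "yolo11x") -> Path:
    import glob
    core = ov.Core()
    model = core.read_model(ov_xml)

    def transform_fn(image_path: str) -> np.ndarray:
        # Mirror the detector's preprocessing: resize, BGR->RGB, scale, NCHW
        img = cv2.imread(image_path)
        img = cv2.cvtColor(cv2.resize(img, (640, 640)), cv2.COLOR_BGR2RGB)
        return (img.astype(np.float32) / 255.0).transpose(2, 0, 1)[None]

    calibration_dataset = nncf.Dataset(glob.glob(calib_glob), transform_fn)
    quantized = nncf.quantize(model, calibration_dataset)
    int8_dir = Path(f"{model_name}_openvino_int8_model")
    int8_dir.mkdir(exist_ok=True)
    int8_xml = int8_dir / f"{model_name}.xml"
    ov.save_model(quantized, str(int8_xml))
    return int8_xml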
# --- OpenVINO Inference Pipeline ---
class OpenVINOYOLODetector:
    def __init__(self, model_xml: Path, device: str = "AUTO"):
        self.core = ov.Core()
        self.device = device
        self.model = self.core.read_model(model_xml)
        # The exported IR may have dynamic axes; reshape to a static input
        # shape before reading the dimensions.
        self.model.reshape({0: [1, 3, 640, 640]})
        self.input_shape = self.model.inputs[0].shape
        self.input_height = self.input_shape[2]
        self.input_width = self.input_shape[3]
        self.ov_config = {}
        if "GPU" in device or ("AUTO" in device and "GPU" in self.core.available_devices):
            self.ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
        self.compiled_model = self.core.compile_model(model=self.model, device_name=self.device, config=self.ov_config)
        self.output_layer = self.compiled_model.output(0)

    def preprocess(self, frame: np.ndarray) -> np.ndarray:
        img = cv2.resize(frame, (self.input_width, self.input_height))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        img = img.transpose(2, 0, 1)[None]
        return img

    def infer(self, frame: np.ndarray, conf_threshold: float = 0.25) -> List[Dict]:
        input_tensor = self.preprocess(frame)
        output = self.compiled_model([input_tensor])[self.output_layer]
        return self.postprocess(output, frame.shape, conf_threshold)

    def postprocess(self, output: np.ndarray, frame_shape, conf_threshold: float) -> List[Dict]:
        # Output: (1, 84, 8400), (84, 8400), or (8400, 84)
        if output.ndim == 3:
            output = np.squeeze(output)
        if output.shape[0] == 84:
            output = output.T  # (8400, 84)
        boxes = output[:, :4]
        scores = output[:, 4:]
        class_ids = np.argmax(scores, axis=1)
        confidences = np.max(scores, axis=1)
        detections = []
        h, w = frame_shape[:2]
        for box, score, class_id in zip(boxes, confidences, class_ids):
            if score < conf_threshold:
                continue
            x_c, y_c, bw, bh = box
            # Heuristic: if every coordinate falls in [0, 1], treat the box as
            # normalized and scale it up to the network input size
            if all(0.0 <= v <= 1.0 for v in box):
                x_c *= self.input_width
                y_c *= self.input_height
                bw *= self.input_width
                bh *= self.input_height
            # Scale from network input size to the original frame size
            scale_x = w / self.input_width
            scale_y = h / self.input_height
            x_c *= scale_x
            y_c *= scale_y
            bw *= scale_x
            bh *= scale_y
            x1 = int(round(x_c - bw / 2))
            y1 = int(round(y_c - bh / 2))
            x2 = int(round(x_c + bw / 2))
            y2 = int(round(y_c + bh / 2))
            x1 = max(0, min(x1, w - 1))
            y1 = max(0, min(y1, h - 1))
            x2 = max(0, min(x2, w - 1))
            y2 = max(0, min(y2, h - 1))
            if x2 <= x1 or y2 <= y1:
                continue
            # Look the class up in the label map; skip anything we don't track
            class_name = TRAFFIC_CLASS_NAMES.get(int(class_id))
            if class_name is None:
                continue
            detections.append({
                'bbox': [x1, y1, x2, y2],
                'confidence': float(score),
                'class_id': int(class_id),
                'class_name': class_name
            })
        return detections
    def draw(self, frame: np.ndarray, detections: List[Dict], box_thickness: int = 2) -> np.ndarray:
        # Color palette for the 80 COCO classes (BGR); lookup wraps via modulo
        COCO_COLORS = [
            (255, 56, 56), (255, 157, 151), (255, 112, 31), (255, 178, 29), (207, 210, 49),
            (72, 249, 10), (146, 204, 23), (61, 219, 134), (26, 147, 52), (0, 212, 187),
            (44, 153, 168), (0, 194, 255), (52, 69, 147), (100, 115, 255), (0, 24, 236),
            (132, 56, 255), (82, 0, 133), (203, 56, 255), (255, 149, 200), (255, 55, 199),
            (255, 255, 56), (255, 255, 151), (255, 255, 31), (255, 255, 29), (207, 255, 49),
            (72, 255, 10), (146, 255, 23), (61, 255, 134), (26, 255, 52), (0, 255, 187),
            (44, 255, 168), (0, 255, 255), (52, 255, 147), (100, 255, 255), (0, 255, 236),
            (132, 255, 255), (82, 255, 133), (203, 255, 255), (255, 255, 200), (255, 255, 199),
            (56, 255, 255), (157, 255, 151), (112, 255, 31), (178, 255, 29), (210, 255, 49),
            (249, 255, 10), (204, 255, 23), (219, 255, 134), (147, 255, 52), (212, 255, 187),
            (153, 255, 168), (194, 255, 255), (69, 255, 147), (115, 255, 255), (24, 255, 236),
            (56, 132, 255), (157, 82, 151), (112, 203, 31), (178, 255, 29), (210, 255, 49),
            (249, 72, 10), (204, 146, 23), (219, 61, 134), (147, 26, 52), (212, 0, 187),
            (153, 44, 168), (194, 0, 255), (69, 52, 147), (115, 100, 255), (24, 0, 236),
            (56, 132, 255), (157, 82, 151), (112, 203, 31), (178, 255, 29), (210, 255, 49),
            (249, 72, 10), (204, 146, 23), (219, 61, 134), (147, 26, 52), (212, 0, 187),
            (153, 44, 168), (194, 0, 255), (69, 52, 147), (115, 100, 255), (24, 0, 236),
            (56, 132, 255), (157, 82, 151), (112, 203, 31), (178, 255, 29), (210, 255, 49)
        ]
        for det in detections:
            x1, y1, x2, y2 = det['bbox']
            label = f"{det['class_name']} {det['confidence']:.2f}"
            color = COCO_COLORS[det['class_id'] % len(COCO_COLORS)]
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, box_thickness)
            cv2.putText(frame, label, (x1, max(y1 - 10, 0)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
        return frame
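

# The preprocess above stretches frames to 640x640, while Ultralytics-exported
# models are typically fed letterboxed (aspect-preserving, padded) input. A
# minimal letterbox sketch, should you want to swap it in; the helper name and
# the pad color 114 follow common YOLO convention and are illustrative:
def letterbox_image(img: np.ndarray, new_shape: Tuple[int, int] = (640, 640),
                    color: Tuple[int, int, int] = (114, 114, 114)) -> np.ndarray:
    h, w = img.shape[:2]
    r = min(new_shape[0] / h, new_shape[1] / w)  # uniform scale factor
    nh, nw = int(round(h * r)), int(round(w * r))
    resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_LINEAR)
    top = (new_shape[0] - nh) // 2
    bottom = new_shape[0] - nh - top
    left = (new_shape[1] - nw) // 2
    right = new_shape[1] - nw - left
    # Pad to the target shape so the content stays centered
    return cv2.copyMakeBorder(resized, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=color)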
# --- Video/Image/Live Inference ---
def run_inference(detector: OpenVINOYOLODetector, source=0, conf_threshold=0.25, flip=False, use_popup=False, video_width=None):
    if isinstance(source, str) and not os.path.exists(source):
        # Missing file: fetch the OpenVINO sample video and save it under the
        # requested name so the demo still has something to run on
        print(f"Downloading sample video: {source}")
        import requests
        url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4"
        r = requests.get(url)
        with open(source, 'wb') as f:
            f.write(r.content)
    cap = cv2.VideoCapture(source)
    if not cap.isOpened():
        print(f"Failed to open video source: {source}")
        return
    window_name = "YOLOv11x + OpenVINO Detection"
    if use_popup:
        cv2.namedWindow(window_name, cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE)
    frame_count = 0
    times = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if flip:
            frame = cv2.flip(frame, 1)
        if video_width:
            # Scale so the frame width matches video_width
            scale = video_width / frame.shape[1]
            frame = cv2.resize(frame, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)
        start = time.time()
        detections = detector.infer(frame, conf_threshold=conf_threshold)
        frame = detector.draw(frame, detections)
        elapsed = time.time() - start
        times.append(elapsed)
        if len(times) > 200:
            times.pop(0)
        fps = 1.0 / np.mean(times) if times else 0
        cv2.putText(frame, f"FPS: {fps:.1f}", (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
        cv2.imshow(window_name, frame)
        if cv2.waitKey(1) & 0xFF == 27:  # Esc to quit
            break
        frame_count += 1
    cap.release()
    cv2.destroyAllWindows()
# --- Main Entrypoint ---
if __name__ == "__main__":
    # Choose model: yolo11x, yolo11n, etc.
    MODEL_NAME = "yolo11x"
    DEVICE = "AUTO"  # or "CPU", "GPU"
    # Step 1: Convert model if needed
    ov_xml = convert_yolo_to_openvino(MODEL_NAME)
    # Step 2: Quantize (optional; this demo skips actual quantization)
    ov_xml = quantize_openvino_model(ov_xml, MODEL_NAME)
    # Step 3: Create detector
    detector = OpenVINOYOLODetector(ov_xml, device=DEVICE)
    # Step 4: Run on webcam, video, or image
    # Webcam: source=0, Video: source="video.mp4", Image: source="image.jpg"
    run_inference(detector, source=0, conf_threshold=0.25, flip=True, use_popup=True, video_width=1280)
    # To run on a video file: run_inference(detector, source="people.mp4", conf_threshold=0.25)
    # To run on an image: run_inference(detector, source="image.jpg", conf_threshold=0.25)
    # To run async or batch, extend the OpenVINOYOLODetector class with async API
    # as needed (a rough sketch follows below).
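

# A rough async sketch using OpenVINO's AsyncInferQueue (the helper below is
# illustrative, not part of the pipeline above): the queue keeps several infer
# requests in flight so preprocessing overlaps with device execution.
def infer_frames_async(detector: OpenVINOYOLODetector, frames: List[np.ndarray],
                       conf_threshold: float = 0.25, num_requests: int = 4) -> List[List[Dict]]:
    raw_outputs = [None] * len(frames)

    def on_done(request, idx):
        # Copy the raw tensor out of the request; postprocess afterwards
        raw_outputs[idx] = request.get_output_tensor(0).data.copy()

    queue = ov.AsyncInferQueue(detector.compiled_model, num_requests)
    queue.set_callback(on_done)
    for idx, frame in enumerate(frames):
        queue.start_async({0: detector.preprocess(frame)}, userdata=idx)
    queue.wait_all()
    return [detector.postprocess(out, frames[i].shape, conf_threshold)
            for i, out in enumerate(raw_outputs)]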

def postprocess_openvino_yolo(output, conf_threshold=0.4, iou_threshold=0.5, input_shape=(640, 640), original_shape=None):
    """
    output: OpenVINO raw output tensor (e.g., shape [1, 25200, 85])
    conf_threshold: minimum confidence
    iou_threshold: IoU threshold for NMS
    input_shape: model input size (w, h)
    original_shape: original image size (w, h)

    Note: boxes are assumed to already be in corner (x1, y1, x2, y2) format;
    convert center-format (cx, cy, w, h) predictions first if needed.
    """
    # 1. Squeeze batch dimension
    output = np.squeeze(output)  # [25200, 85]
    # 2. Split predictions
    boxes = output[:, :4]
    obj_conf = output[:, 4]
    class_scores = output[:, 5:]
    # 3. Get class with highest score
    class_ids = np.argmax(class_scores, axis=1)
    class_conf = class_scores[np.arange(len(class_scores)), class_ids]
    # 4. Multiply objectness confidence with class confidence
    scores = obj_conf * class_conf
    # 5. Filter by confidence threshold
    mask = scores > conf_threshold
    boxes = boxes[mask]
    scores = scores[mask]
    class_ids = class_ids[mask]
    if original_shape is not None:
        # Rescale boxes from input_shape to the original image shape
        input_w, input_h = input_shape
        orig_w, orig_h = original_shape
        scale_x = orig_w / input_w
        scale_y = orig_h / input_h
        boxes[:, 0] *= scale_x  # x1
        boxes[:, 1] *= scale_y  # y1
        boxes[:, 2] *= scale_x  # x2
        boxes[:, 3] *= scale_y  # y2
    # 6. Convert boxes to [x, y, w, h] format for OpenCV NMS
    boxes_xywh = [[x1, y1, x2 - x1, y2 - y1] for x1, y1, x2, y2 in boxes]
    # 7. Apply NMS once and gather the surviving detections
    result_boxes, result_scores, result_classes = [], [], []
    if len(boxes) > 0:
        indices = cv2.dnn.NMSBoxes(boxes_xywh, scores.tolist(), conf_threshold, iou_threshold)
        if len(indices) > 0:
            for i in np.array(indices).flatten():
                i = int(i)
                result_boxes.append(boxes[i])
                result_scores.append(scores[i])
                result_classes.append(class_ids[i])
    return result_boxes, result_scores, result_classes
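

# Sketch of the center-to-corner conversion mentioned in the docstring above,
# in case your model emits (cx, cy, w, h) boxes (helper name is illustrative):
def xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
    out = boxes.copy()
    out[:, 0] = boxes[:, 0] - boxes[:, 2] / 2  # x1 = cx - w/2
    out[:, 1] = boxes[:, 1] - boxes[:, 3] / 2  # y1 = cy - h/2
    out[:, 2] = boxes[:, 0] + boxes[:, 2] / 2  # x2 = cx + w/2
    out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2  # y2 = cy + h/2
    return out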

# Traffic-related classes for the vehicle detector, keyed by COCO class id so
# that model outputs map to the correct names
TRAFFIC_ONLY_CLASSES = {
    0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 5: 'bus', 7: 'truck',
    9: 'traffic light', 11: 'stop sign', 12: 'parking meter'
}
class OpenVINOVehicleDetector:
    def __init__(self, model_path: str = None, device: str = "AUTO", use_quantized: bool = False, enable_ocr: bool = False, confidence_threshold: float = 0.4):
        self.device = device
        self.confidence_threshold = confidence_threshold
        self.ocr_reader = None
        self.class_names = TRAFFIC_ONLY_CLASSES
        self.performance_stats = {
            'fps': 0,
            'avg_inference_time': 0,
            'frames_processed': 0,
            'backend': f"OpenVINO-{device}",
            'total_detections': 0,
            'detection_rate': 0
        }
        self._inference_times = []
        self._start_time = time.time()
        self._frame_count = 0
        # Model selection logic
        self.model_path = self._find_best_model(model_path, use_quantized)
        self.core = ov.Core()
        self.model = self.core.read_model(self.model_path)
        # Always reshape to a static shape before accessing .shape
        self.model.reshape({0: [1, 3, 640, 640]})
        self.input_shape = self.model.inputs[0].shape
        self.input_height = self.input_shape[2]
        self.input_width = self.input_shape[3]
        self.ov_config = {}
        if "GPU" in device or ("AUTO" in device and "GPU" in self.core.available_devices):
            self.ov_config = {"GPU_DISABLE_WINOGRAD_CONVOLUTION": "YES"}
        self.compiled_model = self.core.compile_model(model=self.model, device_name=self.device, config=self.ov_config)
        self.output_layer = self.compiled_model.output(0)

    def _find_best_model(self, model_path, use_quantized):
        # Priority: quantized IR > IR > .pt
        # Note: a .pt fallback still needs conversion (see
        # convert_yolo_to_openvino) before core.read_model can load it.
        search_paths = [
            Path(model_path) if model_path else None,
            Path("yolo11x_openvino_int8_model/yolo11x.xml") if use_quantized else None,
            Path("yolo11x_openvino_model/yolo11x.xml"),
            Path("rcb/yolo11x_openvino_model/yolo11x.xml"),
            Path("yolo11x.xml"),
            Path("rcb/yolo11x.xml"),
            Path("yolo11x.pt"),
            Path("rcb/yolo11x.pt")
        ]
        for p in search_paths:
            if p and p.exists():
                return str(p)
        raise FileNotFoundError("No suitable YOLOv11x model found for OpenVINO.")
    def detect_vehicles(self, frame: np.ndarray, conf_threshold: float = None) -> List[Dict]:
        if conf_threshold is None:
            conf_threshold = self.confidence_threshold
        start = time.time()
        input_tensor = self._preprocess(frame)
        output = self.compiled_model([input_tensor])[self.output_layer]
        detections = self._postprocess(output, frame.shape, conf_threshold)
        elapsed = time.time() - start
        self._inference_times.append(elapsed)
        self._frame_count += 1
        self.performance_stats['frames_processed'] = self._frame_count
        self.performance_stats['total_detections'] += len(detections)
        if len(self._inference_times) > 100:
            self._inference_times.pop(0)
        self.performance_stats['avg_inference_time'] = float(np.mean(self._inference_times)) if self._inference_times else 0
        total_time = time.time() - self._start_time
        self.performance_stats['fps'] = self._frame_count / total_time if total_time > 0 else 0
        return detections

    def _preprocess(self, frame: np.ndarray) -> np.ndarray:
        img = cv2.resize(frame, (self.input_width, self.input_height))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        img = img.transpose(2, 0, 1)[None]
        return img

    def _postprocess(self, output: np.ndarray, frame_shape, conf_threshold: float) -> List[Dict]:
        # Output: (1, 84, 8400), (84, 8400), or (8400, 84)
        if output.ndim == 3:
            output = np.squeeze(output)
        if output.shape[0] == 84:
            output = output.T  # (8400, 84)
        boxes = output[:, :4]
        scores = output[:, 4:]
        class_ids = np.argmax(scores, axis=1)
        confidences = np.max(scores, axis=1)
        detections = []
        h, w = frame_shape[:2]
        for box, score, class_id in zip(boxes, confidences, class_ids):
            if score < conf_threshold:
                continue
            x_c, y_c, bw, bh = box
            # Heuristic: coordinates in [0, 1] are treated as normalized
            if all(0.0 <= v <= 1.0 for v in box):
                x_c *= self.input_width
                y_c *= self.input_height
                bw *= self.input_width
                bh *= self.input_height
            # Scale from network input size to the original frame size
            scale_x = w / self.input_width
            scale_y = h / self.input_height
            x_c *= scale_x
            y_c *= scale_y
            bw *= scale_x
            bh *= scale_y
            x1 = int(round(x_c - bw / 2))
            y1 = int(round(y_c - bh / 2))
            x2 = int(round(x_c + bw / 2))
            y2 = int(round(y_c + bh / 2))
            x1 = max(0, min(x1, w - 1))
            y1 = max(0, min(y1, h - 1))
            x2 = max(0, min(x2, w - 1))
            y2 = max(0, min(y2, h - 1))
            if x2 <= x1 or y2 <= y1:
                continue
            # Keep only the traffic-related classes
            class_name = TRAFFIC_ONLY_CLASSES.get(int(class_id))
            if class_name is None:
                continue
            detections.append({
                'bbox': [x1, y1, x2, y2],
                'confidence': float(score),
                'class_id': int(class_id),
                'class_name': class_name
            })
        # Apply NMS (cv2.dnn.NMSBoxes expects [x, y, w, h] boxes)
        if len(detections) > 0:
            boxes_xywh = [[x1, y1, x2 - x1, y2 - y1]
                          for x1, y1, x2, y2 in (det['bbox'] for det in detections)]
            scores_list = [det['confidence'] for det in detections]
            indices = cv2.dnn.NMSBoxes(boxes_xywh, scores_list, conf_threshold, 0.5)
            if len(indices) > 0:
                detections = [detections[int(i)] for i in np.array(indices).flatten()]
            else:
                detections = []
        return detections
    def draw(self, frame: np.ndarray, detections: List[Dict], box_thickness: int = 2) -> np.ndarray:
        # Color palette for the 80 COCO classes (BGR); lookup wraps via modulo
        COCO_COLORS = [
            (255, 56, 56), (255, 157, 151), (255, 112, 31), (255, 178, 29), (207, 210, 49),
            (72, 249, 10), (146, 204, 23), (61, 219, 134), (26, 147, 52), (0, 212, 187),
            (44, 153, 168), (0, 194, 255), (52, 69, 147), (100, 115, 255), (0, 24, 236),
            (132, 56, 255), (82, 0, 133), (203, 56, 255), (255, 149, 200), (255, 55, 199),
            (255, 255, 56), (255, 255, 151), (255, 255, 31), (255, 255, 29), (207, 255, 49),
            (72, 255, 10), (146, 255, 23), (61, 255, 134), (26, 255, 52), (0, 255, 187),
            (44, 255, 168), (0, 255, 255), (52, 255, 147), (100, 255, 255), (0, 255, 236),
            (132, 255, 255), (82, 255, 133), (203, 255, 255), (255, 255, 200), (255, 255, 199),
            (56, 255, 255), (157, 255, 151), (112, 255, 31), (178, 255, 29), (210, 255, 49),
            (249, 255, 10), (204, 255, 23), (219, 255, 134), (147, 255, 52), (212, 255, 187),
            (153, 255, 168), (194, 255, 255), (69, 255, 147), (115, 255, 255), (24, 255, 236),
            (56, 132, 255), (157, 82, 151), (112, 203, 31), (178, 255, 29), (210, 255, 49),
            (249, 72, 10), (204, 146, 23), (219, 61, 134), (147, 26, 52), (212, 0, 187),
            (153, 44, 168), (194, 0, 255), (69, 52, 147), (115, 100, 255), (24, 0, 236),
            (56, 132, 255), (157, 82, 151), (112, 203, 31), (178, 255, 29), (210, 255, 49),
            (249, 72, 10), (204, 146, 23), (219, 61, 134), (147, 26, 52), (212, 0, 187),
            (153, 44, 168), (194, 0, 255), (69, 52, 147), (115, 100, 255), (24, 0, 236),
            (56, 132, 255), (157, 82, 151), (112, 203, 31), (178, 255, 29), (210, 255, 49)
        ]
        for det in detections:
            x1, y1, x2, y2 = det['bbox']
            label = f"{det['class_name']} {det['confidence']:.2f}"
            color = COCO_COLORS[det['class_id'] % len(COCO_COLORS)]
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, box_thickness)
            cv2.putText(frame, label, (x1, max(y1 - 10, 0)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
        return frame
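

# Example usage (sketch; assumes a webcam at index 0 and an IR model that
# _find_best_model can locate):
#   vdet = OpenVINOVehicleDetector(device="AUTO", confidence_threshold=0.4)
#   cap = cv2.VideoCapture(0)
#   ok, frame = cap.read()
#   if ok:
#       dets = vdet.detect_vehicles(frame)
#       frame = vdet.draw(frame, dets)
#       print(vdet.performance_stats)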