from PySide6.QtWidgets import (
    QWidget, QVBoxLayout, QHBoxLayout, QLabel, QPushButton,
    QTextEdit, QLineEdit, QFrame, QGroupBox, QScrollArea
)
from PySide6.QtCore import Qt, Signal, Slot, QTimer
from PySide6.QtGui import QPixmap, QImage, QFont, QColor
import sys
import os
import cv2
import numpy as np
from pathlib import Path
from datetime import datetime

# Add parent directory to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.annotation_utils import convert_cv_to_qimage, convert_cv_to_pixmap


class VLMInsightsWidget(QWidget):
    """Widget for Vision Language Model insights in settings panel."""

    analyze_frame_requested = Signal(np.ndarray, str)  # image, prompt

    def __init__(self):
        super().__init__()
        print("[VLM INSIGHTS DEBUG] Initializing VLM Insights Widget")
        self.current_frame = None
        self.detection_data = None  # Store detection data from video controller
        self.is_video_paused = False
        self.setupUI()
        self.setVisible(True)  # Make it visible by default in config panel
        print("[VLM INSIGHTS DEBUG] VLM Insights Widget initialized")

    def setupUI(self):
        """Set up the user interface."""
        main_layout = QVBoxLayout()
        main_layout.setSpacing(8)
        main_layout.setContentsMargins(10, 10, 10, 10)

        # === VLM Insights Group ===
        insights_group = QGroupBox(" Scene Analysis")
        insights_group.setStyleSheet("""
            QGroupBox {
                font-weight: bold;
                font-size: 14px;
                color: #00d4aa;
                border: 2px solid #00d4aa;
                border-radius: 8px;
                margin-top: 12px;
                padding-top: 10px;
            }
            QGroupBox::title {
                subcontrol-origin: margin;
                left: 10px;
                padding: 0 8px 0 8px;
                background-color: #2b2b2b;
            }
        """)
        insights_layout = QVBoxLayout()

        # Status label
        self.status_label = QLabel("📹 Pause video to analyze current frame")
        self.status_label.setStyleSheet("color: #888; font-style: italic; padding: 5px;")
        insights_layout.addWidget(self.status_label)

        # Current frame thumbnail
        self.frame_thumbnail = QLabel("No frame")
        self.frame_thumbnail.setAlignment(Qt.AlignCenter)
        self.frame_thumbnail.setFixedSize(150, 100)
        self.frame_thumbnail.setStyleSheet("""
            background-color: #1e1e1e;
            border: 1px solid #444;
            border-radius: 4px;
        """)
        insights_layout.addWidget(self.frame_thumbnail)

        # Custom prompt input
        prompt_layout = QHBoxLayout()
        self.prompt_input = QLineEdit()
        self.prompt_input.setPlaceholderText("Enter your question about the scene...")
        self.prompt_input.setStyleSheet("""
            QLineEdit {
                background-color: #1e1e1e;
                border: 1px solid #444;
                border-radius: 4px;
                padding: 8px;
                color: white;
                font-size: 12px;
            }
            QLineEdit:focus {
                border: 1px solid #00d4aa;
            }
        """)
        self.prompt_input.returnPressed.connect(self._analyze_custom)

        self.analyze_custom_btn = QPushButton(" Analyze")
        self.analyze_custom_btn.setStyleSheet(self._get_button_style())
        self.analyze_custom_btn.clicked.connect(self._analyze_custom)

        prompt_layout.addWidget(self.prompt_input)
        prompt_layout.addWidget(self.analyze_custom_btn)
        insights_layout.addLayout(prompt_layout)

        # Results area with scroll
        results_scroll = QScrollArea()
        results_scroll.setWidgetResizable(True)
        results_scroll.setMaximumHeight(200)
        results_scroll.setStyleSheet("""
            QScrollArea {
                background-color: #1e1e1e;
                border: 1px solid #444;
                border-radius: 4px;
            }
        """)
        self.results_text = QTextEdit()
        self.results_text.setReadOnly(True)
        self.results_text.setStyleSheet("""
            QTextEdit {
                background-color: #1e1e1e;
                border: none;
                color: #e0e0e0;
                font-size: 11px;
                padding: 8px;
            }
        """)
        self.results_text.setPlaceholderText("AI insights will appear here...")
        results_scroll.setWidget(self.results_text)
        insights_layout.addWidget(results_scroll)

        insights_group.setLayout(insights_layout)
        main_layout.addWidget(insights_group)
        main_layout.addStretch()
        self.setLayout(main_layout)

        # Initially disable analysis buttons
        self._set_analysis_enabled(False)

    def _get_button_style(self):
        """Get consistent button styling."""
        return """
            QPushButton {
                background-color: #00d4aa;
                color: #1a1a1a;
                border: none;
                border-radius: 4px;
                padding: 6px 12px;
                font-weight: bold;
                font-size: 11px;
            }
            QPushButton:hover {
                background-color: #00b89a;
            }
            QPushButton:pressed {
                background-color: #008f7a;
            }
            QPushButton:disabled {
                background-color: #444;
                color: #888;
            }
        """

    def _set_analysis_enabled(self, enabled):
        """Enable/disable the analysis controls (button and prompt input)."""
        self.analyze_custom_btn.setEnabled(enabled)
        self.prompt_input.setEnabled(enabled)

    @Slot(bool)
    def on_video_paused(self, is_paused):
        """Called when video is paused/unpaused."""
        print(f"[VLM INSIGHTS DEBUG] Video pause state changed: {is_paused}")
        print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}")
        self.is_video_paused = is_paused
        if is_paused:
            self.status_label.setText("✅ Video paused - Frame ready for analysis")
            self.status_label.setStyleSheet("color: #00d4aa; font-weight: bold; padding: 5px;")
            self._set_analysis_enabled(True)
            self.setVisible(True)
            print("[VLM INSIGHTS DEBUG] VLM insights widget made visible and enabled")
            # If we have a current frame, update the thumbnail immediately
            if self.current_frame is not None:
                print(f"[VLM INSIGHTS DEBUG] Updating thumbnail with current frame: {self.current_frame.shape}")
                self._update_thumbnail(self.current_frame)
            else:
                print("[VLM INSIGHTS DEBUG] No current frame available for thumbnail")
        else:
            self.status_label.setText("📹 Pause video to analyze current frame")
            self.status_label.setStyleSheet("color: #888; font-style: italic; padding: 5px;")
            self._set_analysis_enabled(False)
            print("[VLM INSIGHTS DEBUG] VLM insights widget disabled")

    @Slot(np.ndarray)
    def set_current_frame(self, frame):
        """Set the current frame for analysis."""
        print(f"[VLM INSIGHTS DEBUG] Received frame: {frame.shape if frame is not None else 'None'}")
        if frame is not None:
            self.current_frame = frame.copy()
            self._update_thumbnail(self.current_frame)

    def set_detection_data(self, detection_data):
        """Store detection data from the video controller for rich VLM analysis.

        The widget reads these keys when building prompts (all optional):
        'detections', 'fps', 'crosswalk_detected', 'crosswalk_bbox',
        'violation_line_y', and 'traffic_light'.
        """
        print("[VLM INSIGHTS DEBUG] Received detection data")
        print(f"[VLM INSIGHTS DEBUG] Data keys: {list(detection_data.keys()) if detection_data else 'None'}")
        self.detection_data = detection_data
        if detection_data and 'detections' in detection_data:
            detections = detection_data['detections']
            print(f"[VLM INSIGHTS DEBUG] Detections count: {len(detections)}")
            # Log the first few detections for debugging
            for i, det in enumerate(detections[:3]):
                if hasattr(det, '__dict__'):
                    print(f"[VLM INSIGHTS DEBUG] Detection {i}: {type(det)} - {getattr(det, 'class_name', 'unknown')}")
                elif isinstance(det, dict):
                    print(f"[VLM INSIGHTS DEBUG] Detection {i}: {det.get('class_name', det.get('label', 'unknown'))}")
                else:
                    print(f"[VLM INSIGHTS DEBUG] Detection {i}: {type(det)}")
            print("[VLM INSIGHTS DEBUG] Detection data stored successfully")

    def _update_thumbnail(self, frame):
        """Update the frame thumbnail display."""
        if frame is None:
            print("[VLM INSIGHTS DEBUG] Cannot update thumbnail - no frame provided")
            return
        try:
            h, w = frame.shape[:2]
            if h > 0 and w > 0:
                # Scale to fit the 150x100 thumbnail, preserving aspect ratio
                thumb_h, thumb_w = 100, 150
                scale = min(thumb_w / w, thumb_h / h)
                new_w, new_h = int(w * scale), int(h * scale)
                thumbnail = cv2.resize(frame, (new_w, new_h))
                # Convert to QImage; .copy() detaches the QImage from the numpy
                # buffer, which it would otherwise only borrow
                if len(thumbnail.shape) == 3:
                    rgb_thumbnail = cv2.cvtColor(thumbnail, cv2.COLOR_BGR2RGB)
                    h, w, ch = rgb_thumbnail.shape
                    bytes_per_line = ch * w
                    qimage = QImage(rgb_thumbnail.data, w, h, bytes_per_line, QImage.Format_RGB888).copy()
                else:
                    h, w = thumbnail.shape
                    bytes_per_line = w
                    qimage = QImage(thumbnail.data, w, h, bytes_per_line, QImage.Format_Grayscale8).copy()
                pixmap = QPixmap.fromImage(qimage)
                self.frame_thumbnail.setPixmap(pixmap)
                print("[VLM INSIGHTS DEBUG] Frame thumbnail updated successfully")
        except Exception as e:
            print(f"[VLM INSIGHTS DEBUG] Error updating thumbnail: {e}")

    def _quick_analyze(self, prompt):
        """Perform quick analysis with a predefined prompt."""
        print("[VLM INSIGHTS DEBUG] _quick_analyze called")
        print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}")
        print(f"[VLM INSIGHTS DEBUG] Detection data available: {self.detection_data is not None}")
        print(f"[VLM INSIGHTS DEBUG] Video paused: {self.is_video_paused}")
        print(f"[VLM INSIGHTS DEBUG] Prompt: {prompt[:50]}...")
        if self.current_frame is not None and self.is_video_paused:
            # Enhance prompt with detection data
            print("[VLM INSIGHTS DEBUG] Enhancing prompt with detection data")
            enhanced_prompt = self._enhance_prompt_with_detections(prompt)
            print(f"[VLM INSIGHTS DEBUG] Enhanced prompt length: {len(enhanced_prompt)} characters")
            print("[VLM INSIGHTS DEBUG] Emitting analyze_frame_requested signal")
            self.results_text.append(f"\n🔍 Analyzing: {prompt[:50]}...")
            self.analyze_frame_requested.emit(self.current_frame, enhanced_prompt)
            print("[VLM INSIGHTS DEBUG] Signal emitted successfully")
        else:
            print(f"[VLM INSIGHTS DEBUG] Cannot analyze - frame: {self.current_frame is not None}, paused: {self.is_video_paused}")
            if self.current_frame is None:
                self.results_text.append("\n❌ No frame available for analysis")
            if not self.is_video_paused:
                self.results_text.append("\n❌ Video must be paused for analysis")

    def _enhance_prompt_with_detections(self, base_prompt):
        """Enhance the analysis prompt with detection data."""
        try:
            enhanced_parts = [base_prompt]
            if self.detection_data and 'detections' in self.detection_data:
                detections = self.detection_data['detections']
                enhanced_parts.append("\n\nDETECTION CONTEXT:")
                enhanced_parts.append(f"Total detections: {len(detections)}")
                # Categorize detections
                vehicles = []
                traffic_lights = []
                other_objects = []
                for det in detections:
                    if hasattr(det, 'class_name'):
                        class_name = det.class_name
                        bbox = getattr(det, 'bbox', None)
                        track_id = getattr(det, 'track_id', None)
                        confidence = getattr(det, 'confidence', None)
                    elif isinstance(det, dict):
                        class_name = det.get('class_name', det.get('label', 'unknown'))
                        bbox = det.get('bbox', det.get('box', None))
                        track_id = det.get('track_id', det.get('id', None))
                        confidence = det.get('confidence', det.get('conf', None))
                    else:
                        continue
                    detection_info = {
                        'class': class_name,
                        'bbox': bbox,
                        'track_id': track_id,
                        'confidence': confidence
                    }
                    if class_name in ['car', 'truck', 'bus', 'motorcycle', 'vehicle']:
                        vehicles.append(detection_info)
                    elif 'traffic' in class_name.lower() or 'light' in class_name.lower():
                        traffic_lights.append(detection_info)
                    else:
                        other_objects.append(detection_info)

                # Add vehicle information ('is not None' so an ID or confidence
                # of 0 still shows, and array-valued bboxes don't raise)
                if vehicles:
                    enhanced_parts.append(f"\nVEHICLES ({len(vehicles)}):")
                    for vehicle in vehicles:
                        track_info = f" (ID: {vehicle['track_id']})" if vehicle['track_id'] is not None else ""
                        conf_info = f" (conf: {vehicle['confidence']:.2f})" if vehicle['confidence'] is not None else ""
                        bbox_info = f" at {vehicle['bbox']}" if vehicle['bbox'] is not None else ""
                        enhanced_parts.append(f" - {vehicle['class']}{track_info}{conf_info}{bbox_info}")

                # Add traffic light information
                if traffic_lights:
                    enhanced_parts.append(f"\nTRAFFIC LIGHTS ({len(traffic_lights)}):")
                    for tl in traffic_lights:
                        conf_info = f" (conf: {tl['confidence']:.2f})" if tl['confidence'] is not None else ""
                        bbox_info = f" at {tl['bbox']}" if tl['bbox'] is not None else ""
                        enhanced_parts.append(f" - {tl['class']}{conf_info}{bbox_info}")

                # Add other objects
                if other_objects:
                    enhanced_parts.append(f"\nOTHER OBJECTS ({len(other_objects)}):")
                    for obj in other_objects:
                        conf_info = f" (conf: {obj['confidence']:.2f})" if obj['confidence'] is not None else ""
                        enhanced_parts.append(f" - {obj['class']}{conf_info}")

            # Add additional context from detection data
            if self.detection_data:
                fps = self.detection_data.get('fps', 0)
                if fps > 0:
                    enhanced_parts.append(f"\nVIDEO INFO: FPS: {fps:.1f}")

                # Add crosswalk information
                crosswalk_detected = self.detection_data.get('crosswalk_detected', False)
                crosswalk_bbox = self.detection_data.get('crosswalk_bbox', None)
                violation_line_y = self.detection_data.get('violation_line_y', None)
                if crosswalk_detected:
                    enhanced_parts.append("\nCROSSWALK INFO:")
                    enhanced_parts.append(" - Crosswalk detected: YES")
                    if crosswalk_bbox is not None:
                        enhanced_parts.append(f" - Crosswalk location: {crosswalk_bbox}")
                    if violation_line_y is not None:
                        enhanced_parts.append(f" - Violation line at y={violation_line_y}")
                else:
                    enhanced_parts.append("\nCROSSWALK INFO:")
                    enhanced_parts.append(" - Crosswalk detected: NO")

                # Add traffic light status
                traffic_light = self.detection_data.get('traffic_light', {})
                if traffic_light:
                    color = traffic_light.get('color', 'unknown')
                    confidence = traffic_light.get('confidence', 0)
                    enhanced_parts.append("\nTRAFFIC LIGHT STATUS:")
                    enhanced_parts.append(f" - Current color: {color.upper()}")
                    enhanced_parts.append(f" - Confidence: {confidence:.2f}")

            # Special instructions for color analysis
            if "color" in base_prompt.lower() or "colour" in base_prompt.lower():
                enhanced_parts.append("\nSPECIAL INSTRUCTIONS:")
                enhanced_parts.append("- Carefully examine the image for vehicle colors")
                enhanced_parts.append("- Look at each car's body color (red, blue, white, black, silver, etc.)")
                enhanced_parts.append("- Ignore detection data for color questions - analyze the image visually")
                enhanced_parts.append("- List the prominent colors you can identify in the vehicles")

            enhanced_parts.append("\nAnswer the question directly based on visual analysis of the image. Be concise and specific:")
            enhanced_parts.append(f"Question: {base_prompt}")
            enhanced_parts.append("Answer:")

            enhanced_prompt = "\n".join(enhanced_parts)
            print(f"[VLM INSIGHTS DEBUG] Enhanced prompt created: {len(enhanced_prompt)} chars")
            return enhanced_prompt
        except Exception as e:
            print(f"[VLM INSIGHTS DEBUG] Error enhancing prompt: {e}")
            return base_prompt
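
    # Illustrative shape of an enhanced prompt produced by the method above
    # (values are made up; actual content depends on the supplied detection data):
    #
    #   What is happening at this intersection?
    #
    #   DETECTION CONTEXT:
    #   Total detections: 2
    #
    #   VEHICLES (1):
    #    - car (ID: 7) (conf: 0.91) at [104, 220, 310, 402]
    #
    #   TRAFFIC LIGHT STATUS:
    #    - Current color: RED
    #    - Confidence: 0.88
    #   ...
    #   Question: What is happening at this intersection?
    #   Answer: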

    def _analyze_custom(self):
        """Perform analysis with a custom prompt."""
        print("[VLM INSIGHTS DEBUG] _analyze_custom called")
        prompt = self.prompt_input.text().strip()
        print(f"[VLM INSIGHTS DEBUG] Custom prompt: '{prompt}'")
        print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}")
        print(f"[VLM INSIGHTS DEBUG] Detection data available: {self.detection_data is not None}")
        print(f"[VLM INSIGHTS DEBUG] Video paused: {self.is_video_paused}")
        if prompt and self.current_frame is not None and self.is_video_paused:
            # Enhance prompt with detection data
            print("[VLM INSIGHTS DEBUG] Enhancing custom prompt with detection data")
            enhanced_prompt = self._enhance_prompt_with_detections(prompt)
            print("[VLM INSIGHTS DEBUG] Emitting analyze_frame_requested signal for custom prompt")
            self.results_text.append(f"\n Question: {prompt}")
            self.analyze_frame_requested.emit(self.current_frame, enhanced_prompt)
            self.prompt_input.clear()
            print("[VLM INSIGHTS DEBUG] Custom analysis signal emitted successfully")
        else:
            print(f"[VLM INSIGHTS DEBUG] Cannot analyze custom - prompt: '{prompt}', frame: {self.current_frame is not None}, paused: {self.is_video_paused}")
            if not prompt:
                self.results_text.append("\n❌ Please enter a prompt for analysis")
            elif self.current_frame is None:
                self.results_text.append("\n❌ No frame available for analysis")
            elif not self.is_video_paused:
                self.results_text.append("\n❌ Video must be paused for analysis")

    @Slot(object)
    def on_analysis_result(self, result):
        """Display analysis result."""
        print(f"[VLM INSIGHTS DEBUG] Received analysis result: {type(result)}")
        print(f"[VLM INSIGHTS DEBUG] Result content: {str(result)[:200]}...")
        # Extract the actual response text from the result
        response_text = ""
        try:
            if isinstance(result, dict):
                print(f"[VLM INSIGHTS DEBUG] Result is dict with keys: {list(result.keys())}")
                # Check for the OpenVINO response format first
                if 'response' in result:
                    response_text = str(result['response'])
                    print("[VLM INSIGHTS DEBUG] Extracted response from dict")
                elif 'message' in result:
                    response_text = str(result['message'])
                    print("[VLM INSIGHTS DEBUG] Extracted message from dict")
                else:
                    response_text = str(result)
                    print("[VLM INSIGHTS DEBUG] Using dict as string")
            elif isinstance(result, str):
                response_text = result
                print("[VLM INSIGHTS DEBUG] Result is already string")
            else:
                # Fall back to converting any other type to string
                response_text = str(result)
                print("[VLM INSIGHTS DEBUG] Converted to string")
            # Clean up the response text
            response_text = response_text.strip()
            if not response_text:
                response_text = "No response text found in result."
        except Exception as e:
            print(f"[VLM INSIGHTS DEBUG] Error extracting response: {e}")
            response_text = f"Error extracting response: {str(e)}"

        timestamp = datetime.now().strftime("%H:%M:%S")
        formatted_response = f"[{timestamp}] {response_text}"
        print(f"[VLM INSIGHTS DEBUG] Final response: {response_text[:100]}...")
        self.results_text.append(f" Answer: {formatted_response}\n")
        # Auto-scroll to bottom
        scrollbar = self.results_text.verticalScrollBar()
        scrollbar.setValue(scrollbar.maximum())
        print("[VLM INSIGHTS DEBUG] Analysis result displayed successfully")