from PySide6.QtWidgets import ( QWidget, QVBoxLayout, QHBoxLayout, QLabel, QPushButton, QTextEdit, QLineEdit, QFrame, QGroupBox, QScrollArea ) from PySide6.QtCore import Qt, Signal, Slot, QTimer from PySide6.QtGui import QPixmap, QImage, QFont, QColor import sys import os import cv2 import numpy as np from pathlib import Path from datetime import datetime # Add parent directory to path for imports sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from utils.annotation_utils import convert_cv_to_qimage, convert_cv_to_pixmap class VLMInsightsWidget(QWidget): """Widget for Vision Language Model insights in settings panel.""" analyze_frame_requested = Signal(np.ndarray, str) # image, prompt def __init__(self): super().__init__() print("[VLM INSIGHTS DEBUG] Initializing VLM Insights Widget") self.setupUI() self.current_frame = None self.detection_data = None # Store detection data from video controller self.is_video_paused = False self.setVisible(True) # Make it visible by default in config panel print("[VLM INSIGHTS DEBUG] VLM Insights Widget initialized") def setupUI(self): """Set up the user interface.""" main_layout = QVBoxLayout() main_layout.setSpacing(8) main_layout.setContentsMargins(10, 10, 10, 10) # === VLM Insights Group === insights_group = QGroupBox(" Scene Analysis") insights_group.setStyleSheet(""" QGroupBox { font-weight: bold; font-size: 14px; color: #00d4aa; border: 2px solid #00d4aa; border-radius: 8px; margin-top: 12px; padding-top: 10px; } QGroupBox::title { subcontrol-origin: margin; left: 10px; padding: 0 8px 0 8px; background-color: #2b2b2b; } """) insights_layout = QVBoxLayout() # Status label self.status_label = QLabel("šŸ“¹ Pause video to analyze current frame") self.status_label.setStyleSheet("color: #888; font-style: italic; padding: 5px;") insights_layout.addWidget(self.status_label) # Current frame thumbnail self.frame_thumbnail = QLabel("No frame") self.frame_thumbnail.setAlignment(Qt.AlignCenter) self.frame_thumbnail.setFixedSize(150, 100) self.frame_thumbnail.setStyleSheet(""" background-color: #1e1e1e; border: 1px solid #444; border-radius: 4px; """) insights_layout.addWidget(self.frame_thumbnail) # Custom prompt input prompt_layout = QHBoxLayout() self.prompt_input = QLineEdit() self.prompt_input.setPlaceholderText("Enter your question about the scene...") self.prompt_input.setStyleSheet(""" QLineEdit { background-color: #1e1e1e; border: 1px solid #444; border-radius: 4px; padding: 8px; color: white; font-size: 12px; } QLineEdit:focus { border: 1px solid #00d4aa; } """) self.prompt_input.returnPressed.connect(self._analyze_custom) self.analyze_custom_btn = QPushButton(" Analyze") self.analyze_custom_btn.setStyleSheet(self._get_button_style()) self.analyze_custom_btn.clicked.connect(self._analyze_custom) prompt_layout.addWidget(self.prompt_input) prompt_layout.addWidget(self.analyze_custom_btn) insights_layout.addLayout(prompt_layout) # Results area with scroll results_scroll = QScrollArea() results_scroll.setWidgetResizable(True) results_scroll.setMaximumHeight(200) results_scroll.setStyleSheet(""" QScrollArea { background-color: #1e1e1e; border: 1px solid #444; border-radius: 4px; } """) self.results_text = QTextEdit() self.results_text.setReadOnly(True) self.results_text.setStyleSheet(""" QTextEdit { background-color: #1e1e1e; border: none; color: #e0e0e0; font-size: 11px; padding: 8px; } """) self.results_text.setPlaceholderText("AI insights will appear here...") results_scroll.setWidget(self.results_text) insights_layout.addWidget(results_scroll) insights_group.setLayout(insights_layout) main_layout.addWidget(insights_group) main_layout.addStretch() self.setLayout(main_layout) # Initially disable analysis buttons self._set_analysis_enabled(False) def _get_button_style(self): """Get consistent button styling.""" return """ QPushButton { background-color: #00d4aa; color: #1a1a1a; border: none; border-radius: 4px; padding: 6px 12px; font-weight: bold; font-size: 11px; } QPushButton:hover { background-color: #00b89a; } QPushButton:pressed { background-color: #008f7a; } QPushButton:disabled { background-color: #444; color: #888; } """ def _set_analysis_enabled(self, enabled): """Enable/disable analysis buttons.""" self.analyze_custom_btn.setEnabled(enabled) self.prompt_input.setEnabled(enabled) @Slot(bool) def on_video_paused(self, is_paused): """Called when video is paused/unpaused.""" print(f"[VLM INSIGHTS DEBUG] Video pause state changed: {is_paused}") print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}") self.is_video_paused = is_paused if is_paused: self.status_label.setText("āœ… Video paused - Frame ready for analysis") self.status_label.setStyleSheet("color: #00d4aa; font-weight: bold; padding: 5px;") self._set_analysis_enabled(True) self.setVisible(True) print("[VLM INSIGHTS DEBUG] VLM insights widget made visible and enabled") # If we have a current frame, update the thumbnail immediately if self.current_frame is not None: print(f"[VLM INSIGHTS DEBUG] Updating thumbnail with current frame: {self.current_frame.shape}") self._update_thumbnail(self.current_frame) else: print("[VLM INSIGHTS DEBUG] No current frame available for thumbnail") else: self.status_label.setText("šŸ“¹ Pause video to analyze current frame") self.status_label.setStyleSheet("color: #888; font-style: italic; padding: 5px;") self._set_analysis_enabled(False) print("[VLM INSIGHTS DEBUG] VLM insights widget disabled") @Slot(np.ndarray) def set_current_frame(self, frame): """Set the current frame for analysis.""" print(f"[VLM INSIGHTS DEBUG] Received frame: {frame.shape if frame is not None else 'None'}") if frame is not None: self.current_frame = frame.copy() self._update_thumbnail(self.current_frame) def set_detection_data(self, detection_data): """Set detection data from video controller for rich VLM analysis.""" print(f"[VLM INSIGHTS DEBUG] Received detection data") print(f"[VLM INSIGHTS DEBUG] Data keys: {list(detection_data.keys()) if detection_data else 'None'}") self.detection_data = detection_data if detection_data and 'detections' in detection_data: detections = detection_data['detections'] print(f"[VLM INSIGHTS DEBUG] Detections count: {len(detections)}") # Log some detection info for debugging for i, det in enumerate(detections[:3]): # Show first 3 if hasattr(det, '__dict__'): print(f"[VLM INSIGHTS DEBUG] Detection {i}: {type(det)} - {getattr(det, 'class_name', 'unknown')}") elif isinstance(det, dict): print(f"[VLM INSIGHTS DEBUG] Detection {i}: {det.get('class_name', det.get('label', 'unknown'))}") else: print(f"[VLM INSIGHTS DEBUG] Detection {i}: {type(det)}") print(f"[VLM INSIGHTS DEBUG] Detection data stored successfully") def _update_thumbnail(self, frame): """Update the frame thumbnail display.""" if frame is None: print("[VLM INSIGHTS DEBUG] Cannot update thumbnail - no frame provided") return try: # Create thumbnail h, w = frame.shape[:2] if h > 0 and w > 0: # Scale to fit thumbnail thumb_h, thumb_w = 100, 150 scale = min(thumb_w/w, thumb_h/h) new_w, new_h = int(w*scale), int(h*scale) thumbnail = cv2.resize(frame, (new_w, new_h)) # Convert to QPixmap if len(thumbnail.shape) == 3: rgb_thumbnail = cv2.cvtColor(thumbnail, cv2.COLOR_BGR2RGB) h, w, ch = rgb_thumbnail.shape bytes_per_line = ch * w qimage = QImage(rgb_thumbnail.data, w, h, bytes_per_line, QImage.Format_RGB888) else: h, w = thumbnail.shape bytes_per_line = w qimage = QImage(thumbnail.data, w, h, bytes_per_line, QImage.Format_Grayscale8) pixmap = QPixmap.fromImage(qimage) self.frame_thumbnail.setPixmap(pixmap) print(f"[VLM INSIGHTS DEBUG] Frame thumbnail updated successfully") except Exception as e: print(f"[VLM INSIGHTS DEBUG] Error updating thumbnail: {e}") def _quick_analyze(self, prompt): """Perform quick analysis with predefined prompt.""" print(f"[VLM INSIGHTS DEBUG] _quick_analyze called") print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}") print(f"[VLM INSIGHTS DEBUG] Detection data available: {self.detection_data is not None}") print(f"[VLM INSIGHTS DEBUG] Video paused: {self.is_video_paused}") print(f"[VLM INSIGHTS DEBUG] Prompt: {prompt[:50]}...") if self.current_frame is not None and self.is_video_paused: print(f"[VLM INSIGHTS DEBUG] Enhancing prompt with detection data") # Enhance prompt with detection data enhanced_prompt = self._enhance_prompt_with_detections(prompt) print(f"[VLM INSIGHTS DEBUG] Enhanced prompt length: {len(enhanced_prompt)} characters") print(f"[VLM INSIGHTS DEBUG] Emitting analyze_frame_requested signal") self.results_text.append(f"\nšŸ” Analyzing: {prompt[:50]}...") self.analyze_frame_requested.emit(self.current_frame, enhanced_prompt) print(f"[VLM INSIGHTS DEBUG] Signal emitted successfully") else: print(f"[VLM INSIGHTS DEBUG] Cannot analyze - frame: {self.current_frame is not None}, paused: {self.is_video_paused}") if self.current_frame is None: self.results_text.append("\nāŒ No frame available for analysis") if not self.is_video_paused: self.results_text.append("\nāŒ Video must be paused for analysis") def _enhance_prompt_with_detections(self, base_prompt): """Enhance the analysis prompt with detection data.""" try: enhanced_parts = [base_prompt] if self.detection_data and 'detections' in self.detection_data: detections = self.detection_data['detections'] enhanced_parts.append(f"\n\nDETECTION CONTEXT:") enhanced_parts.append(f"Total detections: {len(detections)}") # Categorize detections vehicles = [] traffic_lights = [] other_objects = [] for det in detections: if hasattr(det, 'class_name'): class_name = det.class_name bbox = getattr(det, 'bbox', None) track_id = getattr(det, 'track_id', None) confidence = getattr(det, 'confidence', None) elif isinstance(det, dict): class_name = det.get('class_name', det.get('label', 'unknown')) bbox = det.get('bbox', det.get('box', None)) track_id = det.get('track_id', det.get('id', None)) confidence = det.get('confidence', det.get('conf', None)) else: continue detection_info = { 'class': class_name, 'bbox': bbox, 'track_id': track_id, 'confidence': confidence } if class_name in ['car', 'truck', 'bus', 'motorcycle', 'vehicle']: vehicles.append(detection_info) elif 'traffic' in class_name.lower() or 'light' in class_name.lower(): traffic_lights.append(detection_info) else: other_objects.append(detection_info) # Add vehicle information if vehicles: enhanced_parts.append(f"\nVEHICLES ({len(vehicles)}):") for i, vehicle in enumerate(vehicles): track_info = f" (ID: {vehicle['track_id']})" if vehicle['track_id'] else "" conf_info = f" (conf: {vehicle['confidence']:.2f})" if vehicle['confidence'] else "" bbox_info = f" at {vehicle['bbox']}" if vehicle['bbox'] else "" enhanced_parts.append(f" - {vehicle['class']}{track_info}{conf_info}{bbox_info}") # Add traffic light information if traffic_lights: enhanced_parts.append(f"\nTRAFFIC LIGHTS ({len(traffic_lights)}):") for tl in traffic_lights: conf_info = f" (conf: {tl['confidence']:.2f})" if tl['confidence'] else "" bbox_info = f" at {tl['bbox']}" if tl['bbox'] else "" enhanced_parts.append(f" - {tl['class']}{conf_info}{bbox_info}") # Add other objects if other_objects: enhanced_parts.append(f"\nOTHER OBJECTS ({len(other_objects)}):") for obj in other_objects: conf_info = f" (conf: {obj['confidence']:.2f})" if obj['confidence'] else "" enhanced_parts.append(f" - {obj['class']}{conf_info}") # Add additional context from detection data if self.detection_data: fps = self.detection_data.get('fps', 0) if fps > 0: enhanced_parts.append(f"\nVIDEO INFO: FPS: {fps:.1f}") # Add crosswalk information crosswalk_detected = self.detection_data.get('crosswalk_detected', False) crosswalk_bbox = self.detection_data.get('crosswalk_bbox', None) violation_line_y = self.detection_data.get('violation_line_y', None) if crosswalk_detected: enhanced_parts.append(f"\nCROSSWALK INFO:") enhanced_parts.append(f" - Crosswalk detected: YES") if crosswalk_bbox: enhanced_parts.append(f" - Crosswalk location: {crosswalk_bbox}") if violation_line_y: enhanced_parts.append(f" - Violation line at y={violation_line_y}") else: enhanced_parts.append(f"\nCROSSWALK INFO:") enhanced_parts.append(f" - Crosswalk detected: NO") # Add traffic light information traffic_light = self.detection_data.get('traffic_light', {}) if traffic_light: color = traffic_light.get('color', 'unknown') confidence = traffic_light.get('confidence', 0) enhanced_parts.append(f"\nTRAFFIC LIGHT STATUS:") enhanced_parts.append(f" - Current color: {color.upper()}") enhanced_parts.append(f" - Confidence: {confidence:.2f}") # Special instructions for color analysis if "color" in base_prompt.lower() or "colour" in base_prompt.lower(): enhanced_parts.append(f"\nSPECIAL INSTRUCTIONS:") enhanced_parts.append(f"- Carefully examine the image for vehicle colors") enhanced_parts.append(f"- Look at each car's body color (red, blue, white, black, silver, etc.)") enhanced_parts.append(f"- Ignore detection data for color questions - analyze the image visually") enhanced_parts.append(f"- List the prominent colors you can identify in the vehicles") enhanced_parts.append(f"\nAnswer the question directly based on visual analysis of the image. Be concise and specific:") enhanced_parts.append(f"Question: {base_prompt}") enhanced_parts.append(f"Answer:") enhanced_prompt = "\n".join(enhanced_parts) print(f"[VLM INSIGHTS DEBUG] Enhanced prompt created: {len(enhanced_prompt)} chars") return enhanced_prompt except Exception as e: print(f"[VLM INSIGHTS DEBUG] Error enhancing prompt: {e}") return base_prompt def _analyze_custom(self): """Perform analysis with custom prompt.""" print(f"[VLM INSIGHTS DEBUG] _analyze_custom called") prompt = self.prompt_input.text().strip() print(f"[VLM INSIGHTS DEBUG] Custom prompt: '{prompt}'") print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}") print(f"[VLM INSIGHTS DEBUG] Detection data available: {self.detection_data is not None}") print(f"[VLM INSIGHTS DEBUG] Video paused: {self.is_video_paused}") if prompt and self.current_frame is not None and self.is_video_paused: print(f"[VLM INSIGHTS DEBUG] Enhancing custom prompt with detection data") # Enhance prompt with detection data enhanced_prompt = self._enhance_prompt_with_detections(prompt) print(f"[VLM INSIGHTS DEBUG] Emitting analyze_frame_requested signal for custom prompt") self.results_text.append(f"\n Question: {prompt}") self.analyze_frame_requested.emit(self.current_frame, enhanced_prompt) self.prompt_input.clear() print(f"[VLM INSIGHTS DEBUG] Custom analysis signal emitted successfully") else: print(f"[VLM INSIGHTS DEBUG] Cannot analyze custom - prompt: '{prompt}', frame: {self.current_frame is not None}, paused: {self.is_video_paused}") if not prompt: self.results_text.append("\nāŒ Please enter a prompt for analysis") elif self.current_frame is None: self.results_text.append("\nāŒ No frame available for analysis") elif not self.is_video_paused: self.results_text.append("\nāŒ Video must be paused for analysis") @Slot(object) def on_analysis_result(self, result): """Display analysis result.""" print(f"[VLM INSIGHTS DEBUG] Received analysis result: {type(result)}") print(f"[VLM INSIGHTS DEBUG] Result content: {str(result)[:200]}...") # Extract the actual response text from the result response_text = "" try: if isinstance(result, dict): print(f"[VLM INSIGHTS DEBUG] Result is dict with keys: {list(result.keys())}") # Check if it's the OpenVINO response format if 'response' in result: response_text = str(result['response']) print(f"[VLM INSIGHTS DEBUG] Extracted response from dict") elif 'message' in result: response_text = str(result['message']) print(f"[VLM INSIGHTS DEBUG] Extracted message from dict") else: response_text = str(result) print(f"[VLM INSIGHTS DEBUG] Using dict as string") elif isinstance(result, str): response_text = result print(f"[VLM INSIGHTS DEBUG] Result is already string") else: # Try to convert any other type to string response_text = str(result) print(f"[VLM INSIGHTS DEBUG] Converted to string") # Clean up the response text response_text = response_text.strip() if not response_text: response_text = "No response text found in result." except Exception as e: print(f"[VLM INSIGHTS DEBUG] Error extracting response: {e}") response_text = f"Error extracting response: {str(e)}" timestamp = datetime.now().strftime("%H:%M:%S") formatted_response = f" {response_text}" print(f"[VLM INSIGHTS DEBUG] Final response: {response_text[:100]}...") self.results_text.append(f" Answer: {formatted_response}\n") # Auto-scroll to bottom scrollbar = self.results_text.verticalScrollBar() scrollbar.setValue(scrollbar.maximum()) print(f"[VLM INSIGHTS DEBUG] Analysis result displayed successfully")