from PySide6.QtWidgets import (
    QWidget, QVBoxLayout, QHBoxLayout, QLabel, QPushButton,
    QTextEdit, QLineEdit, QFrame, QGroupBox, QScrollArea
)
from PySide6.QtCore import Qt, Signal, Slot, QTimer
from PySide6.QtGui import QPixmap, QImage, QFont, QColor
import sys
import os
import cv2
import numpy as np
from pathlib import Path
from datetime import datetime

# Add parent directory to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils.annotation_utils import convert_cv_to_qimage, convert_cv_to_pixmap


class VLMInsightsWidget(QWidget):
    """Widget for Vision Language Model insights in settings panel."""

    analyze_frame_requested = Signal(np.ndarray, str)  # image, prompt

    def __init__(self):
        super().__init__()
        print("[VLM INSIGHTS DEBUG] Initializing VLM Insights Widget")
        self.current_frame = None
        self.detection_data = None  # Store detection data from video controller
        self.is_video_paused = False
        self.setupUI()
        self.setVisible(True)  # Make it visible by default in config panel
        print("[VLM INSIGHTS DEBUG] VLM Insights Widget initialized")

    def setupUI(self):
        """Set up the user interface."""
        main_layout = QVBoxLayout()
        main_layout.setSpacing(8)
        main_layout.setContentsMargins(10, 10, 10, 10)

        # === VLM Insights Group ===
        insights_group = QGroupBox(" Scene Analysis")
        insights_group.setStyleSheet("""
            QGroupBox {
                font-weight: bold;
                font-size: 14px;
                color: #00d4aa;
                border: 2px solid #00d4aa;
                border-radius: 8px;
                margin-top: 12px;
                padding-top: 10px;
            }
            QGroupBox::title {
                subcontrol-origin: margin;
                left: 10px;
                padding: 0 8px 0 8px;
                background-color: #2b2b2b;
            }
        """)
        insights_layout = QVBoxLayout()

        # Status label
        self.status_label = QLabel("📹 Pause video to analyze current frame")
        self.status_label.setStyleSheet("color: #888; font-style: italic; padding: 5px;")
        insights_layout.addWidget(self.status_label)

        # Current frame thumbnail
        self.frame_thumbnail = QLabel("No frame")
        self.frame_thumbnail.setAlignment(Qt.AlignCenter)
        self.frame_thumbnail.setFixedSize(150, 100)
        self.frame_thumbnail.setStyleSheet("""
            background-color: #1e1e1e;
            border: 1px solid #444;
            border-radius: 4px;
        """)
        insights_layout.addWidget(self.frame_thumbnail)

        # Custom prompt input
        prompt_layout = QHBoxLayout()
        self.prompt_input = QLineEdit()
        self.prompt_input.setPlaceholderText("Enter your question about the scene...")
        self.prompt_input.setStyleSheet("""
            QLineEdit {
                background-color: #1e1e1e;
                border: 1px solid #444;
                border-radius: 4px;
                padding: 8px;
                color: white;
                font-size: 12px;
            }
            QLineEdit:focus {
                border: 1px solid #00d4aa;
            }
        """)
        self.prompt_input.returnPressed.connect(self._analyze_custom)

        self.analyze_custom_btn = QPushButton(" Analyze")
        self.analyze_custom_btn.setStyleSheet(self._get_button_style())
        self.analyze_custom_btn.clicked.connect(self._analyze_custom)

        prompt_layout.addWidget(self.prompt_input)
        prompt_layout.addWidget(self.analyze_custom_btn)
        insights_layout.addLayout(prompt_layout)

        # Results area with scroll
        results_scroll = QScrollArea()
        results_scroll.setWidgetResizable(True)
        results_scroll.setMaximumHeight(200)
        results_scroll.setStyleSheet("""
            QScrollArea {
                background-color: #1e1e1e;
                border: 1px solid #444;
                border-radius: 4px;
            }
        """)
        self.results_text = QTextEdit()
        self.results_text.setReadOnly(True)
        self.results_text.setStyleSheet("""
            QTextEdit {
                background-color: #1e1e1e;
                border: none;
                color: #e0e0e0;
                font-size: 11px;
                padding: 8px;
            }
        """)
        self.results_text.setPlaceholderText("AI insights will appear here...")
        results_scroll.setWidget(self.results_text)
        insights_layout.addWidget(results_scroll)

        insights_group.setLayout(insights_layout)
        main_layout.addWidget(insights_group)
        main_layout.addStretch()
        self.setLayout(main_layout)

        # Initially disable analysis buttons
        self._set_analysis_enabled(False)

    def _get_button_style(self):
        """Get consistent button styling."""
        return """
            QPushButton {
                background-color: #00d4aa;
                color: #1a1a1a;
                border: none;
                border-radius: 4px;
                padding: 6px 12px;
                font-weight: bold;
                font-size: 11px;
            }
            QPushButton:hover {
                background-color: #00b89a;
            }
            QPushButton:pressed {
                background-color: #008f7a;
            }
            QPushButton:disabled {
                background-color: #444;
                color: #888;
            }
        """

    def _set_analysis_enabled(self, enabled):
        """Enable/disable the analysis controls (button and prompt input)."""
        self.analyze_custom_btn.setEnabled(enabled)
        self.prompt_input.setEnabled(enabled)

    @Slot(bool)
    def on_video_paused(self, is_paused):
        """Called when video is paused/unpaused."""
        print(f"[VLM INSIGHTS DEBUG] Video pause state changed: {is_paused}")
        print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}")
        self.is_video_paused = is_paused
        if is_paused:
            self.status_label.setText("✅ Video paused - Frame ready for analysis")
            self.status_label.setStyleSheet("color: #00d4aa; font-weight: bold; padding: 5px;")
            self._set_analysis_enabled(True)
            self.setVisible(True)
            print("[VLM INSIGHTS DEBUG] VLM insights widget made visible and enabled")
            # If we have a current frame, update the thumbnail immediately
            if self.current_frame is not None:
                print(f"[VLM INSIGHTS DEBUG] Updating thumbnail with current frame: {self.current_frame.shape}")
                self._update_thumbnail(self.current_frame)
            else:
                print("[VLM INSIGHTS DEBUG] No current frame available for thumbnail")
        else:
            self.status_label.setText("📹 Pause video to analyze current frame")
            self.status_label.setStyleSheet("color: #888; font-style: italic; padding: 5px;")
            self._set_analysis_enabled(False)
            print("[VLM INSIGHTS DEBUG] VLM insights widget disabled")

    @Slot(np.ndarray)
    def set_current_frame(self, frame):
        """Set the current frame for analysis."""
        print(f"[VLM INSIGHTS DEBUG] Received frame: {frame.shape if frame is not None else 'None'}")
        if frame is not None:
            self.current_frame = frame.copy()
            self._update_thumbnail(self.current_frame)

    def set_detection_data(self, detection_data):
        """Store detection data from the video controller for rich VLM analysis.

        The widget reads these keys when building prompts (all optional):
        'detections', 'fps', 'crosswalk_detected', 'crosswalk_bbox',
        'violation_line_y', and 'traffic_light'.
        """
        print("[VLM INSIGHTS DEBUG] Received detection data")
        print(f"[VLM INSIGHTS DEBUG] Data keys: {list(detection_data.keys()) if detection_data else 'None'}")
        self.detection_data = detection_data
        if detection_data and 'detections' in detection_data:
            detections = detection_data['detections']
            print(f"[VLM INSIGHTS DEBUG] Detections count: {len(detections)}")
            # Log the first few detections for debugging
            for i, det in enumerate(detections[:3]):
                if hasattr(det, '__dict__'):
                    print(f"[VLM INSIGHTS DEBUG] Detection {i}: {type(det)} - {getattr(det, 'class_name', 'unknown')}")
                elif isinstance(det, dict):
                    print(f"[VLM INSIGHTS DEBUG] Detection {i}: {det.get('class_name', det.get('label', 'unknown'))}")
                else:
                    print(f"[VLM INSIGHTS DEBUG] Detection {i}: {type(det)}")
            print("[VLM INSIGHTS DEBUG] Detection data stored successfully")

    def _update_thumbnail(self, frame):
        """Update the frame thumbnail display."""
        if frame is None:
            print("[VLM INSIGHTS DEBUG] Cannot update thumbnail - no frame provided")
            return
        try:
            h, w = frame.shape[:2]
            if h > 0 and w > 0:
                # Scale to fit the 150x100 thumbnail, preserving aspect ratio
                thumb_h, thumb_w = 100, 150
                scale = min(thumb_w / w, thumb_h / h)
                new_w, new_h = int(w * scale), int(h * scale)
                thumbnail = cv2.resize(frame, (new_w, new_h))
                # Convert to QImage; .copy() detaches the QImage from the numpy
                # buffer, which it would otherwise only borrow
                if len(thumbnail.shape) == 3:
                    rgb_thumbnail = cv2.cvtColor(thumbnail, cv2.COLOR_BGR2RGB)
                    h, w, ch = rgb_thumbnail.shape
                    bytes_per_line = ch * w
                    qimage = QImage(rgb_thumbnail.data, w, h, bytes_per_line, QImage.Format_RGB888).copy()
                else:
                    h, w = thumbnail.shape
                    bytes_per_line = w
                    qimage = QImage(thumbnail.data, w, h, bytes_per_line, QImage.Format_Grayscale8).copy()
                pixmap = QPixmap.fromImage(qimage)
                self.frame_thumbnail.setPixmap(pixmap)
                print("[VLM INSIGHTS DEBUG] Frame thumbnail updated successfully")
        except Exception as e:
            print(f"[VLM INSIGHTS DEBUG] Error updating thumbnail: {e}")

    def _quick_analyze(self, prompt):
        """Perform quick analysis with a predefined prompt."""
        print("[VLM INSIGHTS DEBUG] _quick_analyze called")
        print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}")
        print(f"[VLM INSIGHTS DEBUG] Detection data available: {self.detection_data is not None}")
        print(f"[VLM INSIGHTS DEBUG] Video paused: {self.is_video_paused}")
        print(f"[VLM INSIGHTS DEBUG] Prompt: {prompt[:50]}...")
        if self.current_frame is not None and self.is_video_paused:
            # Enhance prompt with detection data
            print("[VLM INSIGHTS DEBUG] Enhancing prompt with detection data")
            enhanced_prompt = self._enhance_prompt_with_detections(prompt)
            print(f"[VLM INSIGHTS DEBUG] Enhanced prompt length: {len(enhanced_prompt)} characters")
            print("[VLM INSIGHTS DEBUG] Emitting analyze_frame_requested signal")
            self.results_text.append(f"\n🔍 Analyzing: {prompt[:50]}...")
            self.analyze_frame_requested.emit(self.current_frame, enhanced_prompt)
            print("[VLM INSIGHTS DEBUG] Signal emitted successfully")
        else:
            print(f"[VLM INSIGHTS DEBUG] Cannot analyze - frame: {self.current_frame is not None}, paused: {self.is_video_paused}")
            if self.current_frame is None:
                self.results_text.append("\n❌ No frame available for analysis")
            if not self.is_video_paused:
                self.results_text.append("\n❌ Video must be paused for analysis")

    def _enhance_prompt_with_detections(self, base_prompt):
        """Enhance the analysis prompt with detection data."""
        try:
            enhanced_parts = [base_prompt]
            if self.detection_data and 'detections' in self.detection_data:
                detections = self.detection_data['detections']
                enhanced_parts.append("\n\nDETECTION CONTEXT:")
                enhanced_parts.append(f"Total detections: {len(detections)}")
                # Categorize detections
                vehicles = []
                traffic_lights = []
                other_objects = []
                for det in detections:
                    if hasattr(det, 'class_name'):
                        class_name = det.class_name
                        bbox = getattr(det, 'bbox', None)
                        track_id = getattr(det, 'track_id', None)
                        confidence = getattr(det, 'confidence', None)
                    elif isinstance(det, dict):
                        class_name = det.get('class_name', det.get('label', 'unknown'))
                        bbox = det.get('bbox', det.get('box', None))
                        track_id = det.get('track_id', det.get('id', None))
                        confidence = det.get('confidence', det.get('conf', None))
                    else:
                        continue
                    detection_info = {
                        'class': class_name,
                        'bbox': bbox,
                        'track_id': track_id,
                        'confidence': confidence
                    }
                    if class_name in ['car', 'truck', 'bus', 'motorcycle', 'vehicle']:
                        vehicles.append(detection_info)
                    elif 'traffic' in class_name.lower() or 'light' in class_name.lower():
                        traffic_lights.append(detection_info)
                    else:
                        other_objects.append(detection_info)

                # Add vehicle information ('is not None' so an ID or confidence
                # of 0 still shows, and array-valued bboxes don't raise)
                if vehicles:
                    enhanced_parts.append(f"\nVEHICLES ({len(vehicles)}):")
                    for vehicle in vehicles:
                        track_info = f" (ID: {vehicle['track_id']})" if vehicle['track_id'] is not None else ""
                        conf_info = f" (conf: {vehicle['confidence']:.2f})" if vehicle['confidence'] is not None else ""
                        bbox_info = f" at {vehicle['bbox']}" if vehicle['bbox'] is not None else ""
                        enhanced_parts.append(f" - {vehicle['class']}{track_info}{conf_info}{bbox_info}")

                # Add traffic light information
                if traffic_lights:
                    enhanced_parts.append(f"\nTRAFFIC LIGHTS ({len(traffic_lights)}):")
                    for tl in traffic_lights:
                        conf_info = f" (conf: {tl['confidence']:.2f})" if tl['confidence'] is not None else ""
                        bbox_info = f" at {tl['bbox']}" if tl['bbox'] is not None else ""
                        enhanced_parts.append(f" - {tl['class']}{conf_info}{bbox_info}")

                # Add other objects
                if other_objects:
                    enhanced_parts.append(f"\nOTHER OBJECTS ({len(other_objects)}):")
                    for obj in other_objects:
                        conf_info = f" (conf: {obj['confidence']:.2f})" if obj['confidence'] is not None else ""
                        enhanced_parts.append(f" - {obj['class']}{conf_info}")

            # Add additional context from detection data
            if self.detection_data:
                fps = self.detection_data.get('fps', 0)
                if fps > 0:
                    enhanced_parts.append(f"\nVIDEO INFO: FPS: {fps:.1f}")

                # Add crosswalk information
                crosswalk_detected = self.detection_data.get('crosswalk_detected', False)
                crosswalk_bbox = self.detection_data.get('crosswalk_bbox', None)
                violation_line_y = self.detection_data.get('violation_line_y', None)
                if crosswalk_detected:
                    enhanced_parts.append("\nCROSSWALK INFO:")
                    enhanced_parts.append(" - Crosswalk detected: YES")
                    if crosswalk_bbox is not None:
                        enhanced_parts.append(f" - Crosswalk location: {crosswalk_bbox}")
                    if violation_line_y is not None:
                        enhanced_parts.append(f" - Violation line at y={violation_line_y}")
                else:
                    enhanced_parts.append("\nCROSSWALK INFO:")
                    enhanced_parts.append(" - Crosswalk detected: NO")

                # Add traffic light status
                traffic_light = self.detection_data.get('traffic_light', {})
                if traffic_light:
                    color = traffic_light.get('color', 'unknown')
                    confidence = traffic_light.get('confidence', 0)
                    enhanced_parts.append("\nTRAFFIC LIGHT STATUS:")
                    enhanced_parts.append(f" - Current color: {color.upper()}")
                    enhanced_parts.append(f" - Confidence: {confidence:.2f}")

            # Special instructions for color analysis
            if "color" in base_prompt.lower() or "colour" in base_prompt.lower():
                enhanced_parts.append("\nSPECIAL INSTRUCTIONS:")
                enhanced_parts.append("- Carefully examine the image for vehicle colors")
                enhanced_parts.append("- Look at each car's body color (red, blue, white, black, silver, etc.)")
                enhanced_parts.append("- Ignore detection data for color questions - analyze the image visually")
                enhanced_parts.append("- List the prominent colors you can identify in the vehicles")

            enhanced_parts.append("\nAnswer the question directly based on visual analysis of the image. Be concise and specific:")
            enhanced_parts.append(f"Question: {base_prompt}")
            enhanced_parts.append("Answer:")

            enhanced_prompt = "\n".join(enhanced_parts)
            print(f"[VLM INSIGHTS DEBUG] Enhanced prompt created: {len(enhanced_prompt)} chars")
            return enhanced_prompt
        except Exception as e:
            print(f"[VLM INSIGHTS DEBUG] Error enhancing prompt: {e}")
            return base_prompt
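
    # Illustrative shape of an enhanced prompt produced by the method above
    # (values are made up; actual content depends on the supplied detection data):
    #
    #   What is happening at this intersection?
    #
    #   DETECTION CONTEXT:
    #   Total detections: 2
    #
    #   VEHICLES (1):
    #    - car (ID: 7) (conf: 0.91) at [104, 220, 310, 402]
    #
    #   TRAFFIC LIGHT STATUS:
    #    - Current color: RED
    #    - Confidence: 0.88
    #   ...
    #   Question: What is happening at this intersection?
    #   Answer: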

    def _analyze_custom(self):
        """Perform analysis with a custom prompt."""
        print("[VLM INSIGHTS DEBUG] _analyze_custom called")
        prompt = self.prompt_input.text().strip()
        print(f"[VLM INSIGHTS DEBUG] Custom prompt: '{prompt}'")
        print(f"[VLM INSIGHTS DEBUG] Current frame available: {self.current_frame is not None}")
        print(f"[VLM INSIGHTS DEBUG] Detection data available: {self.detection_data is not None}")
        print(f"[VLM INSIGHTS DEBUG] Video paused: {self.is_video_paused}")
        if prompt and self.current_frame is not None and self.is_video_paused:
            # Enhance prompt with detection data
            print("[VLM INSIGHTS DEBUG] Enhancing custom prompt with detection data")
            enhanced_prompt = self._enhance_prompt_with_detections(prompt)
            print("[VLM INSIGHTS DEBUG] Emitting analyze_frame_requested signal for custom prompt")
            self.results_text.append(f"\n Question: {prompt}")
            self.analyze_frame_requested.emit(self.current_frame, enhanced_prompt)
            self.prompt_input.clear()
            print("[VLM INSIGHTS DEBUG] Custom analysis signal emitted successfully")
        else:
            print(f"[VLM INSIGHTS DEBUG] Cannot analyze custom - prompt: '{prompt}', frame: {self.current_frame is not None}, paused: {self.is_video_paused}")
            if not prompt:
                self.results_text.append("\n❌ Please enter a prompt for analysis")
            elif self.current_frame is None:
                self.results_text.append("\n❌ No frame available for analysis")
            elif not self.is_video_paused:
                self.results_text.append("\n❌ Video must be paused for analysis")

    @Slot(object)
    def on_analysis_result(self, result):
        """Display analysis result."""
        print(f"[VLM INSIGHTS DEBUG] Received analysis result: {type(result)}")
        print(f"[VLM INSIGHTS DEBUG] Result content: {str(result)[:200]}...")
        # Extract the actual response text from the result
        response_text = ""
        try:
            if isinstance(result, dict):
                print(f"[VLM INSIGHTS DEBUG] Result is dict with keys: {list(result.keys())}")
                # Check for the OpenVINO response format first
                if 'response' in result:
                    response_text = str(result['response'])
                    print("[VLM INSIGHTS DEBUG] Extracted response from dict")
                elif 'message' in result:
                    response_text = str(result['message'])
                    print("[VLM INSIGHTS DEBUG] Extracted message from dict")
                else:
                    response_text = str(result)
                    print("[VLM INSIGHTS DEBUG] Using dict as string")
            elif isinstance(result, str):
                response_text = result
                print("[VLM INSIGHTS DEBUG] Result is already string")
            else:
                # Fall back to converting any other type to string
                response_text = str(result)
                print("[VLM INSIGHTS DEBUG] Converted to string")
            # Clean up the response text
            response_text = response_text.strip()
            if not response_text:
                response_text = "No response text found in result."
        except Exception as e:
            print(f"[VLM INSIGHTS DEBUG] Error extracting response: {e}")
            response_text = f"Error extracting response: {str(e)}"

        timestamp = datetime.now().strftime("%H:%M:%S")
        formatted_response = f"[{timestamp}] {response_text}"
        print(f"[VLM INSIGHTS DEBUG] Final response: {response_text[:100]}...")
        self.results_text.append(f" Answer: {formatted_response}\n")
        # Auto-scroll to bottom
        scrollbar = self.results_text.verticalScrollBar()
        scrollbar.setValue(scrollbar.maximum())
        print("[VLM INSIGHTS DEBUG] Analysis result displayed successfully")