cleanup and files added

2025-08-26 13:24:53 -07:00
parent a379d7a063
commit 51a14cd61c
8968 changed files with 1292619 additions and 0 deletions
--- a/qt_app_pyside1/vlm_backend/Dockerfile
+++ b/qt_app_pyside1/vlm_backend/Dockerfile
@@ -0,0 +1,29 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# Install system dependencies (compiler toolchain plus the shared libraries
+# listed below); the apt cache is removed in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    libgl1 \
+    libglib2.0-0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements on their own first so the dependency layer stays cached
+# when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code. app.py imports model_downloader at startup, so that
+# module has to be shipped too or the container exits with an import error.
+COPY app.py utils.py model_downloader.py ./
+
+# Environment variables
+ENV VLM_MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
+ENV LOCAL_EMBED_MODEL_ID="CLIP-ViT-H-14"
+ENV PYTHONUNBUFFERED=1
+
+# Expose the API port
+EXPOSE 8399
+
+# Run the application
+CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "8399"]
--- a/qt_app_pyside1/vlm_backend/app.py
+++ b/qt_app_pyside1/vlm_backend/app.py
@@ -0,0 +1,186 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from pathlib import Path
+from functools import cache
+import sys
+import os
+import argparse
+import logging
+import datetime
+import re
+import string
+import unicodedata
+from PIL import Image
+import time
+import random
+from io import BytesIO
+import base64
+import requests
+import shutil
+import tempfile
+import copy
+import json
+from flask import Flask, request, jsonify
+
+# Import utils for image processing
+from utils import image_to_url, generate_image_hash
+
+# Optional: Import model downloader
+from model_downloader import download_vl_model
+
+# Runtime configuration, read once at import time from environment variables.
+# Each setting falls back to the default shown when the variable is unset.
+VLM_MODEL_NAME = os.getenv("VLM_MODEL_NAME", "Qwen/Qwen2.5-VL-7B-Instruct")
+LOCAL_EMBED_MODEL_ID = os.getenv("LOCAL_EMBED_MODEL_ID", "CLIP-ViT-H-14")
+MODEL_DIR = os.getenv("MODEL_DIR", "./models")
+# Only the value "true" (compared case-insensitively) enables downloading;
+# any other string disables it.
+DOWNLOAD_MODELS = os.getenv("DOWNLOAD_MODELS", "True").lower() == "true"
+
+# Configure logging: INFO level, "[LEVEL] timestamp.millis [logger]: message".
+logger = logging.getLogger('vlm_backend')
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(levelname)s] %(asctime)s.%(msecs)03d [%(name)s]: %(message)s",
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+
+# Create Flask application
+app = Flask(__name__)
+
+# Global model handles. They start as None; load_models() declares them
+# global, but its actual loading code is currently commented out.
+ov_model = None
+tokenizer = None
+
+# Load models
+def load_models():
+    """
+    Prepare the models used by the backend.
+
+    When DOWNLOAD_MODELS is enabled the model named by VLM_MODEL_NAME is
+    fetched into MODEL_DIR through download_vl_model; otherwise a
+    pre-downloaded model is assumed. Actual model loading is still a
+    placeholder, so the module-level handles are left untouched.
+
+    Returns:
+        True when no exception was raised, False otherwise (the error is logged).
+    """
+    global ov_model, tokenizer
+
+    logger.info(f"Loading VLM model: {VLM_MODEL_NAME}")
+
+    try:
+        if not DOWNLOAD_MODELS:
+            # Placeholder: a pre-downloaded model would be loaded here.
+            logger.info("Using pre-downloaded model")
+        else:
+            logger.info("Auto-downloading model is enabled")
+            downloaded_to = download_vl_model(VLM_MODEL_NAME, MODEL_DIR, "FP16")
+            logger.info(f"Model downloaded to {downloaded_to}")
+
+            # Placeholder for the real OpenVINO loading step, e.g.:
+            #   from optimum.intel.openvino import OVModelForCausalLM
+            #   from transformers import AutoTokenizer
+            #   tokenizer = AutoTokenizer.from_pretrained(downloaded_to)
+            #   ov_model = OVModelForCausalLM.from_pretrained(downloaded_to)
+
+        logger.info("Models loaded successfully")
+        return True
+    except Exception as exc:
+        logger.error(f"Error loading models: {exc}")
+        return False
+
+# Simulate VLM processing
+def process_image_query(image_data, query, task="vqa"):
+    """
+    Produce a simulated VLM answer for an image/query pair.
+
+    The image is decoded and opened only to validate it and log its size;
+    the answer text is canned and merely echoes the query.
+
+    Args:
+        image_data: Base64 encoded image
+        query: Text query to process
+        task: "vqa" for question answering; any other value yields the
+            search-style answer
+
+    Returns:
+        {"answer": ...} on success, or {"error": ...} when anything fails
+    """
+    try:
+        # Decode and open the image so malformed input is rejected up front.
+        picture = Image.open(BytesIO(base64.b64decode(image_data)))
+
+        logger.info(f"Processing {task} request with query: {query}")
+        logger.info(f"Image size: {picture.size}")
+
+        if task != "vqa":
+            # Simulated visual-search answer
+            answer = f"Search results for: '{query}'. Found 5 similar images (simulated results)."
+        else:
+            # Simulated visual-question-answering answer
+            answer = f"This is a simulated VLM response for: '{query}'. In a real implementation, this would be generated by a Vision Language Model based on the image content."
+
+        # Pretend the model needed some time to respond.
+        time.sleep(1.5)
+
+        return {"answer": answer}
+    except Exception as exc:
+        logger.error(f"Error processing image: {str(exc)}")
+        return {"error": str(exc)}
+
+# API endpoints
+def _handle_image_query(task, label):
+    """
+    Shared request handling for the image + query endpoints.
+
+    Args:
+        task: Task name forwarded to process_image_query ("vqa" or "search")
+        label: Endpoint name used in the error log message
+
+    Returns:
+        A Flask JSON response: the processing result, a 400 response for a
+        missing/malformed body or missing fields, or a 500 response when an
+        unexpected exception occurs.
+    """
+    try:
+        # silent=True returns None for a missing, non-JSON or unparsable body
+        # instead of raising, so client mistakes are answered with 400 below
+        # rather than being caught by the generic handler and reported as 500.
+        data = request.get_json(silent=True)
+        if not data:
+            return jsonify({"error": "No data provided"}), 400
+
+        # Valid JSON that is not an object (e.g. a list) has no .get().
+        if not isinstance(data, dict):
+            return jsonify({"error": "Request body must be a JSON object"}), 400
+
+        # Extract image and query
+        image_data = data.get('image')
+        query = data.get('query')
+
+        if not image_data or not query:
+            return jsonify({"error": "Image and query are required"}), 400
+
+        # Process the request
+        result = process_image_query(image_data, query, task=task)
+
+        return jsonify(result)
+    except Exception as e:
+        logger.error(f"Error in {label} endpoint: {str(e)}")
+        return jsonify({"error": str(e)}), 500
+
+@app.route('/vqa', methods=['POST'])
+def vqa_endpoint():
+    """POST /vqa: answer a question about the supplied base64 image."""
+    return _handle_image_query("vqa", "VQA")
+
+@app.route('/search', methods=['POST'])
+def search_endpoint():
+    """POST /search: run a visual search for the supplied base64 image and query."""
+    return _handle_image_query("search", "search")
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    """GET /health: report a fixed "ok" status together with the configured model name."""
+    payload = {"status": "ok", "model": VLM_MODEL_NAME}
+    return jsonify(payload)
+
+if __name__ == '__main__':
+    # Command-line options: bind address, port and Flask debug mode.
+    cli = argparse.ArgumentParser(description='Vision Language Model Backend')
+    cli.add_argument('--host', type=str, default='0.0.0.0', help='Host to run the server on')
+    cli.add_argument('--port', type=int, default=8399, help='Port to run the server on')
+    cli.add_argument('--debug', action='store_true', help='Run in debug mode')
+    options = cli.parse_args()
+
+    # Prepare the models before serving; the result is not checked, so the
+    # server starts even when loading reported a failure.
+    load_models()
+
+    # Start the Flask development server.
+    app.run(host=options.host, port=options.port, debug=options.debug)
--- a/qt_app_pyside1/vlm_backend/model_downloader.py
+++ b/qt_app_pyside1/vlm_backend/model_downloader.py
@@ -0,0 +1,174 @@
+import os
+import sys
+import logging
+import time
+import argparse
+import json
+from pathlib import Path
+import subprocess
+import shutil
+import requests
+import zipfile
+import io
+import tempfile
+from tqdm import tqdm
+
+# Configure logging: INFO level, "[LEVEL] timestamp.millis [logger]: message".
+# NOTE: this runs at import time, so it also takes effect when this module is
+# imported by another program rather than executed as a script.
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(levelname)s] %(asctime)s.%(msecs)03d [%(name)s]: %(message)s",
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+# Logger shared by every function in this module.
+logger = logging.getLogger('model_downloader')
+
+def download_file(url, output_path, chunk_size=8192, timeout=30):
+    """
+    Download a file from URL with progress bar.
+
+    The payload is streamed into a temporary ``<output_path>.part`` file and
+    moved onto ``output_path`` only after the transfer has finished, so a
+    failed download neither leaves a truncated file behind nor deletes a
+    file that already existed at ``output_path``.
+
+    Args:
+        url: URL to download from
+        output_path: Path to save the file
+        chunk_size: Size of chunks to download
+        timeout: Seconds to wait for the server to connect or send data,
+            passed to ``requests.get``; ``None`` waits indefinitely
+
+    Returns:
+        Path to the downloaded file
+
+    Raises:
+        Exception: Any error raised while requesting or writing the file is
+            logged and re-raised unchanged.
+    """
+    partial_path = f"{output_path}.part"
+    try:
+        # The context manager releases the connection even when writing fails;
+        # without a timeout a stalled server would block this call forever.
+        with requests.get(url, stream=True, timeout=timeout) as response:
+            response.raise_for_status()
+
+            # Get file size for progress bar (0 when the header is absent)
+            total_size = int(response.headers.get('content-length', 0))
+
+            # Show download progress
+            desc = f"Downloading {os.path.basename(url)}"
+            with open(partial_path, 'wb') as f, tqdm(
+                desc=desc,
+                total=total_size or None,  # unknown size: plain counter, no bar
+                unit='B',
+                unit_scale=True,
+                unit_divisor=1024,
+            ) as bar:
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    if chunk:  # filter out keep-alive new chunks
+                        f.write(chunk)
+                        bar.update(len(chunk))
+
+        # Publish the finished file in one step.
+        os.replace(partial_path, output_path)
+        return output_path
+    except Exception as e:
+        logger.error(f"Error downloading file: {str(e)}")
+        if os.path.exists(partial_path):
+            os.remove(partial_path)
+        raise
+
+def download_openvino_model(repo_id="ezelanza/llava-next-video-openvino", output_dir="./models"):
+    """
+    Download pre-converted OpenVINO model from Hugging Face.
+
+    Individual files are fetched on a best-effort basis (a failed file is
+    logged and skipped), but the download as a whole only counts as
+    successful when the target directory ends up containing at least one
+    ``.xml`` and one ``.bin`` file -- the same condition used to decide that
+    a model is already present.
+
+    Args:
+        repo_id: Hugging Face repository ID
+        output_dir: Directory to save the downloaded model
+
+    Returns:
+        Path to the downloaded model directory
+
+    Raises:
+        RuntimeError: If no ``.xml``/``.bin`` model files are present after
+            all downloads were attempted.
+        Exception: Any other error is logged and re-raised unchanged.
+    """
+    # Create a model-specific output directory
+    model_id = repo_id.split('/')[-1].replace("-openvino", "")
+    output_path = Path(output_dir) / f"{model_id}_openvino_model"
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    def _has_model_files():
+        # A usable model needs both the XML and the BIN file.
+        return any(output_path.glob("*.xml")) and any(output_path.glob("*.bin"))
+
+    # Check if model already exists (look for XML and BIN files)
+    if _has_model_files():
+        logger.info(f"OpenVINO model already exists at {output_path}")
+        return output_path
+
+    try:
+        # Define files to download
+        files_to_download = [
+            "openvino_model.xml",
+            "openvino_model.bin",
+            "config.json",
+            "tokenizer_config.json",
+            "special_tokens_map.json",
+            "tokenizer.model"
+        ]
+
+        # Base URL for raw files on Hugging Face
+        base_url = f"https://huggingface.co/{repo_id}/resolve/main"
+
+        # Download each file; continue with the others even if one fails
+        for filename in files_to_download:
+            try:
+                download_file(f"{base_url}/{filename}", output_path / filename)
+            except Exception as e:
+                logger.warning(f"Could not download {filename}: {str(e)}")
+                continue
+            logger.info(f"Successfully downloaded {filename}")
+
+        # Do not report success (or write model_info.json) for a directory
+        # that holds no model at all.
+        if not _has_model_files():
+            raise RuntimeError(
+                f"No OpenVINO model files (.xml/.bin) could be downloaded from {repo_id}"
+            )
+
+        # Create a model info file
+        model_info = {
+            "model_name": repo_id,
+            "download_date": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "files": [f.name for f in output_path.glob("*") if f.is_file()]
+        }
+
+        with open(output_path / "model_info.json", "w", encoding="utf-8") as f:
+            json.dump(model_info, f, indent=2)
+
+        logger.info(f"Model files downloaded to {output_path}")
+        return output_path
+
+    except Exception as e:
+        logger.error(f"Error downloading model: {str(e)}")
+        raise
+
+def download_vl_model(model_name="ezelanza/llava-next-video-openvino", output_dir="./models", precision=None):
+    """
+    Fetch a Vision-Language model and report how long it took.
+
+    Args:
+        model_name: Hugging Face identifier of the model repository
+        output_dir: Directory that receives the model (created if missing)
+        precision: Accepted for API compatibility only; ignored here
+
+    Returns:
+        Path to the downloaded model
+    """
+    began_at = time.time()
+    logger.info(f"Starting download of {model_name}...")
+
+    # Make sure the destination directory exists before downloading into it.
+    destination = Path(output_dir)
+    destination.mkdir(parents=True, exist_ok=True)
+
+    model_path = download_openvino_model(model_name, output_dir)
+
+    elapsed = time.time() - began_at
+    logger.info(f"Completed in {elapsed:.2f} seconds. Model saved at {model_path}")
+
+    return model_path
+
+if __name__ == "__main__":
+    # Command-line entry point: download one model and list what was fetched.
+    cli = argparse.ArgumentParser(description="Download pre-converted Vision-Language models in OpenVINO format")
+    cli.add_argument("--model_name", type=str, default="ezelanza/llava-next-video-openvino",
+                     help="Name of the model repository on Hugging Face Hub")
+    cli.add_argument("--output_dir", type=str, default="./models",
+                     help="Directory to save the OpenVINO model")
+
+    options = cli.parse_args()
+
+    try:
+        model_path = download_vl_model(options.model_name, options.output_dir)
+        print(f"\nModel successfully downloaded to: {model_path}")
+        print("Files downloaded:")
+        # One line per directory entry with its size in MiB.
+        for entry in sorted(os.listdir(model_path)):
+            megabytes = os.path.getsize(os.path.join(model_path, entry)) / (1024 * 1024)
+            print(f"  - {entry} ({megabytes:.2f} MB)")
+    except Exception as exc:
+        # Any failure ends the script with a non-zero exit status.
+        print(f"\nError downloading model: {exc}")
+        sys.exit(1)
--- a/qt_app_pyside1/vlm_backend/requirements.txt
+++ b/qt_app_pyside1/vlm_backend/requirements.txt
@@ -0,0 +1,12 @@
+flask==3.0.0
+pillow==10.3.0
+numpy==1.26.4
+requests==2.32.3
+python-dotenv==1.1.0
+transformers==4.35.0
+torch==2.5.1
+openvino==2024.6.0
+optimum==1.16.0
+optimum-intel==1.12.0
+huggingface_hub==0.20.3
+tqdm==4.67.1
--- a/qt_app_pyside1/vlm_backend/utils.py
+++ b/qt_app_pyside1/vlm_backend/utils.py
@@ -0,0 +1,126 @@
+# Copyright 2018-2021 Streamlit Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import imghdr
+import io
+import mimetypes
+from typing import cast
+from urllib.parse import urlparse
+
+import numpy as np
+from PIL import Image, ImageFile
+import hashlib
+import base64
+
+# Maximum content width, in pixels. _normalize_to_bytes scales images wider
+# than this down to it when the caller passes a negative width.
+MAXIMUM_CONTENT_WIDTH = 1460  # 2 * 730
+
+def _image_has_alpha_channel(image):
+    """Return True when the image carries transparency.
+
+    That is the case for the "RGBA" and "LA" modes, and for palette ("P")
+    images whose info dictionary has a "transparency" entry.
+    """
+    if image.mode == "P":
+        return "transparency" in image.info
+    return image.mode in ("RGBA", "LA")
+
+def _format_from_image_type(image, output_format):
+    """Pick the encoder format ("JPEG" or "PNG") for an image.
+
+    An explicit "jpeg", "jpg" or "png" request (any letter case) is honoured;
+    for anything else the image decides: PNG when it has an alpha channel,
+    JPEG otherwise.
+    """
+    requested = output_format.upper()
+
+    if requested in ("JPEG", "PNG"):
+        return requested
+
+    # "JPG" is tolerated as a spelling of JPEG.
+    if requested == "JPG":
+        return "JPEG"
+
+    return "PNG" if _image_has_alpha_channel(image) else "JPEG"
+
+def _PIL_to_bytes(image, format="JPEG", quality=100):
+    """Encode a PIL image into bytes using the given format and quality."""
+    # JPEG cannot store transparency, so such images are converted to RGB first.
+    needs_rgb = format == "JPEG" and _image_has_alpha_channel(image)
+    source = image.convert("RGB") if needs_rgb else image
+
+    buffer = io.BytesIO()
+    source.save(buffer, format=format, quality=quality)
+    return buffer.getvalue()
+
+def _BytesIO_to_bytes(data):
+    """Rewind a BytesIO buffer and return its entire contents as bytes."""
+    data.seek(0)
+    contents = data.getvalue()
+    return contents
+
+def _normalize_to_bytes(data, width, output_format):
+    """Normalize image data to bytes with proper format and size.
+
+    Args:
+        data: Encoded image bytes
+        width: Target width in pixels; a negative value means "keep the
+            original width unless it exceeds MAXIMUM_CONTENT_WIDTH"
+        output_format: "auto", "jpeg"/"jpg" or "png"
+
+    Returns:
+        A ``(data, mimetype)`` tuple. ``data`` is the original bytes unless
+        the image had to be scaled down, in which case it is re-encoded.
+    """
+    image = Image.open(io.BytesIO(data))
+    actual_width, actual_height = image.size
+    target_format = _format_from_image_type(image, output_format)
+    if output_format.lower() == "auto":
+        # imghdr.what returns None for headers it does not recognise, which
+        # would end up as a None mimetype. Fall back to the format Pillow
+        # detected, then to the format chosen for encoding.
+        ext = imghdr.what(None, data)
+        mimetype = mimetypes.guess_type("image.%s" % ext)[0] if ext else None
+        if mimetype is None:
+            mimetype = Image.MIME.get(
+                image.format or "", "image/" + target_format.lower()
+            )
+    else:
+        mimetype = "image/" + target_format.lower()
+
+    if width < 0 and actual_width > MAXIMUM_CONTENT_WIDTH:
+        width = MAXIMUM_CONTENT_WIDTH
+
+    if width > 0 and actual_width > width:
+        # Keep the aspect ratio; clamp to 1 so extremely wide images do not
+        # produce a zero-height resize target.
+        new_height = max(1, int(1.0 * actual_height * width / actual_width))
+        image = image.resize((width, new_height), resample=Image.BILINEAR)
+        data = _PIL_to_bytes(image, format=target_format, quality=90)
+        mimetype = "image/" + target_format.lower()
+
+    return data, mimetype
+
+def generate_image_hash(image, mimetype):
+    """Return the SHA-224 hex digest of the image bytes followed by the encoded mimetype."""
+    digest = hashlib.sha224(image)
+    digest.update(mimetype.encode())
+    return digest.hexdigest()
+
+def image_to_url(image, width=-1, output_format="auto"):
+    """
+    Build a base64 data URL for an image.
+
+    Args:
+        image: The image data
+        width: Target width (negative means preserve original if under max width)
+        output_format: Output format (auto, jpeg, png)
+
+    Returns:
+        Data URL of the image
+    """
+    payload, mime = _normalize_to_bytes(image, width, output_format)
+    encoded = base64.b64encode(payload).decode("utf-8")
+    return "data:{};base64,{}".format(mime, encoded)
+
+def video_to_url(video_data):
+    """
+    Build a base64 data URL for video bytes; the MIME type is always video/mp4.
+
+    Args:
+        video_data: The video data
+
+    Returns:
+        Data URL of the video
+    """
+    encoded = base64.b64encode(video_data).decode("utf-8")
+    return "data:video/mp4;base64," + encoded