cleanup and files added
This commit is contained in:
29
qt_app_pyside1/vlm_backend/Dockerfile
Normal file
29
qt_app_pyside1/vlm_backend/Dockerfile
Normal file
@@ -0,0 +1,29 @@
|
||||
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1 \
    libglib2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code. model_downloader.py has to be part of the image:
# app.py imports it unconditionally at module load, so leaving it out makes
# the container exit with ModuleNotFoundError before the server starts.
COPY app.py utils.py model_downloader.py ./

# Environment variables
ENV VLM_MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
ENV LOCAL_EMBED_MODEL_ID="CLIP-ViT-H-14"
ENV PYTHONUNBUFFERED=1

# Expose the API port
EXPOSE 8399

# Run the application
CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "8399"]
|
||||
186
qt_app_pyside1/vlm_backend/app.py
Normal file
186
qt_app_pyside1/vlm_backend/app.py
Normal file
@@ -0,0 +1,186 @@
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from pathlib import Path
|
||||
from functools import cache
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import logging
|
||||
import datetime
|
||||
import re
|
||||
import string
|
||||
import unicodedata
|
||||
from PIL import Image
|
||||
import time
|
||||
import random
|
||||
from io import BytesIO
|
||||
import base64
|
||||
import requests
|
||||
import shutil
|
||||
import tempfile
|
||||
import copy
|
||||
import json
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
# Import utils for image processing
|
||||
from utils import image_to_url, generate_image_hash
|
||||
|
||||
# Optional: Import model downloader
|
||||
from model_downloader import download_vl_model
|
||||
|
||||
# Runtime configuration; each value can be overridden through the environment.
VLM_MODEL_NAME = os.environ.get("VLM_MODEL_NAME", "Qwen/Qwen2.5-VL-7B-Instruct")
LOCAL_EMBED_MODEL_ID = os.environ.get("LOCAL_EMBED_MODEL_ID", "CLIP-ViT-H-14")
MODEL_DIR = os.environ.get("MODEL_DIR", "./models")
# Only the string "true" (compared case-insensitively) enables auto-download.
DOWNLOAD_MODELS = os.environ.get("DOWNLOAD_MODELS", "True").lower() == "true"

# Logging: root configuration plus the logger used throughout this module.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format="[%(levelname)s] %(asctime)s.%(msecs)03d [%(name)s]: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger('vlm_backend')

# Flask application that the route decorators below attach to.
app = Flask(__name__)

# Model handles; both start out unset.
ov_model = None
tokenizer = None
|
||||
|
||||
# Load models
|
||||
def load_models():
    """Prepare the VLM model files and report whether that succeeded.

    When ``DOWNLOAD_MODELS`` is set, the model named by ``VLM_MODEL_NAME`` is
    fetched into ``MODEL_DIR`` through ``download_vl_model``. Loading the
    model itself is not implemented yet, so the ``ov_model`` and
    ``tokenizer`` globals are left untouched.

    Returns:
        True when no exception was raised, False otherwise (the error is
        logged, never propagated).
    """
    global ov_model, tokenizer

    logger.info(f"Loading VLM model: {VLM_MODEL_NAME}")

    try:
        if not DOWNLOAD_MODELS:
            logger.info("Using pre-downloaded model")
            # Similarly, you would load your pre-downloaded model here
        else:
            logger.info("Auto-downloading model is enabled")
            model_path = download_vl_model(VLM_MODEL_NAME, MODEL_DIR, "FP16")
            logger.info(f"Model downloaded to {model_path}")

            # In a real implementation, you would load the OpenVINO model here:
            # from optimum.intel.openvino import OVModelForCausalLM
            # from transformers import AutoTokenizer
            # tokenizer = AutoTokenizer.from_pretrained(model_path)
            # ov_model = OVModelForCausalLM.from_pretrained(model_path)

        logger.info("Models loaded successfully")
    except Exception as e:
        logger.error(f"Error loading models: {e}")
        return False
    else:
        return True
|
||||
|
||||
# Simulate VLM processing
|
||||
def process_image_query(image_data, query, task="vqa"):
    """
    Produce a simulated VLM answer for an image/query pair.

    The image is decoded and opened only so that its size can be logged; the
    answer text itself is a canned string that embeds the query.

    Args:
        image_data: Base64 encoded image
        query: Text query to process
        task: "vqa" selects the question-answering reply; any other value
            selects the search reply

    Returns:
        ``{"answer": ...}`` on success, or ``{"error": ...}`` carrying the
        exception message when anything in the function raises
    """
    try:
        # Decode the payload and open it as an image.
        raw_bytes = base64.b64decode(image_data)
        image = Image.open(BytesIO(raw_bytes))

        logger.info(f"Processing {task} request with query: {query}")
        logger.info(f"Image size: {image.size}")

        # Pick the canned reply for the requested task.
        if task != "vqa":
            answer = f"Search results for: '{query}'. Found 5 similar images (simulated results)."
        else:
            answer = f"This is a simulated VLM response for: '{query}'. In a real implementation, this would be generated by a Vision Language Model based on the image content."

        # Add a slight delay to simulate processing time
        time.sleep(1.5)

        return {"answer": answer}
    except Exception as e:
        logger.error(f"Error processing image: {str(e)}")
        return {"error": str(e)}
|
||||
|
||||
# API endpoints
|
||||
@app.route('/vqa', methods=['POST'])
def vqa_endpoint():
    """Handle ``POST /vqa``: answer a question about an image.

    Expects a JSON object with the keys ``image`` (passed on as the base64
    image payload) and ``query`` (the question text).

    Returns:
        * 400 with an ``error`` message when the body is missing, is not
          valid JSON, is not a JSON object, or lacks ``image``/``query``.
        * 500 with the exception message on an unexpected failure.
        * Otherwise the dictionary returned by ``process_image_query``.
          That function reports its own failures as an ``{"error": ...}``
          payload, which is passed through unchanged.
    """
    try:
        # silent=True yields None for a wrong Content-Type or malformed JSON
        # instead of raising an HTTP error, which the handler below would
        # otherwise misreport as a 500 although the client is at fault.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({"error": "No data provided"}), 400

        # A JSON array/string/number has no .get(); reject it explicitly.
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # Extract image and query
        image_data = data.get('image')
        query = data.get('query')

        if not image_data or not query:
            return jsonify({"error": "Image and query are required"}), 400

        # Process the request
        result = process_image_query(image_data, query, task="vqa")

        return jsonify(result)
    except Exception as e:
        logger.error(f"Error in VQA endpoint: {str(e)}")
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
@app.route('/search', methods=['POST'])
def search_endpoint():
    """Handle ``POST /search``: run a visual search for an image and query.

    Expects a JSON object with the keys ``image`` (passed on as the base64
    image payload) and ``query`` (the search text).

    Returns:
        * 400 with an ``error`` message when the body is missing, is not
          valid JSON, is not a JSON object, or lacks ``image``/``query``.
        * 500 with the exception message on an unexpected failure.
        * Otherwise the dictionary returned by ``process_image_query``.
          That function reports its own failures as an ``{"error": ...}``
          payload, which is passed through unchanged.
    """
    try:
        # silent=True yields None for a wrong Content-Type or malformed JSON
        # instead of raising an HTTP error, which the handler below would
        # otherwise misreport as a 500 although the client is at fault.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({"error": "No data provided"}), 400

        # A JSON array/string/number has no .get(); reject it explicitly.
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # Extract image and query
        image_data = data.get('image')
        query = data.get('query')

        if not image_data or not query:
            return jsonify({"error": "Image and query are required"}), 400

        # Process the request
        result = process_image_query(image_data, query, task="search")

        return jsonify(result)
    except Exception as e:
        logger.error(f"Error in search endpoint: {str(e)}")
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health_check():
    """Handle ``GET /health``: report a fixed "ok" status and the configured model name."""
    payload = {"status": "ok", "model": VLM_MODEL_NAME}
    return jsonify(payload)
|
||||
|
||||
if __name__ == '__main__':
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Vision Language Model Backend')
    parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to run the server on')
    parser.add_argument('--port', type=int, default=8399, help='Port to run the server on')
    parser.add_argument('--debug', action='store_true', help='Run in debug mode')
    args = parser.parse_args()

    # load_models() signals failure through its return value instead of
    # raising. Surface that explicitly rather than dropping it; the server
    # is still started afterwards, as it was before.
    if not load_models():
        logger.warning("Model loading failed; starting the server anyway")

    # Run the Flask application
    app.run(host=args.host, port=args.port, debug=args.debug)
|
||||
174
qt_app_pyside1/vlm_backend/model_downloader.py
Normal file
174
qt_app_pyside1/vlm_backend/model_downloader.py
Normal file
@@ -0,0 +1,174 @@
|
||||
import os
|
||||
import sys
|
||||
import logging
|
||||
import time
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
import shutil
|
||||
import requests
|
||||
import zipfile
|
||||
import io
|
||||
import tempfile
|
||||
from tqdm import tqdm
|
||||
|
||||
# Configure logging
|
||||
# Module logger plus root logging configuration (INFO level, timestamps with
# millisecond precision).
logger = logging.getLogger('model_downloader')
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format="[%(levelname)s] %(asctime)s.%(msecs)03d [%(name)s]: %(message)s",
    level=logging.INFO,
)
|
||||
|
||||
def download_file(url, output_path, chunk_size=8192, timeout=30):
    """
    Download a file from URL with progress bar.

    Args:
        url: URL to download from
        output_path: Path to save the file
        chunk_size: Size of chunks to download
        timeout: Seconds to wait for the connection and for each read before
            giving up (passed to ``requests.get``); it is not a limit on the
            total download time

    Returns:
        Path to the downloaded file

    Raises:
        Exception: Any error raised while requesting or writing the file is
            logged and re-raised, after the file at ``output_path`` (if one
            exists) has been removed.
    """
    try:
        # The context manager releases the streamed connection on every exit
        # path; the timeout keeps a stalled server from blocking forever.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Get file size for progress bar (0 when the header is absent)
            total_size = int(response.headers.get('content-length', 0))

            # Show download progress
            desc = f"Downloading {os.path.basename(url)}"
            with open(output_path, 'wb') as f, tqdm(
                desc=desc,
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        bar.update(len(chunk))

        return output_path
    except Exception as e:
        logger.error(f"Error downloading file: {str(e)}")
        # Do not leave a file behind at the target path after a failure.
        if os.path.exists(output_path):
            os.remove(output_path)
        # Bare raise keeps the original traceback intact.
        raise
|
||||
|
||||
def download_openvino_model(repo_id="ezelanza/llava-next-video-openvino", output_dir="./models"):
    """
    Download pre-converted OpenVINO model from Hugging Face.

    A fixed list of files is fetched one by one from
    ``https://huggingface.co/<repo_id>/resolve/main`` into
    ``<output_dir>/<model_id>_openvino_model``, where ``model_id`` is the last
    path component of ``repo_id`` with "-openvino" removed. If that directory
    already holds at least one ``.xml`` and one ``.bin`` file, nothing is
    downloaded. A failure on an individual file is logged and skipped.

    Args:
        repo_id: Hugging Face repository ID
        output_dir: Directory to save the downloaded model

    Returns:
        Path to the downloaded model directory

    Raises:
        RuntimeError: If the directory contains no ``.xml`` or no ``.bin``
            file after the download attempt, i.e. no model was obtained.
        Exception: Any other error (for example while writing
            ``model_info.json``) is logged and re-raised.
    """
    # Create a model-specific output directory
    model_id = repo_id.split('/')[-1].replace("-openvino", "")
    output_path = Path(output_dir) / f"{model_id}_openvino_model"
    output_path.mkdir(parents=True, exist_ok=True)

    # Check if model already exists (look for XML and BIN files)
    if list(output_path.glob("*.xml")) and list(output_path.glob("*.bin")):
        logger.info(f"OpenVINO model already exists at {output_path}")
        return output_path

    try:
        # Define files to download
        files_to_download = [
            "openvino_model.xml",
            "openvino_model.bin",
            "config.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "tokenizer.model"
        ]

        # Base URL for raw files on Hugging Face
        base_url = f"https://huggingface.co/{repo_id}/resolve/main"

        # Download each file
        for filename in files_to_download:
            file_url = f"{base_url}/{filename}"
            output_file = output_path / filename

            try:
                download_file(file_url, output_file)
                logger.info(f"Successfully downloaded {filename}")
            except Exception as e:
                # Continue with other files even if one fails
                logger.warning(f"Could not download {filename}: {str(e)}")

        # Skipping individual files is fine, but without the model files the
        # directory is unusable. Fail here instead of reporting success and
        # handing the caller a path that contains no model.
        if not (list(output_path.glob("*.xml")) and list(output_path.glob("*.bin"))):
            raise RuntimeError(
                f"No OpenVINO model files (.xml/.bin) could be downloaded from {repo_id}"
            )

        # Create a model info file
        model_info = {
            "model_name": repo_id,
            "download_date": time.strftime("%Y-%m-%d %H:%M:%S"),
            "files": [f.name for f in output_path.glob("*") if f.is_file()]
        }

        with open(output_path / "model_info.json", "w") as f:
            json.dump(model_info, f, indent=2)

        logger.info(f"Model files downloaded to {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise
|
||||
|
||||
def download_vl_model(model_name="ezelanza/llava-next-video-openvino", output_dir="./models", precision=None):
    """
    Fetch a Vision-Language model and log how long the download took.

    The actual work is delegated to ``download_openvino_model``; this wrapper
    makes sure ``output_dir`` exists and times the call.

    Args:
        model_name: Name of the model identifier on Hugging Face
        output_dir: Directory to save the model
        precision: Not used in this implementation but kept for API compatibility

    Returns:
        Path to the downloaded model
    """
    logger.info(f"Starting download of {model_name}...")
    started_at = time.time()

    # Make sure the target directory exists before downloading into it.
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    model_path = download_openvino_model(model_name, output_dir)

    total_time = time.time() - started_at
    logger.info(f"Completed in {total_time:.2f} seconds. Model saved at {model_path}")
    return model_path
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: download one model and list what was saved.
    cli = argparse.ArgumentParser(
        description="Download pre-converted Vision-Language models in OpenVINO format"
    )
    cli.add_argument(
        "--model_name",
        type=str,
        default="ezelanza/llava-next-video-openvino",
        help="Name of the model repository on Hugging Face Hub",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        default="./models",
        help="Directory to save the OpenVINO model",
    )
    args = cli.parse_args()

    try:
        model_path = download_vl_model(args.model_name, args.output_dir)
        print(f"\nModel successfully downloaded to: {model_path}")
        print("Files downloaded:")
        # One line per directory entry, with its size in MiB.
        for file in sorted(os.listdir(model_path)):
            size_mb = os.path.getsize(os.path.join(model_path, file)) / (1024 * 1024)
            print(f" - {file} ({size_mb:.2f} MB)")
    except Exception as e:
        # Any failure is reported on stdout and turned into exit status 1.
        print(f"\nError downloading model: {e}")
        sys.exit(1)
|
||||
12
qt_app_pyside1/vlm_backend/requirements.txt
Normal file
12
qt_app_pyside1/vlm_backend/requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
flask==3.0.0
|
||||
pillow==10.3.0
|
||||
numpy==1.26.4
|
||||
requests==2.32.3
|
||||
python-dotenv==1.1.0
|
||||
transformers==4.35.0
|
||||
torch==2.5.1
|
||||
openvino==2024.6.0
|
||||
optimum==1.16.0
|
||||
optimum-intel==1.12.0
|
||||
huggingface_hub==0.20.3
|
||||
tqdm==4.67.1
|
||||
126
qt_app_pyside1/vlm_backend/utils.py
Normal file
126
qt_app_pyside1/vlm_backend/utils.py
Normal file
@@ -0,0 +1,126 @@
|
||||
# Copyright 2018-2021 Streamlit Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import imghdr
|
||||
import io
|
||||
import mimetypes
|
||||
from typing import cast
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image, ImageFile
|
||||
import hashlib
|
||||
import base64
|
||||
|
||||
# Maximum content width
|
||||
MAXIMUM_CONTENT_WIDTH = 2 * 730  # = 1460
|
||||
|
||||
def _image_has_alpha_channel(image):
|
||||
"""Check if the image has an alpha channel."""
|
||||
if image.mode in ("RGBA", "LA") or (
|
||||
image.mode == "P" and "transparency" in image.info
|
||||
):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def _format_from_image_type(image, output_format):
|
||||
"""Determine the output format based on image type."""
|
||||
output_format = output_format.upper()
|
||||
if output_format == "JPEG" or output_format == "PNG":
|
||||
return output_format
|
||||
|
||||
# We are forgiving on the spelling of JPEG
|
||||
if output_format == "JPG":
|
||||
return "JPEG"
|
||||
|
||||
if _image_has_alpha_channel(image):
|
||||
return "PNG"
|
||||
|
||||
return "JPEG"
|
||||
|
||||
def _PIL_to_bytes(image, format="JPEG", quality=100):
|
||||
"""Convert PIL image to bytes."""
|
||||
tmp = io.BytesIO()
|
||||
|
||||
# User must have specified JPEG, so we must convert it
|
||||
if format == "JPEG" and _image_has_alpha_channel(image):
|
||||
image = image.convert("RGB")
|
||||
|
||||
image.save(tmp, format=format, quality=quality)
|
||||
|
||||
return tmp.getvalue()
|
||||
|
||||
def _BytesIO_to_bytes(data):
|
||||
"""Convert BytesIO to bytes."""
|
||||
data.seek(0)
|
||||
return data.getvalue()
|
||||
|
||||
def _normalize_to_bytes(data, width, output_format):
    """Normalize image data to bytes with proper format and size.

    Args:
        data: Encoded image bytes that PIL can open.
        width: Target width. A negative value keeps the original width
            unless it exceeds ``MAXIMUM_CONTENT_WIDTH``, in which case the
            image is scaled down to that maximum. A positive value scales the
            image down to that width when it is wider. Images are never
            enlarged.
        output_format: "auto", "jpeg", "jpg" or "png" (any letter case).

    Returns:
        A ``(data, mimetype)`` tuple. ``data`` is the input unchanged when no
        resize was needed, otherwise the re-encoded, resized image.
        ``mimetype`` is always a string.
    """
    image = Image.open(io.BytesIO(data))
    actual_width, actual_height = image.size
    target_format = _format_from_image_type(image, output_format)

    if output_format.lower() == "auto":
        ext = imghdr.what(None, data)
        mimetype = mimetypes.guess_type("image.%s" % ext)[0]
        if mimetype is None:
            # imghdr does not recognise every file PIL can open, and
            # mimetypes may not know the extension. Fall back to the format
            # PIL detected (then to the chosen output format) so callers
            # never receive None, which would yield a "data:None;..." URL.
            mimetype = Image.MIME.get(
                image.format or "", "image/" + target_format.lower()
            )
    else:
        mimetype = "image/" + target_format.lower()

    # A negative width means "keep as is", capped at the maximum width.
    if width < 0 and actual_width > MAXIMUM_CONTENT_WIDTH:
        width = MAXIMUM_CONTENT_WIDTH

    if width > 0 and actual_width > width:
        # Scale the height by the same factor to keep the aspect ratio.
        new_height = int(1.0 * actual_height * width / actual_width)
        image = image.resize((width, new_height), resample=Image.BILINEAR)
        data = _PIL_to_bytes(image, format=target_format, quality=90)
        # The bytes were re-encoded, so the mimetype follows the new format.
        mimetype = "image/" + target_format.lower()

    return data, mimetype
|
||||
|
||||
def generate_image_hash(image, mimetype):
    """Return the SHA-224 hex digest of the image bytes followed by the encoded mimetype."""
    digest = hashlib.sha224(image)
    digest.update(mimetype.encode())
    return digest.hexdigest()
|
||||
|
||||
def image_to_url(image, width=-1, output_format="auto"):
    """
    Build a base64 ``data:`` URL for an image.

    The image is first passed through ``_normalize_to_bytes``, which decides
    the mimetype and may resize the image.

    Args:
        image: The image data
        width: Target width (negative means preserve original if under max width)
        output_format: Output format (auto, jpeg, png)

    Returns:
        Data URL of the image
    """
    normalized, mimetype = _normalize_to_bytes(image, width, output_format)
    image_base64 = base64.b64encode(normalized).decode("utf-8")
    return f"data:{mimetype};base64,{image_base64}"
|
||||
|
||||
def video_to_url(video_data):
    """
    Build a base64 ``data:`` URL for video bytes.

    The mimetype is always ``video/mp4``; the data is not inspected.

    Args:
        video_data: The video data

    Returns:
        Data URL of the video
    """
    encoded_bytes = base64.b64encode(video_data)
    video_base64 = encoded_bytes.decode("utf-8")
    return f"data:video/mp4;base64,{video_base64}"
|
||||
Reference in New Issue
Block a user