cleanup and files added

This commit is contained in:
2025-08-26 13:24:53 -07:00
parent a379d7a063
commit 51a14cd61c
8968 changed files with 1292619 additions and 0 deletions

View File

@@ -0,0 +1,29 @@
FROM python:3.10-slim
WORKDIR /app
# Install system dependencies, then drop the apt caches so they do not end up
# in the image layer.
# NOTE(review): libgl1 / libglib2.0-0 are presumably needed by an imaging
# dependency -- confirm they are still required.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1 \
    libglib2.0-0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached independently of
# application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code. app.py imports model_downloader at module import
# time, so it must be shipped alongside app.py and utils.py or the container
# fails on startup with ModuleNotFoundError.
COPY app.py utils.py model_downloader.py ./
# Environment variables (defaults read by app.py via os.getenv)
ENV VLM_MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
ENV LOCAL_EMBED_MODEL_ID="CLIP-ViT-H-14"
# Unbuffered stdout/stderr so log lines appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1
# Expose the API port
EXPOSE 8399
# Run the application
CMD ["python", "app.py", "--host", "0.0.0.0", "--port", "8399"]

View File

@@ -0,0 +1,186 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
from functools import cache
import sys
import os
import argparse
import logging
import datetime
import re
import string
import unicodedata
from PIL import Image
import time
import random
from io import BytesIO
import base64
import requests
import shutil
import tempfile
import copy
import json
from flask import Flask, request, jsonify
# Import utils for image processing
from utils import image_to_url, generate_image_hash
# Optional: Import model downloader
from model_downloader import download_vl_model
# Runtime configuration, read from environment variables once at import time.
# Model identifier handed to download_vl_model() and reported by /health.
VLM_MODEL_NAME = os.getenv("VLM_MODEL_NAME", "Qwen/Qwen2.5-VL-7B-Instruct")
# Embedding model identifier; not referenced by any other code visible in this module.
LOCAL_EMBED_MODEL_ID = os.getenv("LOCAL_EMBED_MODEL_ID", "CLIP-ViT-H-14")
# Directory passed to download_vl_model() as the download target.
MODEL_DIR = os.getenv("MODEL_DIR", "./models")
# Enabled only when the variable equals "true" (case-insensitive); enabled by default.
DOWNLOAD_MODELS = os.getenv("DOWNLOAD_MODELS", "True").lower() == "true"
# Configure logging: INFO level, timestamps with millisecond precision.
# NOTE(review): model_downloader (imported above) also calls
# logging.basicConfig at import time with the same format, so this call is
# likely a no-op on the already-configured root logger -- confirm.
logger = logging.getLogger('vlm_backend')
logging.basicConfig(
level=logging.INFO,
format="[%(levelname)s] %(asctime)s.%(msecs)03d [%(name)s]: %(message)s",
datefmt='%Y-%m-%d %H:%M:%S'
)
# Create Flask application; the route decorators below register on this object.
app = Flask(__name__)
# Global model variables. Declared `global` in load_models(), which currently
# never assigns them (the loading code there is commented out), so both stay None.
ov_model = None
tokenizer = None
# Load models
def load_models():
    """Prepare the VLM model files and report whether that succeeded.

    When ``DOWNLOAD_MODELS`` is true, the model named by ``VLM_MODEL_NAME`` is
    fetched into ``MODEL_DIR`` through ``download_vl_model``; otherwise the
    model is assumed to be present already. Actual model loading is still a
    placeholder, so the ``ov_model`` and ``tokenizer`` globals are not
    assigned here.

    Returns:
        True when no exception occurred, False otherwise (the error is logged).
    """
    global ov_model, tokenizer
    logger.info(f"Loading VLM model: {VLM_MODEL_NAME}")
    try:
        if not DOWNLOAD_MODELS:
            logger.info("Using pre-downloaded model")
            # Placeholder: the pre-downloaded model would be loaded here.
        else:
            logger.info("Auto-downloading model is enabled")
            downloaded_to = download_vl_model(VLM_MODEL_NAME, MODEL_DIR, "FP16")
            logger.info(f"Model downloaded to {downloaded_to}")
            # Placeholder: a real implementation would load the OpenVINO
            # model and tokenizer from `downloaded_to` at this point, e.g.
            # with optimum.intel.openvino.OVModelForCausalLM and
            # transformers.AutoTokenizer.
    except Exception as e:
        logger.error(f"Error loading models: {e}")
        return False
    else:
        logger.info("Models loaded successfully")
        return True
# Simulate VLM processing
def process_image_query(image_data, query, task="vqa"):
    """Produce a simulated VLM answer for an image and a text query.

    The image is decoded and opened only so that its size can be logged; the
    answer text is canned and does not depend on the image content.

    Args:
        image_data: Base64 encoded image.
        query: Text query to process.
        task: ``"vqa"`` for question answering; any other value is treated
            as a visual search request.

    Returns:
        ``{"answer": ...}`` on success, or ``{"error": ...}`` carrying the
        exception text when decoding or opening the image fails.
    """
    try:
        # Decode and open the image up front so invalid input fails early.
        picture = Image.open(BytesIO(base64.b64decode(image_data)))
        logger.info(f"Processing {task} request with query: {query}")
        logger.info(f"Image size: {picture.size}")
        if task != "vqa":
            # Simulated visual search result.
            answer = f"Search results for: '{query}'. Found 5 similar images (simulated results)."
        else:
            # Simulated visual question answering result.
            answer = f"This is a simulated VLM response for: '{query}'. In a real implementation, this would be generated by a Vision Language Model based on the image content."
        # Short pause standing in for model inference time.
        time.sleep(1.5)
        return {"answer": answer}
    except Exception as e:
        logger.error(f"Error processing image: {str(e)}")
        return {"error": str(e)}
# API endpoints
@app.route('/vqa', methods=['POST'])
def vqa_endpoint():
    """Handle ``POST /vqa``: answer a question about a submitted image.

    Expects a JSON object with ``image`` (base64 encoded image) and ``query``
    (text).

    Returns:
        The JSON result of ``process_image_query`` with status 200 on
        success. Status 400 with ``{"error": ...}`` when the body is missing,
        is not a JSON object, lacks a field, or the image cannot be
        processed. Status 500 on any unexpected exception.
    """
    try:
        # silent=True returns None for a missing/invalid JSON body or a wrong
        # content type, instead of raising an HTTP error that the broad
        # except below would misreport as a 500.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({"error": "No data provided"}), 400
        # A JSON array or scalar has no .get(); reject it as a client error.
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400
        image_data = data.get('image')
        query = data.get('query')
        if not image_data or not query:
            return jsonify({"error": "Image and query are required"}), 400
        result = process_image_query(image_data, query, task="vqa")
        # process_image_query reports failures (e.g. undecodable image data)
        # as {"error": ...}; do not return those with a success status.
        if "error" in result:
            return jsonify(result), 400
        return jsonify(result)
    except Exception as e:
        logger.error(f"Error in VQA endpoint: {str(e)}")
        return jsonify({"error": str(e)}), 500
@app.route('/search', methods=['POST'])
def search_endpoint():
    """Handle ``POST /search``: run a visual search for a submitted image.

    Expects a JSON object with ``image`` (base64 encoded image) and ``query``
    (text).

    Returns:
        The JSON result of ``process_image_query`` with status 200 on
        success. Status 400 with ``{"error": ...}`` when the body is missing,
        is not a JSON object, lacks a field, or the image cannot be
        processed. Status 500 on any unexpected exception.
    """
    try:
        # silent=True returns None for a missing/invalid JSON body or a wrong
        # content type, instead of raising an HTTP error that the broad
        # except below would misreport as a 500.
        data = request.get_json(silent=True)
        if not data:
            return jsonify({"error": "No data provided"}), 400
        # A JSON array or scalar has no .get(); reject it as a client error.
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400
        image_data = data.get('image')
        query = data.get('query')
        if not image_data or not query:
            return jsonify({"error": "Image and query are required"}), 400
        result = process_image_query(image_data, query, task="search")
        # process_image_query reports failures (e.g. undecodable image data)
        # as {"error": ...}; do not return those with a success status.
        if "error" in result:
            return jsonify(result), 400
        return jsonify(result)
    except Exception as e:
        logger.error(f"Error in search endpoint: {str(e)}")
        return jsonify({"error": str(e)}), 500
@app.route('/health', methods=['GET'])
def health_check():
    """Handle ``GET /health``: report a static status and the configured model name."""
    payload = {"status": "ok", "model": VLM_MODEL_NAME}
    return jsonify(payload)
if __name__ == '__main__':
    # Command-line options for the development server.
    cli = argparse.ArgumentParser(description='Vision Language Model Backend')
    cli.add_argument(
        '--host',
        type=str,
        default='0.0.0.0',
        help='Host to run the server on',
    )
    cli.add_argument(
        '--port',
        type=int,
        default=8399,
        help='Port to run the server on',
    )
    cli.add_argument(
        '--debug',
        action='store_true',
        help='Run in debug mode',
    )
    options = cli.parse_args()
    # Prepare the model before serving. The boolean result is not inspected,
    # so the server starts even when loading reported a failure.
    load_models()
    # Start Flask's built-in server with the requested settings.
    app.run(host=options.host, port=options.port, debug=options.debug)

View File

@@ -0,0 +1,174 @@
import os
import sys
import logging
import time
import argparse
import json
from pathlib import Path
import subprocess
import shutil
import requests
import zipfile
import io
import tempfile
from tqdm import tqdm
# Configure logging: INFO level, timestamps with millisecond precision.
# This runs at import time, so importing this module from another program
# also configures the root logger (when it has no handlers yet).
logging.basicConfig(
level=logging.INFO,
format="[%(levelname)s] %(asctime)s.%(msecs)03d [%(name)s]: %(message)s",
datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by every function in this file.
logger = logging.getLogger('model_downloader')
def download_file(url, output_path, chunk_size=8192, timeout=30):
    """Download a file from a URL to disk, showing a progress bar.

    Args:
        url: URL to download from.
        output_path: Path to save the file.
        chunk_size: Size in bytes of the chunks read from the response.
        timeout: Seconds to wait for the connection and for each read before
            giving up (passed to ``requests.get``). Without a timeout a
            stalled server would block the download forever.

    Returns:
        Path to the downloaded file (``output_path`` as given).

    Raises:
        Exception: Any error from the request or from writing the file is
            logged and re-raised. A partially written file is removed first.
    """
    # Tracks whether this call created/truncated the file, so that a failure
    # before writing (e.g. a 404) never deletes a pre-existing file.
    file_opened = False
    try:
        # The context manager releases the streamed connection back to the
        # pool even when the download is interrupted.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            # Content-Length may be absent (e.g. chunked transfer encoding);
            # tqdm shows a plain byte counter when total is None.
            total_size = int(response.headers.get('content-length', 0))
            desc = f"Downloading {os.path.basename(url)}"
            with open(output_path, 'wb') as f:
                file_opened = True
                with tqdm(
                    desc=desc,
                    total=total_size or None,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as bar:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            bar.update(len(chunk))
        return output_path
    except Exception as e:
        logger.error(f"Error downloading file: {str(e)}")
        # Do not leave a truncated file behind that could be mistaken for a
        # complete download.
        if file_opened and os.path.exists(output_path):
            os.remove(output_path)
        raise
def download_openvino_model(repo_id="ezelanza/llava-next-video-openvino", output_dir="./models"):
    """Download a pre-converted OpenVINO model from Hugging Face.

    Files are fetched one by one from the repository's ``main`` branch into
    ``<output_dir>/<model_id>_openvino_model``. Auxiliary files (config and
    tokenizer files) are best-effort: a failed download is logged and
    skipped. The model XML and BIN files are mandatory.

    Args:
        repo_id: Hugging Face repository ID.
        output_dir: Directory to save the downloaded model.

    Returns:
        Path to the downloaded model directory.

    Raises:
        FileNotFoundError: If the model XML or BIN file could not be
            downloaded, so the directory does not contain a usable model.
        Exception: Any other error while writing the model info file is
            logged and re-raised.
    """
    # Create a model-specific output directory
    model_id = repo_id.split('/')[-1].replace("-openvino", "")
    output_path = Path(output_dir) / f"{model_id}_openvino_model"
    output_path.mkdir(parents=True, exist_ok=True)
    # Treat the model as present when at least one XML and one BIN file exist.
    if any(output_path.glob("*.xml")) and any(output_path.glob("*.bin")):
        logger.info(f"OpenVINO model already exists at {output_path}")
        return output_path
    # Without these two files there is no model to load.
    required_files = ("openvino_model.xml", "openvino_model.bin")
    optional_files = (
        "config.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "tokenizer.model",
    )
    # Base URL for raw files on Hugging Face
    base_url = f"https://huggingface.co/{repo_id}/resolve/main"
    try:
        for filename in (*required_files, *optional_files):
            file_url = f"{base_url}/{filename}"
            output_file = output_path / filename
            try:
                download_file(file_url, output_file)
                logger.info(f"Successfully downloaded {filename}")
            except Exception as e:
                # Keep going: not every repository ships every auxiliary file.
                logger.warning(f"Could not download {filename}: {str(e)}")
                continue
        # Previously a run in which every download failed still returned the
        # (empty) directory as if the model had been fetched.
        missing = [name for name in required_files if not (output_path / name).is_file()]
        if missing:
            raise FileNotFoundError(
                f"Required model files could not be downloaded from {repo_id}: {', '.join(missing)}"
            )
        # Record what was downloaded and when.
        model_info = {
            "model_name": repo_id,
            "download_date": time.strftime("%Y-%m-%d %H:%M:%S"),
            "files": [f.name for f in output_path.glob("*") if f.is_file()]
        }
        with open(output_path / "model_info.json", "w") as f:
            json.dump(model_info, f, indent=2)
        logger.info(f"Model files downloaded to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        raise
def download_vl_model(model_name="ezelanza/llava-next-video-openvino", output_dir="./models", precision=None):
    """Fetch a Vision-Language model and log how long the download took.

    Args:
        model_name: Name of the model identifier on Hugging Face.
        output_dir: Directory to save the model; created if missing.
        precision: Ignored; accepted only for API compatibility.

    Returns:
        Path to the downloaded model, as returned by
        ``download_openvino_model``.
    """
    began_at = time.time()
    logger.info(f"Starting download of {model_name}...")
    # Make sure the target directory exists before delegating the download.
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)
    model_path = download_openvino_model(model_name, output_dir)
    elapsed = time.time() - began_at
    logger.info(f"Completed in {elapsed:.2f} seconds. Model saved at {model_path}")
    return model_path
if __name__ == "__main__":
    # Command-line entry point: download one model and list the fetched files.
    cli = argparse.ArgumentParser(
        description="Download pre-converted Vision-Language models in OpenVINO format"
    )
    cli.add_argument(
        "--model_name",
        type=str,
        default="ezelanza/llava-next-video-openvino",
        help="Name of the model repository on Hugging Face Hub",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        default="./models",
        help="Directory to save the OpenVINO model",
    )
    options = cli.parse_args()
    try:
        model_path = download_vl_model(options.model_name, options.output_dir)
        print(f"\nModel successfully downloaded to: {model_path}")
        print("Files downloaded:")
        # Report every directory entry with its size in mebibytes.
        for entry in sorted(os.listdir(model_path)):
            megabytes = os.path.getsize(os.path.join(model_path, entry)) / (1024 * 1024)
            print(f" - {entry} ({megabytes:.2f} MB)")
    except Exception as e:
        # Any failure ends the script with a non-zero exit status.
        print(f"\nError downloading model: {e}")
        sys.exit(1)

View File

@@ -0,0 +1,12 @@
flask==3.0.0
pillow==10.3.0
numpy==1.26.4
requests==2.32.3
python-dotenv==1.1.0
transformers==4.35.0
torch==2.5.1
openvino==2024.6.0
optimum==1.16.0
optimum-intel==1.12.0
huggingface_hub==0.20.3
tqdm==4.67.1

View File

@@ -0,0 +1,126 @@
# Copyright 2018-2021 Streamlit Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import imghdr
import io
import mimetypes
from typing import cast
from urllib.parse import urlparse
import numpy as np
from PIL import Image, ImageFile
import hashlib
import base64
# Maximum content width: when the caller passes a negative width,
# _normalize_to_bytes downscales images wider than this to this width.
MAXIMUM_CONTENT_WIDTH = 1460 # 2 * 730
def _image_has_alpha_channel(image):
"""Check if the image has an alpha channel."""
if image.mode in ("RGBA", "LA") or (
image.mode == "P" and "transparency" in image.info
):
return True
else:
return False
def _format_from_image_type(image, output_format):
"""Determine the output format based on image type."""
output_format = output_format.upper()
if output_format == "JPEG" or output_format == "PNG":
return output_format
# We are forgiving on the spelling of JPEG
if output_format == "JPG":
return "JPEG"
if _image_has_alpha_channel(image):
return "PNG"
return "JPEG"
def _PIL_to_bytes(image, format="JPEG", quality=100):
"""Convert PIL image to bytes."""
tmp = io.BytesIO()
# User must have specified JPEG, so we must convert it
if format == "JPEG" and _image_has_alpha_channel(image):
image = image.convert("RGB")
image.save(tmp, format=format, quality=quality)
return tmp.getvalue()
def _BytesIO_to_bytes(data):
"""Convert BytesIO to bytes."""
data.seek(0)
return data.getvalue()
def _normalize_to_bytes(data, width, output_format):
    """Return image bytes and a MIME type, downscaling the image if needed.

    Args:
        data: Encoded image bytes in a format Pillow can open.
        width: Maximum width. A negative value means "keep the original
            width unless it exceeds ``MAXIMUM_CONTENT_WIDTH``".
        output_format: ``"auto"``, ``"jpeg"``/``"jpg"`` or ``"png"`` (any
            letter case).

    Returns:
        A ``(bytes, mimetype)`` tuple. The bytes are the input unchanged
        unless the image had to be downscaled, in which case it is
        re-encoded in the selected format at quality 90.
    """
    image = Image.open(io.BytesIO(data))
    actual_width, actual_height = image.size
    target_format = _format_from_image_type(image, output_format)
    if output_format.lower() == "auto":
        ext = imghdr.what(None, data)
        mimetype = mimetypes.guess_type("image.%s" % ext)[0]
        if mimetype is None:
            # imghdr does not recognise every format Pillow can open. Fall
            # back to Pillow's own detection so callers never receive None,
            # which would break the hash and data-URL builders.
            mimetype = Image.MIME.get(image.format or "", "application/octet-stream")
    else:
        # NOTE(review): when no resize happens the input bytes are returned
        # as-is, so this label can disagree with the actual encoding of
        # `data` -- confirm callers pre-convert to the requested format.
        mimetype = "image/" + target_format.lower()
    if width < 0 and actual_width > MAXIMUM_CONTENT_WIDTH:
        width = MAXIMUM_CONTENT_WIDTH
    if width > 0 and actual_width > width:
        # Keep the aspect ratio. Clamp to 1 so an extremely wide, short
        # image does not produce a zero height, which resize() rejects.
        new_height = max(1, int(1.0 * actual_height * width / actual_width))
        image = image.resize((width, new_height), resample=Image.BILINEAR)
        data = _PIL_to_bytes(image, format=target_format, quality=90)
        mimetype = "image/" + target_format.lower()
    return data, mimetype
def generate_image_hash(image, mimetype):
    """Return the SHA-224 hex digest of the image bytes followed by its MIME type."""
    digest = hashlib.sha224(image)
    digest.update(mimetype.encode())
    return digest.hexdigest()
def image_to_url(image, width=-1, output_format="auto"):
    """Build a base64 ``data:`` URL for an image.

    Args:
        image: The encoded image bytes.
        width: Target width (negative means preserve the original width if
            it is under the maximum content width).
        output_format: Output format (auto, jpeg, png).

    Returns:
        Data URL of the image, using the bytes and MIME type produced by
        ``_normalize_to_bytes``.
    """
    payload, mime = _normalize_to_bytes(image, width, output_format)
    encoded = base64.b64encode(payload).decode("utf-8")
    return "data:{};base64,{}".format(mime, encoded)
def video_to_url(video_data):
    """Build a base64 ``data:`` URL for video bytes.

    Args:
        video_data: The raw video bytes.

    Returns:
        Data URL of the video; the MIME type is always ``video/mp4``.
    """
    encoded = base64.b64encode(video_data).decode("utf-8")
    return "data:video/mp4;base64," + encoded