From faeb7d16f662ab4246d1380a35fc7aa004114189 Mon Sep 17 00:00:00 2001
From: maybleMyers <benjimon678@yahoo.com>
Date: Mon, 8 Dec 2025 17:37:59 -0800
Subject: [PATCH] update reqs for vlm

---
 requirements_vlm.txt |  47 +++++
 vlm.py               | 428 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 413 insertions(+), 62 deletions(-)
 create mode 100644 requirements_vlm.txt

diff --git a/requirements_vlm.txt b/requirements_vlm.txt
new file mode 100644
index 00000000..d9ec6dbe
--- /dev/null
+++ b/requirements_vlm.txt
@@ -0,0 +1,47 @@
+# Requirements for vlm.py - Qwen3-VL Chat Interface
+# Use a separate virtual environment to avoid conflicts with main Forge app
+#
+# Setup:
+#   python -m venv venv_vlm
+#   venv_vlm\Scripts\activate  (Windows)
+#   source venv_vlm/bin/activate  (Linux/Mac)
+#   pip install -r requirements_vlm.txt
+#
+# Run:
+#   python vlm.py
+
+# PyTorch - install first with CUDA support
+# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
+torch>=2.4.0
+torchvision
+
+# vLLM for high-performance inference
+vllm>=0.11.0
+
+# Qwen VL utilities
+qwen-vl-utils>=0.0.14
+
+# Transformers (fallback backend)
+transformers>=4.51.0
+accelerate
+safetensors
+
+# Gradio UI
+gradio>=5.0.0
+gradio-client
+
+# Image/Video processing
+Pillow>=10.0.0
+opencv-python
+
+# Other dependencies
+numpy
+tqdm
+pydantic>=2.0.0
+huggingface-hub>=0.20.0
+
+# Optional: Flash Attention 2 (for faster inference)
+# pip install flash-attn --no-build-isolation
+
+# Optional: bitsandbytes for quantization (transformers backend)
+# pip install bitsandbytes
diff --git a/vlm.py b/vlm.py
index 5749d776..43cee258 100644
--- a/vlm.py
+++ b/vlm.py
@@ -26,6 +26,27 @@ except ImportError:
     CV2_AVAILABLE = False
     print("Warning: opencv-python not installed. Video support will be limited.")
 
+# Try to import vLLM for high-performance inference
+try:
+    from vllm import LLM, SamplingParams
+    VLLM_AVAILABLE = True
+    print("vLLM loaded successfully")
+except ImportError as e:
+    VLLM_AVAILABLE = False
+    print(f"Warning: vLLM not available. Install with: pip install vllm>=0.11.0")
+    print(f"  Import error: {e}")
+except Exception as e:
+    VLLM_AVAILABLE = False
+    print(f"Warning: vLLM import failed: {e}")
+
+# Try to import qwen-vl-utils for image processing
+try:
+    from qwen_vl_utils import process_vision_info
+    QWEN_VL_UTILS_AVAILABLE = True
+except ImportError:
+    QWEN_VL_UTILS_AVAILABLE = False
+    print("Warning: qwen-vl-utils not installed. Install with: pip install qwen-vl-utils")
+
 # Default model paths (relative to models/LLM)
 DEFAULT_MODELS = {
     "Qwen3-VL-8B-Caption-V4.5": "models/LLM/Qwen3-VL-8B-Caption-V4.5",
@@ -82,13 +103,36 @@ def extract_video_frames(video_path: str, max_frames: int = 8, target_size: Tupl
 class VLMManager:
     """Manages Qwen VL model loading, inference, and memory."""
 
-    def __init__(self, low_vram: bool = False):
+    def __init__(self, low_vram: bool = False, backend: str = "auto"):
+        """
+        Initialize VLM Manager.
+
+        Args:
+            low_vram: Enable low VRAM mode for transformers backend
+            backend: "vllm", "transformers", or "auto" (vLLM if available, else transformers)
+        """
         self.model = None
         self.processor = None
         self.model_name = None
         self.low_vram = low_vram
         self.device = self._get_device()
 
+        # vLLM specific attributes
+        self.vllm_model = None
+        self.model_path = None
+
+        # Determine backend
+        if backend == "auto":
+            self.backend = "vllm" if VLLM_AVAILABLE else "transformers"
+        else:
+            self.backend = backend
+
+        if self.backend == "vllm" and not VLLM_AVAILABLE:
+            print("Warning: vLLM requested but not available. Falling back to transformers.")
+            self.backend = "transformers"
+
+        print(f"VLM Backend: {self.backend}")
+
     def _get_device(self) -> torch.device:
         """Get the best available device."""
         if torch.cuda.is_available():
@@ -148,6 +192,77 @@ class VLMManager:
 
         return "qwen2_5_vl"  # Default fallback
 
+    def _load_with_vllm(self, model_name: str, quantization: str = "none", progress=gr.Progress()) -> str:
+        """Load model using vLLM backend for high-performance inference."""
+        if not VLLM_AVAILABLE:
+            return "vLLM is not available. Please install with: pip install vllm>=0.11.0"
+
+        # Check if already loaded
+        if self.vllm_model is not None and self.model_name == model_name:
+            return f"Model '{model_name}' is already loaded (vLLM)."
+
+        # Unload existing model first
+        if self.vllm_model is not None:
+            self.unload_model()
+
+        progress(0.1, desc="Loading model with vLLM...")
+
+        # Determine model path
+        if model_name in DEFAULT_MODELS:
+            model_path = DEFAULT_MODELS[model_name]
+        else:
+            model_path = f"models/LLM/{model_name}"
+
+        if not Path(model_path).exists():
+            return f"Model path not found: {model_path}"
+
+        try:
+            # Detect model type
+            model_type = self._detect_model_type(model_path)
+            print(f"Detected model type: {model_type}")
+
+            progress(0.3, desc="Initializing vLLM engine...")
+
+            # Configure vLLM loading options
+            vllm_kwargs = {
+                "model": model_path,
+                "trust_remote_code": True,
+                "dtype": "bfloat16",
+                "max_model_len": 4096,  # Adjust based on your VRAM
+                "gpu_memory_utilization": 0.9,
+            }
+
+            # Handle quantization
+            if quantization == "4bit":
+                vllm_kwargs["quantization"] = "awq"  # or "gptq" depending on model
+                print("Using AWQ 4-bit quantization with vLLM")
+            elif quantization == "8bit":
+                vllm_kwargs["quantization"] = "fp8"
+                print("Using FP8 quantization with vLLM")
+
+            # Enable multimodal for VL models
+            vllm_kwargs["limit_mm_per_prompt"] = {"image": 10, "video": 2}
+
+            progress(0.5, desc=f"Loading {model_type} with vLLM...")
+
+            self.vllm_model = LLM(**vllm_kwargs)
+            self.model_path = model_path
+            self.model_name = model_name
+
+            # Also load processor for chat template
+            from transformers import AutoProcessor
+            self.processor = AutoProcessor.from_pretrained(model_path)
+
+            progress(1.0, desc="Model loaded with vLLM!")
+
+            quant_info = f", {quantization}" if quantization != "none" else ""
+            return f"Successfully loaded '{model_name}' with vLLM ({model_type}{quant_info})"
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return f"Failed to load model with vLLM: {str(e)}"
+
     def load_model(self, model_name: str, quantization: str = "none", use_flash_attn: bool = False, vram_buffer: int = 0, progress=gr.Progress()) -> str:
         """Load a Qwen VL model.
 
@@ -161,6 +276,10 @@ class VLMManager:
         if model_name == "No models found":
             return "No models available. Please download a model first."
 
+        # Use vLLM backend if selected
+        if self.backend == "vllm":
+            return self._load_with_vllm(model_name, quantization, progress)
+
         # Check if already loaded
         if self.model is not None and self.model_name == model_name:
             return f"Model '{model_name}' is already loaded."
@@ -287,22 +406,36 @@ class VLMManager:
 
     def unload_model(self) -> str:
         """Unload the current model to free memory."""
-        if self.model is None:
+        # Check if any model is loaded (either vLLM or transformers)
+        if self.model is None and self.vllm_model is None:
             return "No model is currently loaded."
 
         model_name = self.model_name
+        backend_used = "vLLM" if self.vllm_model is not None else "transformers"
+
+        # Unload vLLM model
+        if self.vllm_model is not None:
+            del self.vllm_model
+            self.vllm_model = None
+            self.model_path = None
+
+        # Unload transformers model
+        if self.model is not None:
+            del self.model
+            self.model = None
+
+        # Clean up processor
+        if self.processor is not None:
+            del self.processor
+            self.processor = None
 
-        del self.model
-        del self.processor
-        self.model = None
-        self.processor = None
         self.model_name = None
 
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-        return f"Unloaded '{model_name}' and freed memory."
+        return f"Unloaded '{model_name}' ({backend_used}) and freed memory."
 
     def get_memory_info(self) -> str:
         """Get current GPU memory usage."""
@@ -315,6 +448,117 @@ class VLMManager:
 
         return f"GPU Memory: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, {total:.1f}GB total"
 
+    def _generate_with_vllm(
+        self,
+        messages: List[Dict[str, Any]],
+        max_new_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 50,
+        repetition_penalty: float = 1.1,
+        video_max_frames: int = 8,
+    ) -> str:
+        """Generate a response using vLLM backend."""
+        if self.vllm_model is None:
+            return "Error: No vLLM model loaded."
+
+        try:
+            # Process messages to extract images and prepare for vLLM
+            images = []
+            processed_messages = []
+
+            for msg in messages:
+                if isinstance(msg.get("content"), list):
+                    new_content = []
+                    for item in msg["content"]:
+                        if item.get("type") == "image" and "image" in item:
+                            img = item["image"]
+                            images.append(img)
+                            # For vLLM, use placeholder in text
+                            new_content.append({"type": "image"})
+                        elif item.get("type") == "video" and "video" in item:
+                            # Process video into frames
+                            video_path = item["video"]
+                            if isinstance(video_path, str) and os.path.exists(video_path):
+                                try:
+                                    frames = extract_video_frames(video_path, max_frames=video_max_frames)
+                                    for frame in frames:
+                                        images.append(frame)
+                                        new_content.append({"type": "image"})
+                                    if frames:
+                                        new_content.append({"type": "text", "text": f"[The above {len(frames)} images are frames extracted from a video]"})
+                                except Exception as e:
+                                    new_content.append({"type": "text", "text": f"[Video processing error: {str(e)}]"})
+                        elif item.get("type") == "text":
+                            new_content.append(item)
+                        else:
+                            new_content.append(item)
+                    processed_messages.append({"role": msg["role"], "content": new_content})
+                else:
+                    processed_messages.append(msg)
+
+            # Apply chat template using processor
+            text_input = self.processor.apply_chat_template(
+                processed_messages,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+
+            print(f"[vLLM Debug] Prompt preview: {text_input[:500]}...")
+            print(f"[vLLM Debug] Number of images: {len(images)}")
+
+            # Configure sampling parameters
+            sampling_params = SamplingParams(
+                max_tokens=max_new_tokens,
+                temperature=temperature if temperature > 0 else 0.001,
+                top_p=top_p,
+                top_k=top_k if top_k > 0 else -1,
+                repetition_penalty=repetition_penalty,
+            )
+
+            # Prepare multimodal inputs for vLLM
+            if images:
+                # Convert PIL images to format vLLM expects
+                mm_data = {"image": images}
+                inputs = {
+                    "prompt": text_input,
+                    "multi_modal_data": mm_data,
+                }
+            else:
+                inputs = {"prompt": text_input}
+
+            # Generate with timing
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            start_time = time.perf_counter()
+
+            outputs = self.vllm_model.generate([inputs], sampling_params=sampling_params)
+
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            end_time = time.perf_counter()
+
+            # Extract response
+            response = outputs[0].outputs[0].text
+
+            # Calculate throughput
+            num_generated_tokens = len(outputs[0].outputs[0].token_ids)
+            generation_time = end_time - start_time
+            tokens_per_sec = num_generated_tokens / generation_time if generation_time > 0 else 0
+
+            print(f"[vLLM Inference] Generated {num_generated_tokens} tokens in {generation_time:.2f}s ({tokens_per_sec:.2f} tok/s)")
+
+            # Clean up thinking tags if present
+            if "</think>" in response:
+                response = response.split("</think>")[-1].strip()
+
+            return response
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return f"Error during vLLM generation: {str(e)}"
+
     @torch.inference_mode()
     def generate(
         self,
@@ -327,6 +571,18 @@ class VLMManager:
         video_max_frames: int = 8,
     ) -> str:
         """Generate a response from the model."""
+        # Use vLLM backend if loaded
+        if self.vllm_model is not None:
+            return self._generate_with_vllm(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                video_max_frames=video_max_frames,
+            )
+
         if self.model is None:
             return "Error: No model loaded. Please load a model first."
 
@@ -442,10 +698,25 @@ class VLMManager:
 vlm_manager: Optional[VLMManager] = None
 
 
-def initialize_manager(low_vram: bool = False):
+def initialize_manager(low_vram: bool = False, backend: str = "auto"):
     """Initialize the global VLM manager."""
     global vlm_manager
-    vlm_manager = VLMManager(low_vram=low_vram)
+    vlm_manager = VLMManager(low_vram=low_vram, backend=backend)
+
+
+def switch_backend_handler(backend: str):
+    """Handle backend switching from UI."""
+    global vlm_manager
+    if vlm_manager is not None:
+        # Unload current model first
+        vlm_manager.unload_model()
+
+    # Get current low_vram setting
+    low_vram = vlm_manager.low_vram if vlm_manager else False
+
+    # Reinitialize with new backend
+    vlm_manager = VLMManager(low_vram=low_vram, backend=backend)
+    return f"Switched to {vlm_manager.backend} backend"
 
 
 def load_model_handler(model_name: str, quantization: str, use_flash_attn: bool, vram_buffer: int, progress=gr.Progress()):
@@ -484,7 +755,8 @@ def chat_handler(
     auto_unload: bool = False,
 ):
     """Handle chat messages from UI."""
-    if vlm_manager is None or vlm_manager.model is None:
+    # Check if any model is loaded (either transformers or vLLM)
+    if vlm_manager is None or (vlm_manager.model is None and vlm_manager.vllm_model is None):
         return history + [(message, "Error: No model loaded. Please load a model first.")], ""
 
     # Build messages list for the model
@@ -657,56 +929,57 @@ def create_ui():
     """Create the Gradio interface."""
     available_models = vlm_manager.get_available_models() if vlm_manager else ["Manager not initialized"]
 
-    with gr.Blocks(
-        title="Chromaforge VLM",
-        theme=themes.Default(
-            primary_hue=colors.Color(
-                name="custom",
-                c50="#E6F0FF",
-                c100="#CCE0FF",
-                c200="#99C1FF",
-                c300="#66A3FF",
-                c400="#3384FF",
-                c500="#0060df",
-                c600="#0052C2",
-                c700="#003D91",
-                c800="#002961",
-                c900="#001430",
-                c950="#000A18"
-            )
-        ),
-        css="""
-        .gallery-item:first-child { border: 2px solid #4CAF50 !important; }
-        .gallery-item:first-child:hover { border-color: #45a049 !important; }
-        .green-btn {
-            background: linear-gradient(to bottom right, #2ecc71, #27ae60) !important;
-            color: white !important;
-            border: none !important;
-        }
-        .green-btn:hover {
-            background: linear-gradient(to bottom right, #27ae60, #219651) !important;
-        }
-        .refresh-btn {
-            max-width: 40px !important;
-            min-width: 40px !important;
-            height: 40px !important;
-            border-radius: 50% !important;
-            padding: 0 !important;
-            display: flex !important;
-            align-items: center !important;
-            justify-content: center !important;
-        }
-        .light-blue-btn {
-            background: linear-gradient(to bottom right, #AEC6CF, #9AB8C4) !important;
-            color: #333 !important;
-            border: 1px solid #9AB8C4 !important;
-        }
-        .light-blue-btn:hover {
-            background: linear-gradient(to bottom right, #9AB8C4, #8AA9B5) !important;
-            border-color: #8AA9B5 !important;
-        }
-        """,
-    ) as demo:
+    # Theme for Gradio 6.x (passed to launch() instead of Blocks())
+    global vlm_theme, vlm_css
+    vlm_theme = themes.Default(
+        primary_hue=colors.Color(
+            name="custom",
+            c50="#E6F0FF",
+            c100="#CCE0FF",
+            c200="#99C1FF",
+            c300="#66A3FF",
+            c400="#3384FF",
+            c500="#0060df",
+            c600="#0052C2",
+            c700="#003D91",
+            c800="#002961",
+            c900="#001430",
+            c950="#000A18"
+        )
+    )
+    vlm_css = """
+    .gallery-item:first-child { border: 2px solid #4CAF50 !important; }
+    .gallery-item:first-child:hover { border-color: #45a049 !important; }
+    .green-btn {
+        background: linear-gradient(to bottom right, #2ecc71, #27ae60) !important;
+        color: white !important;
+        border: none !important;
+    }
+    .green-btn:hover {
+        background: linear-gradient(to bottom right, #27ae60, #219651) !important;
+    }
+    .refresh-btn {
+        max-width: 40px !important;
+        min-width: 40px !important;
+        height: 40px !important;
+        border-radius: 50% !important;
+        padding: 0 !important;
+        display: flex !important;
+        align-items: center !important;
+        justify-content: center !important;
+    }
+    .light-blue-btn {
+        background: linear-gradient(to bottom right, #AEC6CF, #9AB8C4) !important;
+        color: #333 !important;
+        border: 1px solid #9AB8C4 !important;
+    }
+    .light-blue-btn:hover {
+        background: linear-gradient(to bottom right, #9AB8C4, #8AA9B5) !important;
+        border-color: #8AA9B5 !important;
+    }
+    """
+
+    with gr.Blocks(title="Chromaforge VLM") as demo:
         with gr.Row():
             # Left column - Settings (shared across tabs)
             with gr.Column(scale=1):
@@ -721,6 +994,21 @@ def create_ui():
 
                 refresh_models_btn = gr.Button("Refresh Model List", size="sm")
 
+                # Backend selection (vLLM or transformers)
+                backend_choices = ["auto", "vllm", "transformers"] if VLLM_AVAILABLE else ["transformers"]
+                backend_dropdown = gr.Dropdown(
+                    choices=backend_choices,
+                    value="auto" if VLLM_AVAILABLE else "transformers",
+                    label="Backend",
+                    info="vLLM: faster inference, transformers: more compatible",
+                    interactive=True,
+                )
+                backend_status = gr.Textbox(
+                    label="Backend Status",
+                    value=f"Current: {vlm_manager.backend if vlm_manager else 'not initialized'}",
+                    interactive=False,
+                )
+
                 quantization_dropdown = gr.Dropdown(
                     choices=["none", "4bit", "8bit"],
                     value="none",
@@ -830,7 +1118,6 @@ def create_ui():
                         chatbot = gr.Chatbot(
                             label="Conversation",
                             height=400,
-                            show_copy_button=True,
                         )
 
                         with gr.Row():
@@ -908,6 +1195,13 @@ def create_ui():
             outputs=[model_status],
         )
 
+        # Backend switching handler
+        backend_dropdown.change(
+            fn=switch_backend_handler,
+            inputs=[backend_dropdown],
+            outputs=[backend_status],
+        )
+
         unload_btn.click(
             fn=unload_model_handler,
             outputs=[model_status],
@@ -995,6 +1289,13 @@ def main():
         action="store_true",
         help="Enable low VRAM mode for smaller GPUs",
     )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=["auto", "vllm", "transformers"],
+        default="auto",
+        help="Backend for model inference (default: auto - uses vLLM if available)",
+    )
 
     args = parser.parse_args()
 
@@ -1005,13 +1306,14 @@ def main():
     print("Chromaforge VLM Chat Interface")
     print("=" * 60)
     print(f"Low VRAM mode: {'enabled' if args.lowvram else 'disabled'}")
+    print(f"Backend: {args.backend}" + (" (vLLM available)" if VLLM_AVAILABLE else " (vLLM not available)"))
     print(f"Server: http://{host}:{args.port}")
     if args.listen:
         print("LAN access: enabled (listening on 0.0.0.0)")
     print("=" * 60)
 
     # Initialize the manager
-    initialize_manager(low_vram=args.lowvram)
+    initialize_manager(low_vram=args.lowvram, backend=args.backend)
 
     # Create and launch the UI
     demo = create_ui()
@@ -1019,6 +1321,8 @@ def main():
         server_name=host,
         server_port=args.port,
         share=args.share,
+        theme=vlm_theme,
+        css=vlm_css,
     )