diff --git a/vlm.py b/vlm.py index 2689300d..e37c2b93 100644 --- a/vlm.py +++ b/vlm.py @@ -26,18 +26,51 @@ except ImportError: CV2_AVAILABLE = False print("Warning: opencv-python not installed. Video support will be limited.") -# Try to import vLLM for high-performance inference -try: - from vllm import LLM, SamplingParams - VLLM_AVAILABLE = True - print("vLLM loaded successfully") -except ImportError as e: - VLLM_AVAILABLE = False - print(f"Warning: vLLM not available. Install with: pip install vllm>=0.11.0") - print(f" Import error: {e}") -except Exception as e: - VLLM_AVAILABLE = False - print(f"Warning: vLLM import failed: {e}") +# vLLM will be imported lazily to allow setting VLLM_USE_V1 before import +VLLM_AVAILABLE = False +LLM = None +SamplingParams = None + +def _check_vllm_available(): + """Check if vLLM can be imported (without actually importing it).""" + try: + import importlib.util + spec = importlib.util.find_spec("vllm") + return spec is not None + except Exception: + return False + +def _import_vllm(use_v0: bool = False): + """Import vLLM with optional V0 engine setting.""" + global VLLM_AVAILABLE, LLM, SamplingParams + + if use_v0: + os.environ["VLLM_USE_V1"] = "0" + print("Setting VLLM_USE_V1=0 for CPU offloading support") + + try: + from vllm import LLM as _LLM, SamplingParams as _SamplingParams + LLM = _LLM + SamplingParams = _SamplingParams + VLLM_AVAILABLE = True + print("vLLM loaded successfully") + return True + except ImportError as e: + VLLM_AVAILABLE = False + print(f"Warning: vLLM not available. Install with: pip install vllm>=0.11.0") + print(f" Import error: {e}") + return False + except Exception as e: + VLLM_AVAILABLE = False + print(f"Warning: vLLM import failed: {e}") + return False + +# Check if vLLM is available (but don't import yet) +_VLLM_CAN_IMPORT = _check_vllm_available() +if _VLLM_CAN_IMPORT: + print("vLLM detected, will be loaded when needed") +else: + print("Warning: vLLM not available. Install with: pip install vllm>=0.11.0") # Try to import qwen-vl-utils for image processing try: @@ -121,13 +154,13 @@ class VLMManager: self.vllm_model = None self.model_path = None - # Determine backend + # Determine backend (use _VLLM_CAN_IMPORT since vLLM is lazily imported) if backend == "auto": - self.backend = "vllm" if VLLM_AVAILABLE else "transformers" + self.backend = "vllm" if _VLLM_CAN_IMPORT else "transformers" else: self.backend = backend - if self.backend == "vllm" and not VLLM_AVAILABLE: + if self.backend == "vllm" and not _VLLM_CAN_IMPORT: print("Warning: vLLM requested but not available. Falling back to transformers.") self.backend = "transformers" @@ -194,8 +227,18 @@ class VLMManager: def _load_with_vllm(self, model_name: str, quantization: str = "none", cpu_offload: int = 0, progress=gr.Progress()) -> str: """Load model using vLLM backend for high-performance inference.""" + global VLLM_AVAILABLE, LLM, SamplingParams + + # Determine if we need CPU offloading (requires V0 engine) + offload_gb = cpu_offload if cpu_offload > 0 else (16 if self.low_vram else 0) + use_v0 = offload_gb > 0 + + # Import vLLM with appropriate engine version if not VLLM_AVAILABLE: - return "vLLM is not available. Please install with: pip install vllm>=0.11.0" + if not _VLLM_CAN_IMPORT: + return "vLLM is not available. Please install with: pip install vllm>=0.11.0" + if not _import_vllm(use_v0=use_v0): + return "Failed to import vLLM. Check console for errors." # Check if already loaded if self.vllm_model is not None and self.model_name == model_name: @@ -233,14 +276,10 @@ class VLMManager: } # Handle CPU offloading (from slider or low_vram mode) - # Note: cpu_offload_gb only works with V0 engine - offload_gb = cpu_offload if cpu_offload > 0 else (16 if self.low_vram else 0) if offload_gb > 0: - import os - os.environ["VLLM_USE_V1"] = "0" # Force V0 engine for CPU offload vllm_kwargs["cpu_offload_gb"] = offload_gb vllm_kwargs["gpu_memory_utilization"] = 0.85 - print(f"CPU Offload: Using V0 engine with {offload_gb}GB offload to CPU") + print(f"CPU Offload: {offload_gb}GB will be offloaded to CPU") # Handle quantization if quantization == "4bit": @@ -1007,10 +1046,10 @@ def create_ui(): refresh_models_btn = gr.Button("Refresh Model List", size="sm") # Backend selection (vLLM or transformers) - backend_choices = ["auto", "vllm", "transformers"] if VLLM_AVAILABLE else ["transformers"] + backend_choices = ["auto", "vllm", "transformers"] if _VLLM_CAN_IMPORT else ["transformers"] backend_dropdown = gr.Dropdown( choices=backend_choices, - value="auto" if VLLM_AVAILABLE else "transformers", + value="auto" if _VLLM_CAN_IMPORT else "transformers", label="Backend", info="vLLM: faster inference, transformers: more compatible", interactive=True, @@ -1326,7 +1365,7 @@ def main(): print("Chromaforge VLM Chat Interface") print("=" * 60) print(f"Low VRAM mode: {'enabled' if args.lowvram else 'disabled'}") - print(f"Backend: {args.backend}" + (" (vLLM available)" if VLLM_AVAILABLE else " (vLLM not available)")) + print(f"Backend: {args.backend}" + (" (vLLM available)" if _VLLM_CAN_IMPORT else " (vLLM not available)")) print(f"Server: http://{host}:{args.port}") if args.listen: print("LAN access: enabled (listening on 0.0.0.0)")