From 06bacece84523c46b74edced84b50cfd2d46acad Mon Sep 17 00:00:00 2001 From: maybleMyers Date: Mon, 8 Dec 2025 15:15:19 -0800 Subject: [PATCH] update 30b req --- vlm.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/vlm.py b/vlm.py index 880e90de..4f20858c 100644 --- a/vlm.py +++ b/vlm.py @@ -224,19 +224,32 @@ class VLMManager: elif quantization == "8bit": try: from transformers import BitsAndBytesConfig + + # Clear GPU memory before loading + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + + print("Using 8-bit quantization...") + + # 8-bit quantization must happen on GPU (bitsandbytes requirement) + # But we can control memory by loading weights incrementally load_kwargs["quantization_config"] = BitsAndBytesConfig( load_in_8bit=True, - llm_int8_enable_fp32_cpu_offload=True, + # Skip quantizing certain modules to save memory during loading + llm_int8_skip_modules=["lm_head", "embed_tokens"], ) - load_kwargs["device_map"] = "auto" - # Use disk offloading for very large models during quantization + load_kwargs["device_map"] = "sequential" # Load layers one by one + load_kwargs["max_memory"] = {0: "45GiB", "cpu": "100GiB"} + + # Offload folder for overflow offload_dir = Path(tempfile.gettempdir()) / "vlm_offload" offload_dir.mkdir(exist_ok=True) load_kwargs["offload_folder"] = str(offload_dir) - load_kwargs["offload_state_dict"] = True - print(f"Using 8-bit quantization (offloading to {offload_dir})") - except ImportError: - print("Warning: bitsandbytes not installed, falling back to bfloat16") + + except ImportError as e: + print(f"Warning: bitsandbytes not installed, falling back to bfloat16") load_kwargs["torch_dtype"] = torch.bfloat16 load_kwargs["device_map"] = "auto" else: