SN1-437: Implement gemma-3-27b-it #644

Merged · 24 commits · Mar 18, 2025
Changes from 1 commit
Simplify Unload Model
richwardle committed Mar 18, 2025
commit 25b9586c5658e5572b962c9bb6c72c8b775b3436
2 changes: 1 addition & 1 deletion prompting/llms/hf_text_image.py
@@ -2,7 +2,7 @@

try:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from transformers import AutoModelForImageTextToText, AutoProcessor
except ImportError:
logger.warning("Transformers or torch is not installed. This module will not be available.")

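For context on the guarded import above: a minimal, self-contained sketch of the optional-dependency pattern this module uses. The `HAS_TRANSFORMERS` flag and `load_processor` helper are illustrative additions, not part of the repo; the point is that a missing torch/transformers install logs a warning instead of breaking import of the whole package.

```python
import logging

logger = logging.getLogger(__name__)  # stand-in for the repo's logger (assumption)

try:
    import torch
    from transformers import AutoModelForImageTextToText, AutoProcessor

    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False  # hypothetical flag; the real module only logs a warning
    logger.warning("Transformers or torch is not installed. This module will not be available.")


def load_processor(model_id: str):
    """Return a processor for `model_id`, or None when the optional deps are missing."""
    if not HAS_TRANSFORMERS:
        return None
    return AutoProcessor.from_pretrained(model_id)
```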
96 changes: 51 additions & 45 deletions prompting/llms/model_manager.py
@@ -1,5 +1,6 @@
import asyncio
import gc
import time
from typing import Dict

import torch
@@ -49,11 +50,13 @@ def load_model(self, model_config: ModelConfig, force: bool = True):
logger.debug(f"Enough RAM for model {model_config.llm_model_id} free")
GPUInfo.log_gpu_info()
break

# If no active models remain but GPU is still showing significant usage,
# perform emergency cleanup
if len(self.active_models) == 0 and GPUInfo.gpu_utilization > 0.25: # More than 25% still in use
logger.warning(f"GPU still showing high utilization after unloading all models. Performing emergency cleanup.")
logger.warning(
"GPU still showing high utilization after unloading all models. Performing emergency cleanup."
)
self._emergency_gpu_cleanup()

if self.used_ram + model_config.min_ram > self.total_ram or GPUInfo.free_memory < model_config.min_ram:
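`GPUInfo` is the subnet's own utility; as a rough illustration of the kind of check behind `GPUInfo.gpu_utilization > 0.25`, here is a hypothetical helper built on `torch.cuda.mem_get_info()` (the name and threshold handling are assumptions, not the actual GPUInfo implementation):

```python
import torch


def gpu_utilization(device: int = 0) -> float:
    """Fraction of total VRAM currently in use on `device` (0.0 if CUDA is unavailable)."""
    if not torch.cuda.is_available():
        return 0.0
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    return 1.0 - free_bytes / total_bytes


# Mirrors the guard in load_model: if nothing is tracked as loaded but more than
# 25% of VRAM is still occupied, an aggressive cleanup would be triggered.
if gpu_utilization() > 0.25:
    print("GPU still busy although no models are active; emergency cleanup would run here")
```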
@@ -83,6 +86,37 @@ def load_model(self, model_config: ModelConfig, force: bool = True):
except Exception as e:
logger.exception(f"Failed to load model {model_config.llm_model_id}. Error: {str(e)}")

def _cleanup_pytorch_model(self, model_instance, model_config: ModelConfig):
"""Handle cleanup specifically for PyTorch-based models."""
if hasattr(model_instance.llm, "model"):
try:
# Check if it's a PyTorch model with a 'to' method
if hasattr(model_instance.llm.model, "to"):
logger.debug(f"Moving model {model_config.llm_model_id} to CPU before deletion")
model_instance.llm.model.to("cpu")
time.sleep(0.1)

# Explicitly set requires_grad to False for all parameters if possible
if hasattr(model_instance.llm.model, "parameters"):
for param in model_instance.llm.model.parameters():
if hasattr(param, "requires_grad"):
param.requires_grad = False

except Exception as e:
logger.debug(f"Could not move model to CPU: {str(e)}, proceeding with direct deletion")

# Delete the model reference and any cached states
if hasattr(model_instance.llm.model, "_clear_cache"):
model_instance.llm.model._clear_cache()

# Explicitly delete model components if available
if hasattr(model_instance.llm.model, "modules"):
for module in list(model_instance.llm.model.modules()):
del module

# Final deletion of model
del model_instance.llm.model

def unload_model(self, model_config: ModelConfig):
if model_config not in self.active_models:
logger.warning("Couldn't find model to unload.")
@@ -107,36 +141,7 @@ def unload_model(self, model_config: ModelConfig):
# Handle pipeline-based models with a hybrid approach
if hasattr(model_instance, "llm"):
# Try to move model to CPU first if it's a PyTorch model
if hasattr(model_instance.llm, "model"):
try:
# Check if it's a PyTorch model with a 'to' method
if hasattr(model_instance.llm.model, "to"):
logger.debug(f"Moving model {model_config.llm_model_id} to CPU before deletion")
model_instance.llm.model.to("cpu")
# Small delay to allow memory transfer
import time
time.sleep(0.1)

# Explicitly set requires_grad to False for all parameters if possible
if hasattr(model_instance.llm.model, "parameters"):
for param in model_instance.llm.model.parameters():
if hasattr(param, "requires_grad"):
param.requires_grad = False

except Exception as e:
logger.debug(f"Could not move model to CPU: {str(e)}, proceeding with direct deletion")

# Delete the model reference and any cached states
if hasattr(model_instance.llm.model, "_clear_cache"):
model_instance.llm.model._clear_cache()

# Explicitly delete model components if available
if hasattr(model_instance.llm.model, "modules"):
for module in list(model_instance.llm.model.modules()):
del module

# Final deletion of model
del model_instance.llm.model
self._cleanup_pytorch_model(model_instance, model_config)

# Handle tokenizer
if hasattr(model_instance.llm, "tokenizer"):
Expand All @@ -159,18 +164,19 @@ def unload_model(self, model_config: ModelConfig):
torch.cuda.reset_peak_memory_stats()
# Synchronize CUDA to ensure operations are complete
torch.cuda.synchronize()

# Force additional cleanup with multiple empty_cache calls
torch.cuda.empty_cache()

# Wait a bit longer to ensure memory is released back to the system
import time

time.sleep(0.5)
torch.cuda.empty_cache()

# Additional synchronization point
torch.cuda.synchronize()

# One final garbage collection
gc.collect()

@@ -254,38 +260,38 @@ def _emergency_gpu_cleanup(self):
doesn't free up expected memory.
"""
logger.info("Performing emergency GPU cleanup")

# Reset model tracking state
self.active_models = {}
self.used_ram = 0.0

# Run aggressive cleanup sequence
import time

# Multiple rounds of garbage collection
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
time.sleep(0.1)

# Force CUDA synchronization
torch.cuda.synchronize()

# Reset all CUDA cached memory
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

# Try to release all unreachable objects
gc.collect(generation=2)

# Delay to allow OS to reclaim memory
time.sleep(1.0)

# Final cache clear
torch.cuda.empty_cache()
torch.cuda.synchronize()

logger.info(f"Emergency cleanup complete. Current GPU utilization: {GPUInfo.gpu_utilization * 100:.2f}%")
GPUInfo.log_gpu_info()

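The emergency path is essentially a belt-and-braces flush of the CUDA caching allocator. A condensed, standalone sketch of the same sequence, with illustrative round counts and sleeps rather than the values tuned above:

```python
import gc
import time

import torch


def aggressive_cuda_cleanup(rounds: int = 3, settle_seconds: float = 0.1) -> None:
    """Run several GC + cache-flush passes, then reset CUDA memory statistics."""
    if not torch.cuda.is_available():
        return
    for _ in range(rounds):
        gc.collect()                        # free unreachable Python objects holding tensors
        torch.cuda.empty_cache()            # return cached blocks to the driver
        time.sleep(settle_seconds)          # give the allocator a moment to settle
    torch.cuda.synchronize()                # make sure queued kernels have finished
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()
    gc.collect(generation=2)                # full collection including the oldest generation
    torch.cuda.empty_cache()
```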
5 changes: 4 additions & 1 deletion shared/settings.py
@@ -127,7 +127,10 @@ class SharedSettings(BaseSettings):
SUBTENSOR_NETWORK: Optional[str] = Field(None, env="SUBTENSOR_NETWORK")
MAX_ALLOWED_VRAM_GB: float = Field(62, env="MAX_ALLOWED_VRAM_GB")
PROXY_URL: Optional[str] = Field(None, env="PROXY_URL")
LLM_MODEL: list[str] = ["hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4", "mrfakename/mistral-small-3.1-24b-instruct-2503-hf"]
LLM_MODEL: list[str] = [
"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
"mrfakename/mistral-small-3.1-24b-instruct-2503-hf",
]
SAMPLING_PARAMS: dict[str, Any] = {
"temperature": 0.7,
"top_p": 0.95,
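For reference, a minimal sketch of how a list-valued setting like `LLM_MODEL` behaves with pydantic-settings: the default list is used unless an environment variable supplies a JSON array. The class below is illustrative, not the repo's `SharedSettings`, and assumes pydantic v2 with the pydantic-settings package:

```python
from pydantic_settings import BaseSettings


class LLMSettings(BaseSettings):
    # Default model rotation; override with e.g.
    #   export LLM_MODEL='["hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"]'
    LLM_MODEL: list[str] = [
        "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
        "mrfakename/mistral-small-3.1-24b-instruct-2503-hf",
    ]


settings = LLMSettings()
print(settings.LLM_MODEL)
```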