SN1-437: Implement gemma-3-27b-it #644

Merged · 24 commits · Mar 18, 2025
Changes from 1 commit
Simplify Unload Model
richwardle committed Mar 18, 2025
commit 25b9586c5658e5572b962c9bb6c72c8b775b3436
2 changes: 1 addition & 1 deletion prompting/llms/hf_text_image.py
@@ -2,7 +2,7 @@

try:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from transformers import AutoModelForImageTextToText, AutoProcessor
except ImportError:
logger.warning("Transformers or torch is not installed. This module will not be available.")

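For context on the guarded import above: a minimal, self-contained sketch of the optional-dependency pattern this module uses. The `HAS_TRANSFORMERS` flag and `load_processor` helper are illustrative additions, not part of the repo; the point is that a missing torch/transformers install logs a warning instead of breaking import of the whole package.

```python
import logging

logger = logging.getLogger(__name__)  # stand-in for the repo's logger (assumption)

try:
    import torch
    from transformers import AutoModelForImageTextToText, AutoProcessor

    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False  # hypothetical flag; the real module only logs a warning
    logger.warning("Transformers or torch is not installed. This module will not be available.")


def load_processor(model_id: str):
    """Return a processor for `model_id`, or None when the optional deps are missing."""
    if not HAS_TRANSFORMERS:
        return None
    return AutoProcessor.from_pretrained(model_id)
```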
96 changes: 51 additions & 45 deletions prompting/llms/model_manager.py
@@ -1,5 +1,6 @@
import asyncio
import gc
import time
from typing import Dict

import torch
@@ -49,11 +50,13 @@ def load_model(self, model_config: ModelConfig, force: bool = True):
logger.debug(f"Enough RAM for model {model_config.llm_model_id} free")
GPUInfo.log_gpu_info()
break

# If no active models remain but GPU is still showing significant usage,
# perform emergency cleanup
if len(self.active_models) == 0 and GPUInfo.gpu_utilization > 0.25: # More than 25% still in use
logger.warning(f"GPU still showing high utilization after unloading all models. Performing emergency cleanup.")
logger.warning(
"GPU still showing high utilization after unloading all models. Performing emergency cleanup."
)
self._emergency_gpu_cleanup()

if self.used_ram + model_config.min_ram > self.total_ram or GPUInfo.free_memory < model_config.min_ram:
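`GPUInfo` is the subnet's own utility; as a rough illustration of the kind of check behind `GPUInfo.gpu_utilization > 0.25`, here is a hypothetical helper built on `torch.cuda.mem_get_info()` (the name and threshold handling are assumptions, not the actual GPUInfo implementation):

```python
import torch


def gpu_utilization(device: int = 0) -> float:
    """Fraction of total VRAM currently in use on `device` (0.0 if CUDA is unavailable)."""
    if not torch.cuda.is_available():
        return 0.0
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    return 1.0 - free_bytes / total_bytes


# Mirrors the guard in load_model: if nothing is tracked as loaded but more than
# 25% of VRAM is still occupied, an aggressive cleanup would be triggered.
if gpu_utilization() > 0.25:
    print("GPU still busy although no models are active; emergency cleanup would run here")
```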
@@ -83,6 +86,37 @@ def load_model(self, model_config: ModelConfig, force: bool = True):
except Exception as e:
logger.exception(f"Failed to load model {model_config.llm_model_id}. Error: {str(e)}")

def _cleanup_pytorch_model(self, model_instance, model_config: ModelConfig):
"""Handle cleanup specifically for PyTorch-based models."""
if hasattr(model_instance.llm, "model"):
try:
# Check if it's a PyTorch model with a 'to' method
if hasattr(model_instance.llm.model, "to"):
logger.debug(f"Moving model {model_config.llm_model_id} to CPU before deletion")
model_instance.llm.model.to("cpu")
time.sleep(0.1)

# Explicitly set requires_grad to False for all parameters if possible
if hasattr(model_instance.llm.model, "parameters"):
for param in model_instance.llm.model.parameters():
if hasattr(param, "requires_grad"):
param.requires_grad = False

except Exception as e:
logger.debug(f"Could not move model to CPU: {str(e)}, proceeding with direct deletion")

# Delete the model reference and any cached states
if hasattr(model_instance.llm.model, "_clear_cache"):
model_instance.llm.model._clear_cache()

# Explicitly delete model components if available
if hasattr(model_instance.llm.model, "modules"):
for module in list(model_instance.llm.model.modules()):
del module

# Final deletion of model
del model_instance.llm.model

def unload_model(self, model_config: ModelConfig):
if model_config not in self.active_models:
logger.warning("Couldn't find model to unload.")
@@ -107,36 +141,7 @@ def unload_model(self, model_config: ModelConfig):
# Handle pipeline-based models with a hybrid approach
if hasattr(model_instance, "llm"):
# Try to move model to CPU first if it's a PyTorch model
if hasattr(model_instance.llm, "model"):
try:
# Check if it's a PyTorch model with a 'to' method
if hasattr(model_instance.llm.model, "to"):
logger.debug(f"Moving model {model_config.llm_model_id} to CPU before deletion")
model_instance.llm.model.to("cpu")
# Small delay to allow memory transfer
import time
time.sleep(0.1)

# Explicitly set requires_grad to False for all parameters if possible
if hasattr(model_instance.llm.model, "parameters"):
for param in model_instance.llm.model.parameters():
if hasattr(param, "requires_grad"):
param.requires_grad = False

except Exception as e:
logger.debug(f"Could not move model to CPU: {str(e)}, proceeding with direct deletion")

# Delete the model reference and any cached states
if hasattr(model_instance.llm.model, "_clear_cache"):
model_instance.llm.model._clear_cache()

# Explicitly delete model components if available
if hasattr(model_instance.llm.model, "modules"):
for module in list(model_instance.llm.model.modules()):
del module

# Final deletion of model
del model_instance.llm.model
self._cleanup_pytorch_model(model_instance, model_config)

# Handle tokenizer
if hasattr(model_instance.llm, "tokenizer"):
Expand All @@ -159,18 +164,19 @@ def unload_model(self, model_config: ModelConfig):
torch.cuda.reset_peak_memory_stats()
# Synchronize CUDA to ensure operations are complete
torch.cuda.synchronize()

# Force additional cleanup with multiple empty_cache calls
torch.cuda.empty_cache()

# Wait a bit longer to ensure memory is released back to the system
import time

time.sleep(0.5)
torch.cuda.empty_cache()

# Additional synchronization point
torch.cuda.synchronize()

# One final garbage collection
gc.collect()

@@ -254,38 +260,38 @@ def _emergency_gpu_cleanup(self):
doesn't free up expected memory.
"""
logger.info("Performing emergency GPU cleanup")

# Reset model tracking state
self.active_models = {}
self.used_ram = 0.0

# Run aggressive cleanup sequence
import time

# Multiple rounds of garbage collection
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
time.sleep(0.1)

# Force CUDA synchronization
torch.cuda.synchronize()

# Reset all CUDA cached memory
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

# Try to release all unreachable objects
gc.collect(generation=2)

# Delay to allow OS to reclaim memory
time.sleep(1.0)

# Final cache clear
torch.cuda.empty_cache()
torch.cuda.synchronize()

logger.info(f"Emergency cleanup complete. Current GPU utilization: {GPUInfo.gpu_utilization * 100:.2f}%")
GPUInfo.log_gpu_info()

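The emergency path is essentially a belt-and-braces flush of the CUDA caching allocator. A condensed, standalone sketch of the same sequence, with illustrative round counts and sleeps rather than the values tuned above:

```python
import gc
import time

import torch


def aggressive_cuda_cleanup(rounds: int = 3, settle_seconds: float = 0.1) -> None:
    """Run several GC + cache-flush passes, then reset CUDA memory statistics."""
    if not torch.cuda.is_available():
        return
    for _ in range(rounds):
        gc.collect()                        # free unreachable Python objects holding tensors
        torch.cuda.empty_cache()            # return cached blocks to the driver
        time.sleep(settle_seconds)          # give the allocator a moment to settle
    torch.cuda.synchronize()                # make sure queued kernels have finished
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()
    gc.collect(generation=2)                # full collection including the oldest generation
    torch.cuda.empty_cache()
```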
5 changes: 4 additions & 1 deletion shared/settings.py
@@ -127,7 +127,10 @@ class SharedSettings(BaseSettings):
SUBTENSOR_NETWORK: Optional[str] = Field(None, env="SUBTENSOR_NETWORK")
MAX_ALLOWED_VRAM_GB: float = Field(62, env="MAX_ALLOWED_VRAM_GB")
PROXY_URL: Optional[str] = Field(None, env="PROXY_URL")
LLM_MODEL: list[str] = ["hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4", "mrfakename/mistral-small-3.1-24b-instruct-2503-hf"]
LLM_MODEL: list[str] = [
"hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
"mrfakename/mistral-small-3.1-24b-instruct-2503-hf",
]
SAMPLING_PARAMS: dict[str, Any] = {
"temperature": 0.7,
"top_p": 0.95,
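For reference, a minimal sketch of how a list-valued setting like `LLM_MODEL` behaves with pydantic-settings: the default list is used unless an environment variable supplies a JSON array. The class below is illustrative, not the repo's `SharedSettings`, and assumes pydantic v2 with the pydantic-settings package:

```python
from pydantic_settings import BaseSettings


class LLMSettings(BaseSettings):
    # Default model rotation; override with e.g.
    #   export LLM_MODEL='["hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"]'
    LLM_MODEL: list[str] = [
        "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
        "mrfakename/mistral-small-3.1-24b-instruct-2503-hf",
    ]


settings = LLMSettings()
print(settings.LLM_MODEL)
```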