Skip to content

Commit

Permalink
Remap partial disk offload to cpu for GGUF files
Browse files Browse the repository at this point in the history
GGUF files don't support disk offload, so when device_map is auto-generated, remap any disk-offloaded modules to the CPU. If a user-supplied device_map still maps one or more modules to disk, raise a NotImplementedError.
  • Loading branch information
dmlap committed Feb 10, 2025
1 parent 0be9802 commit 136c2a5
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
19 changes: 19 additions & 0 deletions src/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4241,12 +4241,31 @@ def from_pretrained(
if hf_quantizer is not None:
hf_quantizer.validate_environment(device_map=device_map)

if gguf_path and device_map is not None:
    # GGUF checkpoints cannot be disk-offloaded. When Accelerate's auto
    # device map chose "disk" for some modules, fall back to "cpu" for
    # those modules and warn the user.
    # NOTE: a list (not a set) keeps the warning's module order stable.
    remapped_modules = []
    for name, device in device_map.items():
        if device == "disk":
            device_map[name] = "cpu"
            remapped_modules.append(name)
    if remapped_modules:
        # Bug fix: the previous implicit string concatenation made the
        # whole message the join *separator* instead of a prefix.
        logger.warning(
            "Accelerate has auto-mapped modules to disk but disk offload is not supported for "
            "models loaded from GGUF files. Remapping modules to the cpu: "
            + ", ".join(remapped_modules)
        )

elif device_map is not None:
    model.tie_weights()
    tied_params = find_tied_parameters(model)
    # check if we don't have tied param in different devices
    check_tied_parameters_on_same_device(tied_params, device_map)

# A user-supplied device_map that still maps modules to disk (i.e. it was
# not auto-generated and therefore not remapped above) is unsupported.
if gguf_path and device_map is not None and "disk" in device_map.values():
    raise NotImplementedError(
        "One or more modules is configured to be mapped to disk. Disk offload is not supported for models "
        "loaded from GGUF files."
    )

if from_tf:
if resolved_archive_file.endswith(".index"):
# Load from a TensorFlow 1.X checkpoint - provided by original authors
Expand Down
43 changes: 43 additions & 0 deletions tests/quantization/ggml/test_ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,49 @@ def test_q6_k_fp16(self):
EXPECTED_TEXT = "Hello, World!\n\nStep 3: Add"
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

def test_gguf_errors_disk_offload(self):
    """Loading a GGUF checkpoint whose device_map offloads modules to disk must raise."""
    from collections import OrderedDict

    q2_k_gguf_model_id = self.gguf_filename.format(quant_type=QuantType.Q2_K.name)

    # Same mapping as before, built programmatically: embeddings, lm_head and
    # layers 0-9 on cpu; layers 10-22 plus the final norm and rotary embedding
    # offloaded to disk. Insertion order matches the original literal exactly.
    device_map = OrderedDict()
    device_map["model.embed_tokens"] = "cpu"
    device_map["lm_head"] = "cpu"
    for idx in range(10):
        device_map[f"model.layers.{idx}"] = "cpu"
    for idx in range(10, 23):
        device_map[f"model.layers.{idx}"] = "disk"
    device_map["model.norm"] = "disk"
    device_map["model.rotary_emb"] = "disk"

    with self.assertRaises(NotImplementedError):
        AutoModelForCausalLM.from_pretrained(
            self.gguf_model_id,
            device_map=device_map,
            gguf_file=q2_k_gguf_model_id,
            offload_folder="offload",
            offload_state_dict=True,
        )


@require_gguf
@require_torch_gpu
Expand Down

0 comments on commit 136c2a5

Please sign in to comment.