Skip to content

Commit

Permalink
Remap partial disk offload to cpu for GGUF files
Browse files Browse the repository at this point in the history
GGUF files don't support disk offload, so when device_map is auto-generated, remap any disk-offloaded modules to the CPU. If a user-supplied device_map still maps one or more modules to disk, raise a NotImplementedError.
  • Loading branch information
dmlap committed Feb 10, 2025
1 parent 0be9802 commit 136c2a5
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
19 changes: 19 additions & 0 deletions src/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4241,12 +4241,31 @@ def from_pretrained(
if hf_quantizer is not None:
hf_quantizer.validate_environment(device_map=device_map)

if gguf_path and device_map is not None:
    # GGUF checkpoints cannot be disk-offloaded. When Accelerate's auto
    # device map chose "disk" for some modules, fall back to "cpu" for
    # those modules and warn the user.
    # NOTE: a list (not a set) keeps the warning's module order stable.
    remapped_modules = []
    for name, device in device_map.items():
        if device == "disk":
            device_map[name] = "cpu"
            remapped_modules.append(name)
    if remapped_modules:
        # Bug fix: the previous implicit string concatenation made the
        # whole message the join *separator* instead of a prefix.
        logger.warning(
            "Accelerate has auto-mapped modules to disk but disk offload is not supported for "
            "models loaded from GGUF files. Remapping modules to the cpu: "
            + ", ".join(remapped_modules)
        )

elif device_map is not None:
    model.tie_weights()
    tied_params = find_tied_parameters(model)
    # check if we don't have tied param in different devices
    check_tied_parameters_on_same_device(tied_params, device_map)

# A user-supplied device_map that still maps modules to disk (i.e. it was
# not auto-generated and therefore not remapped above) is unsupported.
if gguf_path and device_map is not None and "disk" in device_map.values():
    raise NotImplementedError(
        "One or more modules is configured to be mapped to disk. Disk offload is not supported for models "
        "loaded from GGUF files."
    )

if from_tf:
if resolved_archive_file.endswith(".index"):
# Load from a TensorFlow 1.X checkpoint - provided by original authors
Expand Down
43 changes: 43 additions & 0 deletions tests/quantization/ggml/test_ggml.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,49 @@ def test_q6_k_fp16(self):
EXPECTED_TEXT = "Hello, World!\n\nStep 3: Add"
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

def test_gguf_errors_disk_offload(self):
    """Loading a GGUF checkpoint whose device_map offloads modules to disk must raise."""
    from collections import OrderedDict

    q2_k_gguf_model_id = self.gguf_filename.format(quant_type=QuantType.Q2_K.name)

    # Same mapping as before, built programmatically: embeddings, lm_head and
    # layers 0-9 on cpu; layers 10-22 plus the final norm and rotary embedding
    # offloaded to disk. Insertion order matches the original literal exactly.
    device_map = OrderedDict()
    device_map["model.embed_tokens"] = "cpu"
    device_map["lm_head"] = "cpu"
    for idx in range(10):
        device_map[f"model.layers.{idx}"] = "cpu"
    for idx in range(10, 23):
        device_map[f"model.layers.{idx}"] = "disk"
    device_map["model.norm"] = "disk"
    device_map["model.rotary_emb"] = "disk"

    with self.assertRaises(NotImplementedError):
        AutoModelForCausalLM.from_pretrained(
            self.gguf_model_id,
            device_map=device_map,
            gguf_file=q2_k_gguf_model_id,
            offload_folder="offload",
            offload_state_dict=True,
        )


@require_gguf
@require_torch_gpu
Expand Down

0 comments on commit 136c2a5

Please sign in to comment.