huggingface · regisss · Dec 12, 2024 · Dec 9, 2024 · Dec 10, 2024
@@ -441,6 +441,12 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
     if load_to_meta:
         # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load
         with deepspeed.OnDevice(dtype=model_dtype, device="meta"):
+            if (
+                config.rope_scaling
+                and config.rope_scaling["rope_type"] == "llama3"
+                and config.max_position_embeddings > 8192
+            ):
+                config.max_position_embeddings = 8192
             model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype)
 
         # Model loaded to meta is managed differently

@@ -119,9 +119,6 @@ def __init__(
             else:
                 self.rope_type = "default"
             self.max_seq_len_cached = config.max_position_embeddings
-            # Truncate the cached max sequence length to 8k to limit cached register buffer size
-            if not self.training and config.max_position_embeddings > 8192 and self.rope_type == "llama3":
-                self.max_seq_len_cached = 8192
             self.original_max_seq_len = config.max_position_embeddings
 
         self.config = config