update configs #2107

Merged · 5 commits · Dec 6, 2024
12 changes: 7 additions & 5 deletions recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -19,6 +19,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/code_llama2_7B/full_low_memory # /tmp may be deleted by your system. Change it to your preference.

# Model arguments
model:
_component_: torchtune.models.code_llama2.code_llama2_7b
@@ -39,7 +41,7 @@ checkpointer:
pytorch_model-00003-of-00003.bin
]
recipe_checkpoint: null
-output_dir: /tmp/CodeLlama-7b-hf
+output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False

@@ -55,14 +57,14 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
+gradient_accumulation_steps: 1 # Use to increase effective batch size
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -73,13 +75,13 @@ enable_activation_offloading: True # True reduces memory
dtype: bf16

# Logging
-output_dir: /tmp/codellama_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: /tmp/CodeLlama-7b-hf/logs
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
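The change above is the pattern this PR applies to every config: a single top-level `output_dir` is declared once, and the checkpointer and metric logger reference it via `${output_dir}` interpolation (torchtune configs are parsed with OmegaConf, which resolves such references). A minimal sketch of how the resolution works, assuming only that `omegaconf` is installed:

```python
from omegaconf import OmegaConf

# Stripped-down stand-in for the updated config layout.
yaml_cfg = """
output_dir: /tmp/torchtune/code_llama2_7B/full_low_memory
checkpointer:
  output_dir: ${output_dir}
metric_logger:
  log_dir: ${output_dir}/logs
"""
cfg = OmegaConf.create(yaml_cfg)

# Both nested fields resolve against the single top-level value.
print(cfg.checkpointer.output_dir)  # /tmp/torchtune/code_llama2_7B/full_low_memory
print(cfg.metric_logger.log_dir)    # /tmp/torchtune/code_llama2_7B/full_low_memory/logs
```

One benefit: a single CLI override now redirects every artifact of a run, e.g. `tune run full_finetune_single_device --config code_llama2/7B_full_low_memory output_dir=~/runs/code_llama2` (the destination path here is illustrative).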
11 changes: 6 additions & 5 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/code_llama2_7B/lora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.code_llama2.lora_code_llama2_7b
@@ -42,7 +44,7 @@ checkpointer:
]
adapter_checkpoint: null
recipe_checkpoint: null
-output_dir: /tmp/CodeLlama-7b-hf
+output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -59,7 +61,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
+gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -70,7 +72,7 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -81,10 +83,9 @@ enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
-output_dir: /tmp/codellama_lora_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: /tmp/CodeLlama-7b-hf/logs
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

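A note on the recurring comment fix from "virtual" to "effective" batch size: gradients are accumulated across `gradient_accumulation_steps` micro-batches before each optimizer step, so the effective batch size is `batch_size * gradient_accumulation_steps` (2 * 8 = 16 for this config) at the cost of more steps, not more memory. A self-contained sketch of the idea; all names here are hypothetical stand-ins for what the recipe actually wires up:

```python
import torch
from torch import nn, optim

# Hypothetical stand-ins for the recipe's model, loss, optimizer, and data.
model = nn.Linear(16, 4)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

batch_size = 2
gradient_accumulation_steps = 8  # effective batch size = 2 * 8 = 16

fake_loader = [
    (torch.randn(batch_size, 16), torch.randint(0, 4, (batch_size,)))
    for _ in range(16)
]

for step, (inputs, labels) in enumerate(fake_loader):
    loss = loss_fn(model(inputs), labels)
    # Scale so the accumulated gradient matches one 16-sample batch.
    (loss / gradient_accumulation_steps).backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```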
11 changes: 6 additions & 5 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/code_llama2_7B/qlora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.code_llama2.qlora_code_llama2_7b
@@ -42,7 +44,7 @@ checkpointer:
]
adapter_checkpoint: null
recipe_checkpoint: null
-output_dir: /tmp/CodeLlama-7b-hf
+output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -58,7 +60,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
+gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -69,7 +71,7 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -80,10 +82,9 @@ enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
-output_dir: /tmp/codellama_qlora_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: /tmp/CodeLlama-7b-hf/logs
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

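The new `compile` comment is also more precise: the flag asks the recipe to `torch.compile` both the model and the loss, rather than "pytorch compile" in some vague sense. A rough, self-contained sketch of what enabling it means, with stand-in model and loss rather than the recipe's actual wiring:

```python
import torch
from torch import nn

# Stand-ins; the recipe compiles its own model and chunked-CE loss.
model = nn.Linear(16, 4)
loss_fn = nn.CrossEntropyLoss()

model = torch.compile(model)      # compiled forward pass
loss_fn = torch.compile(loss_fn)  # the loss is a module, so it compiles too

x = torch.randn(2, 16)
target = torch.randint(0, 4, (2,))
loss = loss_fn(model(x), target)  # first call triggers compilation
```

The first call pays a one-time compilation cost; subsequent steps reuse the compiled graph, which is where the speed and memory wins come from.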
12 changes: 7 additions & 5 deletions recipes/configs/dev/8B_full_experimental.yaml
@@ -18,6 +18,8 @@
# best to use 8B_full_single_device.yaml for those cases


+output_dir: /tmp/torchtune/dev_8B/full_experimental # /tmp may be deleted by your system. Change it to your preference.
Contributor: What is this recipe? Shouldn't it be under a dev/model structure?

Contributor: This is the dev recipe for selective activation checkpointing. I think we should decide what we want to do with this feature (either integrate it by default or scrap it, because I don't like that we currently expose two different AC APIs). I think it still provides parity with vanilla AC, so we could just turn it on everywhere given requests like #2101.

Contributor: Created #2114 to continue the discussion there.


# Tokenizer
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
@@ -42,7 +44,7 @@ checkpointer:
consolidated.00.pth
]
recipe_checkpoint: null
-output_dir: /tmp/Meta-Llama-3-8B/
+output_dir: ${output_dir}
model_type: LLAMA3
resume_from_checkpoint: False

@@ -57,8 +59,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
Contributor: Not sure what makes this experimental, but did you double check that gradient_acc and compile work with this?

Contributor (author): No, but this change was just a comment change.

+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -77,11 +79,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-llama3-finetune
+log_dir: ${output_dir}/logs
log_every_n_steps: null
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
12 changes: 7 additions & 5 deletions recipes/configs/gemma/2B_full.yaml
@@ -16,6 +16,8 @@
# This config works only when the model is being fine-tuned on 2+ GPUs.


+output_dir: /tmp/torchtune/gemma_2B/full # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -40,7 +42,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -54,8 +56,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
@@ -71,11 +73,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-finetune
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
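`optimizer_in_bwd` (False here, True in the low-memory code_llama2 config above) fuses the optimizer step into the backward pass: each parameter is updated as soon as its gradient is ready, and the gradient is freed immediately, so full-model gradients never coexist in memory. That is also why it requires `gradient_accumulation_steps=1`: gradients are consumed on the spot instead of being accumulated. A minimal sketch of the mechanism using PyTorch's post-accumulate-grad hooks (available since PyTorch 2.1); this illustrates the technique and is not necessarily torchtune's exact implementation:

```python
import torch
from torch import nn, optim

model = nn.Linear(16, 4)  # hypothetical stand-in

# One single-parameter optimizer per tensor, stepped from inside backward().
opts = {p: optim.AdamW([p], lr=2e-5) for p in model.parameters()}

def step_now(param: torch.Tensor) -> None:
    opts[param].step()
    opts[param].zero_grad()  # frees the gradient right away

for p in model.parameters():
    p.register_post_accumulate_grad_hook(step_now)

out = model(torch.randn(2, 16))
out.sum().backward()  # parameters update during this call; no optimizer.step() needed after
```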
12 changes: 7 additions & 5 deletions recipes/configs/gemma/2B_lora.yaml
@@ -15,6 +15,8 @@
#
# This config works only when the model is being fine-tuned on 2+ GPUs.

+output_dir: /tmp/torchtune/gemma_2B/lora # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -66,8 +68,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -82,11 +84,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-lora
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
11 changes: 6 additions & 5 deletions recipes/configs/gemma/2B_lora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/gemma_2B/lora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -65,8 +67,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 8 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -81,8 +83,7 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-lora
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

11 changes: 6 additions & 5 deletions recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/gemma_2B/qlora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -65,8 +67,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 8 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
Expand All @@ -81,8 +83,7 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-lora
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

12 changes: 7 additions & 5 deletions recipes/configs/gemma/7B_full.yaml
@@ -16,6 +16,8 @@
# This config works only when the model is being fine-tuned on 2+ GPUs.


+output_dir: /tmp/torchtune/gemma_7B/full # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -42,7 +44,7 @@ checkpointer:
model-00004-of-00004.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-7b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -56,8 +58,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
@@ -73,11 +75,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-finetune
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler