update configs #2107

Merged · 5 commits · Dec 6, 2024
12 changes: 7 additions & 5 deletions recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -19,6 +19,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/code_llama2_7B/full_low_memory # /tmp may be deleted by your system. Change it to your preference.

# Model arguments
model:
_component_: torchtune.models.code_llama2.code_llama2_7b
@@ -39,7 +41,7 @@ checkpointer:
pytorch_model-00003-of-00003.bin
]
recipe_checkpoint: null
-output_dir: /tmp/CodeLlama-7b-hf
+output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False

@@ -55,14 +57,14 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
+gradient_accumulation_steps: 1 # Use to increase effective batch size
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -73,13 +75,13 @@ enable_activation_offloading: True # True reduces memory
dtype: bf16

# Logging
-output_dir: /tmp/codellama_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: /tmp/CodeLlama-7b-hf/logs
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
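The change above is the pattern this PR applies to every config: a single top-level `output_dir` is declared once, and the checkpointer and metric logger reference it via `${output_dir}` interpolation (torchtune configs are parsed with OmegaConf, which resolves such references). A minimal sketch of how the resolution works, assuming only that `omegaconf` is installed:

```python
from omegaconf import OmegaConf

# Stripped-down stand-in for the updated config layout.
yaml_cfg = """
output_dir: /tmp/torchtune/code_llama2_7B/full_low_memory
checkpointer:
  output_dir: ${output_dir}
metric_logger:
  log_dir: ${output_dir}/logs
"""
cfg = OmegaConf.create(yaml_cfg)

# Both nested fields resolve against the single top-level value.
print(cfg.checkpointer.output_dir)  # /tmp/torchtune/code_llama2_7B/full_low_memory
print(cfg.metric_logger.log_dir)    # /tmp/torchtune/code_llama2_7B/full_low_memory/logs
```

One benefit: a single CLI override now redirects every artifact of a run, e.g. `tune run full_finetune_single_device --config code_llama2/7B_full_low_memory output_dir=~/runs/code_llama2` (the destination path here is illustrative).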
11 changes: 6 additions & 5 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/code_llama2_7B/lora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.code_llama2.lora_code_llama2_7b
@@ -42,7 +44,7 @@ checkpointer:
]
adapter_checkpoint: null
recipe_checkpoint: null
-output_dir: /tmp/CodeLlama-7b-hf
+output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -59,7 +61,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
+gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -70,7 +72,7 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -81,10 +83,9 @@ enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
-output_dir: /tmp/codellama_lora_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: /tmp/CodeLlama-7b-hf/logs
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

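A note on the recurring comment fix from "virtual" to "effective" batch size: gradients are accumulated across `gradient_accumulation_steps` micro-batches before each optimizer step, so the effective batch size is `batch_size * gradient_accumulation_steps` (2 * 8 = 16 for this config) at the cost of more steps, not more memory. A self-contained sketch of the idea; all names here are hypothetical stand-ins for what the recipe actually wires up:

```python
import torch
from torch import nn, optim

# Hypothetical stand-ins for the recipe's model, loss, optimizer, and data.
model = nn.Linear(16, 4)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

batch_size = 2
gradient_accumulation_steps = 8  # effective batch size = 2 * 8 = 16

fake_loader = [
    (torch.randn(batch_size, 16), torch.randint(0, 4, (batch_size,)))
    for _ in range(16)
]

for step, (inputs, labels) in enumerate(fake_loader):
    loss = loss_fn(model(inputs), labels)
    # Scale so the accumulated gradient matches one 16-sample batch.
    (loss / gradient_accumulation_steps).backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```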
11 changes: 6 additions & 5 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/code_llama2_7B/qlora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.code_llama2.qlora_code_llama2_7b
@@ -42,7 +44,7 @@ checkpointer:
]
adapter_checkpoint: null
recipe_checkpoint: null
-output_dir: /tmp/CodeLlama-7b-hf
+output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -58,7 +60,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
+gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -69,7 +71,7 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -80,10 +82,9 @@ enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
-output_dir: /tmp/codellama_qlora_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: /tmp/CodeLlama-7b-hf/logs
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

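The new `compile` comment is also more precise: the flag asks the recipe to `torch.compile` both the model and the loss, rather than "pytorch compile" in some vague sense. A rough, self-contained sketch of what enabling it means, with stand-in model and loss rather than the recipe's actual wiring:

```python
import torch
from torch import nn

# Stand-ins; the recipe compiles its own model and chunked-CE loss.
model = nn.Linear(16, 4)
loss_fn = nn.CrossEntropyLoss()

model = torch.compile(model)      # compiled forward pass
loss_fn = torch.compile(loss_fn)  # the loss is a module, so it compiles too

x = torch.randn(2, 16)
target = torch.randint(0, 4, (2,))
loss = loss_fn(model(x), target)  # first call triggers compilation
```

The first call pays a one-time compilation cost; subsequent steps reuse the compiled graph, which is where the speed and memory wins come from.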
12 changes: 7 additions & 5 deletions recipes/configs/dev/8B_full_experimental.yaml
@@ -18,6 +18,8 @@
# best to use 8B_full_single_device.yaml for those cases


+output_dir: /tmp/torchtune/dev_8B/full_experimental # /tmp may be deleted by your system. Change it to your preference.
Contributor: What is this recipe? Shouldn't it be under a dev/model structure?

Contributor: This is the dev recipe for selective activation checkpointing. I think we should decide what we want to do with this feature (either integrate it by default or scrap it, because I don't like that we currently expose two different AC APIs). I think it still provides parity with vanilla AC, so we could just turn it on everywhere given requests like #2101.

Contributor: Created #2114 to continue the discussion there.


# Tokenizer
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
@@ -42,7 +44,7 @@ checkpointer:
consolidated.00.pth
]
recipe_checkpoint: null
-output_dir: /tmp/Meta-Llama-3-8B/
+output_dir: ${output_dir}
model_type: LLAMA3
resume_from_checkpoint: False

@@ -57,8 +59,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
Contributor: Not sure what makes this experimental, but did you double check that gradient_acc and compile work with this?

Contributor (author): No, but this change was just a comment change.

+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -77,11 +79,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-llama3-finetune
+log_dir: ${output_dir}/logs
log_every_n_steps: null
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
12 changes: 7 additions & 5 deletions recipes/configs/gemma/2B_full.yaml
@@ -16,6 +16,8 @@
# This config works only when the model is being fine-tuned on 2+ GPUs.


+output_dir: /tmp/torchtune/gemma_2B/full # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -40,7 +42,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -54,8 +56,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
@@ -71,11 +73,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-finetune
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
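`optimizer_in_bwd` (False here, True in the low-memory code_llama2 config above) fuses the optimizer step into the backward pass: each parameter is updated as soon as its gradient is ready, and the gradient is freed immediately, so full-model gradients never coexist in memory. That is also why it requires `gradient_accumulation_steps=1`: gradients are consumed on the spot instead of being accumulated. A minimal sketch of the mechanism using PyTorch's post-accumulate-grad hooks (available since PyTorch 2.1); this illustrates the technique and is not necessarily torchtune's exact implementation:

```python
import torch
from torch import nn, optim

model = nn.Linear(16, 4)  # hypothetical stand-in

# One single-parameter optimizer per tensor, stepped from inside backward().
opts = {p: optim.AdamW([p], lr=2e-5) for p in model.parameters()}

def step_now(param: torch.Tensor) -> None:
    opts[param].step()
    opts[param].zero_grad()  # frees the gradient right away

for p in model.parameters():
    p.register_post_accumulate_grad_hook(step_now)

out = model(torch.randn(2, 16))
out.sum().backward()  # parameters update during this call; no optimizer.step() needed after
```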
12 changes: 7 additions & 5 deletions recipes/configs/gemma/2B_lora.yaml
@@ -15,6 +15,8 @@
#
# This config works only when the model is being fine-tuned on 2+ GPUs.

+output_dir: /tmp/torchtune/gemma_2B/lora # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -66,8 +68,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -82,11 +84,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-lora
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
11 changes: 6 additions & 5 deletions recipes/configs/gemma/2B_lora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/gemma_2B/lora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -65,8 +67,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 8 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -81,8 +83,7 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-lora
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

11 changes: 6 additions & 5 deletions recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/gemma_2B/qlora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-2b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -65,8 +67,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 8 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
Expand All @@ -81,8 +83,7 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-lora
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

12 changes: 7 additions & 5 deletions recipes/configs/gemma/7B_full.yaml
@@ -16,6 +16,8 @@
# This config works only when the model is being fine-tuned on 2+ GPUs.


+output_dir: /tmp/torchtune/gemma_7B/full # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -42,7 +44,7 @@ checkpointer:
model-00004-of-00004.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/gemma-7b
+output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -56,8 +58,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
@@ -73,11 +75,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/alpaca-gemma-finetune
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler