# Config for single device full finetuning in full_finetune_single_device.py
# using a Llama3.1 8B Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth"
#
# The stock torchtune config uses an optimizer from bitsandbytes; this config
# uses torch.optim.AdamW instead, so bitsandbytes is only needed if you switch
# the optimizer back. You can install it with
#   pip install bitsandbytes
#
# To launch on a single device, run the following command from root:
#   tune run full_finetune_single_device --config llama3_1/8B_full_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
#   tune run full_finetune_single_device --config llama3_1/8B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.


# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: ${oc.env:ARTIFACT_LOCATION}/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model
  max_seq_len: 3200

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  packed: True  # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.llama3_1.llama3_1_8b

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ${oc.env:ARTIFACT_LOCATION}/Meta-Llama-3.1-8B-Instruct
  checkpoint_files: [
    model-00001-of-00004.safetensors,
    model-00002-of-00004.safetensors,
    model-00003-of-00004.safetensors,
    model-00004-of-00004.safetensors
  ]
  recipe_checkpoint: null
  output_dir: ${oc.env:ARTIFACT_LOCATION}/${oc.env:EXPERIMENT_NAME}_${oc.env:RUN_NAME}/checkpoints
  model_type: LLAMA3
resume_from_checkpoint: False

optimizer:
  _component_: torch.optim.AdamW
  fused: True
  lr: 2e-5

lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 10

loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Fine-tuning arguments
batch_size: 4
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1  # Use to increase virtual batch size
optimizer_in_bwd: True  # True saves memory. Requires gradient_accumulation_steps=1
compile: True  # pytorch compile, set to True for better perf/memory

# Training environment
device: cuda

# Memory management
enable_activation_checkpointing: True  # True reduces memory
enable_activation_offloading: True  # True reduces memory

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  # _component_: torchtune.training.metric_logging.DiskLogger
  _component_: utils.MLflowLogger
  log_dir: ${output_dir}
  # Only used for MLflow
  experiment_name: ${oc.env:EXPERIMENT_NAME}  # Set Experiment Name
  run_name: ${oc.env:RUN_NAME}  # Set Run Name
  autolog: True  # Set to False to disable autologging
output_dir: ${oc.env:ARTIFACT_LOCATION}/${oc.env:EXPERIMENT_NAME}_${oc.env:RUN_NAME}
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: True

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: True
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
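

# Example launch (a sketch, not part of the stock torchtune config): the
# ${oc.env:...} resolvers above require ARTIFACT_LOCATION, EXPERIMENT_NAME and
# RUN_NAME to be exported before launching, and the custom utils.MLflowLogger
# module must be importable from the working directory. The values and the
# config filename below are hypothetical placeholders.
#
#   export ARTIFACT_LOCATION=/tmp
#   export EXPERIMENT_NAME=llama3_1_8b_full
#   export RUN_NAME=run_001
#   tune run full_finetune_single_device --config ./8B_full_single_device.yaml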