From d5bdba2be0dd2fc8879f5b578efa429bc871a160 Mon Sep 17 00:00:00 2001
From: rickard
Date: Mon, 6 Jan 2025 09:46:55 +0100
Subject: [PATCH 1/3] settings for fp8 training

---
 config/config_categories.yaml |  7 ++++---
 config/config_template.yaml   | 12 +++++++-----
 pyproject.toml                |  2 +-
 run_trainer.py                | 18 +++++++++++-------
 tabs/general_tab.py           |  2 +-
 tabs/prepare_tab.py           |  2 +-
 tabs/tab.py                   |  4 ++--
 tabs/training_tab.py          |  2 +-
 tabs/training_tab_legacy.py   |  2 +-
 trainer_config_validator.py   |  1 -
 10 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/config/config_categories.yaml b/config/config_categories.yaml
index 0075b06..bb42e5b 100644
--- a/config/config_categories.yaml
+++ b/config/config_categories.yaml
@@ -1,5 +1,6 @@
-Dataset: data_root, video_column, caption_column, id_token, video_resolution_buckets, caption_dropout_p
-Training: training_type, seed, mixed_precision, train_steps, rank, lora_alpha, target_modules, gradient_accumulation_steps, checkpointing_steps, checkpointing_limit, enable_slicing, enable_tiling, batch_size
+Dataset: data_root, video_column, caption_column, id_token, video_resolution_buckets, caption_dropout_p, precompute_conditions
+Training: training_type, seed, train_steps, rank, lora_alpha, target_modules, gradient_accumulation_steps, checkpointing_steps, checkpointing_limit, enable_slicing, enable_tiling, batch_size
 Optimizer: optimizer, lr, beta1, beta2, epsilon, weight_decay, max_grad_norm, lr_scheduler, lr_num_cycles, lr_warmup_steps
 Validation: validation_steps, validation_epochs, num_validation_videos, validation_prompts, validation_prompt_separator
-Accelerate: gpu_ids, nccl_timeout, gradient_checkpointing, allow_tf32, dataloader_num_workers, report_to, accelerate_config
\ No newline at end of file
+Accelerate: gpu_ids, nccl_timeout, gradient_checkpointing, allow_tf32, dataloader_num_workers, report_to, accelerate_config
+Model: model_name, pretrained_model_name_or_path, text_encoder_dtype, text_encoder_2_dtype, text_encoder_3_dtype, vae_dtype, layerwise_upcasting_modules, layerwise_upcasting_storage_dtype, layerwise_upcasting_granularity
\ No newline at end of file
diff --git a/config/config_template.yaml b/config/config_template.yaml
index 87414ac..f518767 100644
--- a/config/config_template.yaml
+++ b/config/config_template.yaml
@@ -18,13 +18,15 @@ gpu_ids: '0'
 gradient_accumulation_steps: 4
 gradient_checkpointing: true
 id_token: afkx
+layerwise_upcasting_modules: [none, transformer]
+layerwise_upcasting_granularity: [pytorch_layer, diffusers_layer]
+layerwise_upcasting_storage_dtype: [float8_e4m3fn, float8_e5m2]
 lora_alpha: 128
 lr: 0.0001
 lr_num_cycles: 1
 lr_scheduler: ['linear', 'cosine', 'cosine_with_restarts', 'polynomial', 'constant', 'constant_with_warmup']
 lr_warmup_steps: 400
 max_grad_norm: 1.0
-mixed_precision: [bf16, fp16, 'no']
 model_name: ltx_video
 nccl_timeout: 1800
 num_validation_videos: 0
@@ -37,14 +39,14 @@ rank: 128
 report_to: none
 seed: 42
 target_modules: to_q to_k to_v to_out.0
-text_encoder_dtype: [bf16, fp16, fp32]
-text_encoder_2_dtype: [bf16, fp16, fp32]
-text_encoder_3_dtype: [bf16, fp16, fp32]
+text_encoder_dtype: [bf16, fp16, fp32, fp8]
+text_encoder_2_dtype: [bf16, fp16, fp32, fp8]
+text_encoder_3_dtype: [bf16, fp16, fp32, fp8]
 tracker_name: finetrainers
 train_steps: 3000
 training_type: lora
 use_8bit_bnb: false
-vae_dtype: [bf16, fp16, fp32]
+vae_dtype: [bf16, fp16, fp32, fp8]
 validation_epochs: 0
 validation_prompt_separator: ':::'
 validation_prompts: ''
diff --git a/pyproject.toml b/pyproject.toml
index fa8753a..4a4632a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "finetrainers-ui"
-version = "0.8.0"
+version = "0.10.0"
 dependencies = [
     "gradio",
     "torch>=2.4.1"
diff --git a/run_trainer.py b/run_trainer.py
index 8b32081..ed89d02 100644
--- a/run_trainer.py
+++ b/run_trainer.py
@@ -20,7 +20,16 @@ def run(self, config: Config, finetrainers_path: str, log_file: str):
 
         # Model arguments
         model_cmd = f"--model_name {config.get('model_name')} \
-            --pretrained_model_name_or_path {config.get('pretrained_model_name_or_path')}"
+            --pretrained_model_name_or_path {config.get('pretrained_model_name_or_path')} \
+            --text_encoder_dtype {config.get('text_encoder_dtype')} \
+            --text_encoder_2_dtype {config.get('text_encoder_2_dtype')} \
+            --text_encoder_3_dtype {config.get('text_encoder_3_dtype')} \
+            --vae_dtype {config.get('vae_dtype')} "
+
+        if config.get('layerwise_upcasting_modules') != 'none':
+            model_cmd += f"--layerwise_upcasting_modules {config.get('layerwise_upcasting_modules')} \
+            --layerwise_upcasting_storage_dtype {config.get('layerwise_upcasting_storage_dtype')} \
+            --layerwise_upcasting_granularity {config.get('layerwise_upcasting_granularity')} "
 
         # Dataset arguments
         dataset_cmd = f"--data_root {config.get('data_root')} \
@@ -30,11 +39,7 @@ def run(self, config: Config, finetrainers_path: str, log_file: str):
             --video_resolution_buckets {config.get('video_resolution_buckets')} \
             --caption_dropout_p {config.get('caption_dropout_p')} \
             --caption_dropout_technique {config.get('caption_dropout_technique')} \
-            {'--precompute_conditions' if config.get('precompute_conditions') else ''} \
-            --text_encoder_dtype {config.get('text_encoder_dtype')} \
-            --text_encoder_2_dtype {config.get('text_encoder_2_dtype')} \
-            --text_encoder_3_dtype {config.get('text_encoder_3_dtype')} \
-            --vae_dtype {config.get('vae_dtype')} "
+            {'--precompute_conditions' if config.get('precompute_conditions') else ''} "
 
         # Dataloader arguments
         dataloader_cmd = f"--dataloader_num_workers {config.get('dataloader_num_workers')}"
@@ -45,7 +50,6 @@ def run(self, config: Config, finetrainers_path: str, log_file: str):
         # Training arguments
         training_cmd = f"--training_type {config.get('training_type')} \
             --seed {config.get('seed')} \
-            --mixed_precision {config.get('mixed_precision')} \
             --batch_size {config.get('batch_size')} \
             --train_steps {config.get('train_steps')} \
             --rank {config.get('rank')} \
diff --git a/tabs/general_tab.py b/tabs/general_tab.py
index 842268f..8e47736 100644
--- a/tabs/general_tab.py
+++ b/tabs/general_tab.py
@@ -15,7 +15,7 @@ def __init__(self, title, config_file_path, allow_load=False):
 
         try:
             with self.settings_column:
-                inputs = self.update_form(self.config)
+                inputs = self.update_form()
                 self.components = OrderedDict(inputs)
                 children = []
                 for child in self.settings_column.children:
diff --git a/tabs/prepare_tab.py b/tabs/prepare_tab.py
index 8fd882a..e45bab7 100644
--- a/tabs/prepare_tab.py
+++ b/tabs/prepare_tab.py
@@ -20,7 +20,7 @@ def __init__(self, title, config_file_path, allow_load=False):
 
         try:
             with self.settings_column:
-                self.components = OrderedDict(self.update_form(self.config))
+                self.components = OrderedDict(self.update_form())
                 for i in range(len(self.settings_column.children)):
                     keys = list(self.components.keys())
                     properties[keys[i]] = self.settings_column.children[i]
diff --git a/tabs/tab.py b/tabs/tab.py
index 9bc030f..3d798b9 100644
--- a/tabs/tab.py
+++ b/tabs/tab.py
@@ -71,10 +71,10 @@ def add_buttons(self):
             outputs=[self.save_status, self.config_file_box, *self.get_properties().values()]
         )
 
-    def update_form(self, config):
+    def update_form(self):
         inputs = dict()
 
-        for key, value in config.items():
+        for key, value in self.config.items():
             category = 'Other'
             for categories in self.config_categories.keys():
                 if key in self.config_categories[categories]:
diff --git a/tabs/training_tab.py b/tabs/training_tab.py
index 74c6d23..904cec2 100644
--- a/tabs/training_tab.py
+++ b/tabs/training_tab.py
@@ -30,7 +30,7 @@ def __init__(self, title, config_file_path, allow_load=False):
 
         try:
             with self.settings_column:
-                inputs = self.update_form(self.config)
+                inputs = self.update_form()
                 self.components = OrderedDict(inputs)
                 children = []
                 for child in self.settings_column.children:
diff --git a/tabs/training_tab_legacy.py b/tabs/training_tab_legacy.py
index f754ba4..6b65b99 100644
--- a/tabs/training_tab_legacy.py
+++ b/tabs/training_tab_legacy.py
@@ -17,7 +17,7 @@ def __init__(self, title, config_file_path, allow_load=False):
 
         try:
             with self.settings_column:
-                self.components = OrderedDict(self.update_form(self.config))
+                self.components = OrderedDict(self.update_form())
                 for i in range(len(self.settings_column.children)):
                     keys = list(self.components.keys())
                     properties[keys[i]] = self.settings_column.children[i]
diff --git a/trainer_config_validator.py b/trainer_config_validator.py
index dbaa2cf..fe4492e 100644
--- a/trainer_config_validator.py
+++ b/trainer_config_validator.py
@@ -29,7 +29,6 @@ def validate(self):
             'lr_scheduler',
             'lr_warmup_steps',
             'max_grad_norm',
-            'mixed_precision',
             'model_name',
             'nccl_timeout',
             'optimizer',

From e03b546bf4c99eeb561d4a8dd021e643dd830134 Mon Sep 17 00:00:00 2001
From: rickard
Date: Wed, 15 Jan 2025 21:28:49 +0100
Subject: [PATCH 2/3] update settings

---
 config/config_template.yaml | 3 ++-
 run_trainer.py              | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/config/config_template.yaml b/config/config_template.yaml
index c2040ed..32e66b4 100644
--- a/config/config_template.yaml
+++ b/config/config_template.yaml
@@ -21,7 +21,7 @@ gradient_accumulation_steps: 4
 gradient_checkpointing: true
 id_token: afkx
 layerwise_upcasting_modules: [none, transformer]
-layerwise_upcasting_granularity: [pytorch_layer, diffusers_layer]
+layerwise_upcasting_skip_modules_pattern: 'patch_embed pos_embed x_embedder context_embedder ^proj_in$ ^proj_out$ norm'
 layerwise_upcasting_storage_dtype: [float8_e4m3fn, float8_e5m2]
 image_resolution_buckets: 512x768
 lora_alpha: 128
@@ -47,6 +47,7 @@ text_encoder_dtype: [bf16, fp16, fp32, fp8]
 text_encoder_2_dtype: [bf16, fp16, fp32, fp8]
 text_encoder_3_dtype: [bf16, fp16, fp32, fp8]
 tracker_name: finetrainers
+transformer_dtype: [bf16, fp16, fp32, fp8]
 train_steps: 3000
 training_type: lora
 use_8bit_bnb: false
diff --git a/run_trainer.py b/run_trainer.py
index 2575cb8..e4cc7d6 100644
--- a/run_trainer.py
+++ b/run_trainer.py
@@ -29,7 +29,7 @@ def run(self, config: Config, finetrainers_path: str, log_file: str):
         if config.get('layerwise_upcasting_modules') != 'none':
             model_cmd +=["--layerwise_upcasting_modules", config.get('layerwise_upcasting_modules'),
                 "--layerwise_upcasting_storage_dtype", config.get('layerwise_upcasting_storage_dtype'),
-                "--layerwise_upcasting_granularity", config.get('layerwise_upcasting_granularity')]
+                "--layerwise_upcasting_skip_modules_pattern", config.get('layerwise_upcasting_skip_modules_pattern')]
 
         dataset_cmd = ["--data_root", config.get('data_root'),
             "--video_column", config.get('video_column'),
@@ -45,6 +45,7 @@ def run(self, config: Config, finetrainers_path: str, log_file: str):
             "--text_encoder_2_dtype", config.get('text_encoder_2_dtype'),
             "--text_encoder_3_dtype", config.get('text_encoder_3_dtype'),
             "--vae_dtype", config.get('vae_dtype'),
+            "--transformer_dtype", config.get('transformer_dtype'),
             '--precompute_conditions' if config.get('precompute_conditions') else '']
         if config.get('dataset_file'):
             dataset_cmd += ["--dataset_file", config.get('dataset_file')]
@@ -56,7 +57,6 @@ def run(self, config: Config, finetrainers_path: str, log_file: str):
 
         training_cmd = ["--training_type", config.get('training_type'),
             "--seed", config.get('seed'),
-            "--mixed_precision", config.get('mixed_precision'),
             "--batch_size", config.get('batch_size'),
             "--train_steps", config.get('train_steps'),
             "--rank", config.get('rank'),

From 2ef40893ba53086b68341506bdd72c03da462f68 Mon Sep 17 00:00:00 2001
From: rickard
Date: Thu, 16 Jan 2025 19:47:11 +0100
Subject: [PATCH 3/3] update config_template

---
 config/config_template.yaml | 10 +++++-----
 tabs/tab.py                 |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/config/config_template.yaml b/config/config_template.yaml
index 32e66b4..3982058 100644
--- a/config/config_template.yaml
+++ b/config/config_template.yaml
@@ -43,15 +43,15 @@ report_to: none
 resume_from_checkpoint: ''
 seed: 42
 target_modules: to_q to_k to_v to_out.0
-text_encoder_dtype: [bf16, fp16, fp32, fp8]
-text_encoder_2_dtype: [bf16, fp16, fp32, fp8]
-text_encoder_3_dtype: [bf16, fp16, fp32, fp8]
+text_encoder_dtype: [bf16, fp16, fp32]
+text_encoder_2_dtype: [bf16, fp16, fp32]
+text_encoder_3_dtype: [bf16, fp16, fp32]
 tracker_name: finetrainers
-transformer_dtype: [bf16, fp16, fp32, fp8]
+transformer_dtype: [bf16, fp16, fp32]
 train_steps: 3000
 training_type: lora
 use_8bit_bnb: false
-vae_dtype: [bf16, fp16, fp32, fp8]
+vae_dtype: [bf16, fp16, fp32]
 validation_epochs: 0
 validation_prompt_separator: ':::'
 validation_prompts: ''
diff --git a/tabs/tab.py b/tabs/tab.py
index 3d798b9..6d71814 100644
--- a/tabs/tab.py
+++ b/tabs/tab.py
@@ -114,6 +114,6 @@ def update_properties(self, *args):
                 properties_values[index] = value
                 #properties[key].value = value
 
-            return ["Config loaded. Edit below:", config_file_box, *properties_values]
+            return ["Config loaded.", config_file_box, *properties_values]
         except Exception as e:
             return [f"Error loading config: {e}", config_file_box, *properties_values]
\ No newline at end of file
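
Note (not part of the patches above): a minimal standalone sketch of how the list-based command assembly in run_trainer.py is expected to behave after PATCH 2/3. The config dict below and the final print are illustrative stand-ins for the real Config object and the actual trainer launch; the pretrained model path is an assumed example value.

# sketch.py -- illustrative only, assumes the flag names introduced by these patches
config = {
    'model_name': 'ltx_video',
    'pretrained_model_name_or_path': 'Lightricks/LTX-Video',  # assumed example value
    'text_encoder_dtype': 'bf16',
    'text_encoder_2_dtype': 'bf16',
    'text_encoder_3_dtype': 'bf16',
    'vae_dtype': 'bf16',
    'transformer_dtype': 'bf16',
    'layerwise_upcasting_modules': 'transformer',
    'layerwise_upcasting_storage_dtype': 'float8_e4m3fn',
    'layerwise_upcasting_skip_modules_pattern':
        'patch_embed pos_embed x_embedder context_embedder ^proj_in$ ^proj_out$ norm',
}

# Per-component dtypes are always passed through to the finetrainers CLI.
model_cmd = ["--model_name", config.get('model_name'),
             "--pretrained_model_name_or_path", config.get('pretrained_model_name_or_path'),
             "--text_encoder_dtype", config.get('text_encoder_dtype'),
             "--text_encoder_2_dtype", config.get('text_encoder_2_dtype'),
             "--text_encoder_3_dtype", config.get('text_encoder_3_dtype'),
             "--vae_dtype", config.get('vae_dtype'),
             "--transformer_dtype", config.get('transformer_dtype')]

# fp8 layerwise upcasting is only requested when a module list is selected;
# choosing 'none' keeps these flags off the command line entirely.
if config.get('layerwise_upcasting_modules') != 'none':
    model_cmd += ["--layerwise_upcasting_modules", config.get('layerwise_upcasting_modules'),
                  "--layerwise_upcasting_storage_dtype", config.get('layerwise_upcasting_storage_dtype'),
                  "--layerwise_upcasting_skip_modules_pattern", config.get('layerwise_upcasting_skip_modules_pattern')]

print(" ".join(model_cmd))

Keeping the upcasting flags conditional mirrors the patch: the UI exposes 'none' as a selectable value, and omitting the flags lets finetrainers fall back to its own defaults instead of receiving an explicit but meaningless module list.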