exp 2 large model configs

jack89roberts committed Aug 16, 2024
1 parent c5ed520 commit b118080

Showing 8 changed files with 165 additions and 17 deletions.
@@ -2,42 +2,37 @@
combinations:
data_config:
- gen_tofu_rel_1
- gen_tofu_rel_2
- gen_tofu_rel_3
- gen_tofu_rel_4
- gen_tofu_rel_5
- gen_tofu_rel_6

train_config:
- longer
- default

forget_config:
- [ascent, shorter]
- [difference, default]
- [idk, default]
- [difference, shorter]
- [idk, shorter]
- [kl, shorter]

seed:
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49

-model_config: gpt2
+model_config: Meta-Llama-3.1-8B-Instruct

# Full data config: which dataset to use to build full model to do forgetting on
full_data_config: gen_tofu_full

# Baskerville kwargs
use_bask: true
model_cache_dir: /bask/projects/v/vjgo8416-sltv-forget/caches/models
data_cache_dir: /bask/projects/v/vjgo8416-sltv-forget/caches/datasets
wandb_cache_dir: /bask/projects/v/vjgo8416-sltv-forget/caches/wandb

bask:
-walltime: '0-5:0:0'
+walltime: '0-12:0:0'
gpu_number: 1
node_number: 1

46 changes: 46 additions & 0 deletions configs/experiment/experiment_2_relationships_phi.yaml
@@ -0,0 +1,46 @@
# Combinations to build runs over
combinations:
data_config:
- gen_tofu_rel_1
- gen_tofu_rel_3
- gen_tofu_rel_6

train_config:
- default

forget_config:
- [difference, accumulate]
- [idk, default]
- [difference, shorter_accumulate]
- [idk, shorter]

seed:
- 40
- 41
- 42
- 43

model_config: Phi-3-mini-4k-instruct

# Full data config: which dataset to use to build full model to do forgetting on
full_data_config: gen_tofu_full

# Baskerville kwargs
use_bask: true
model_cache_dir: /bask/projects/v/vjgo8416-sltv-forget/caches/models
data_cache_dir: /bask/projects/v/vjgo8416-sltv-forget/caches/datasets
wandb_cache_dir: /bask/projects/v/vjgo8416-sltv-forget/caches/wandb

bask:
walltime: '0-12:0:0'
gpu_number: 1
node_number: 1

# Wandb kwargs
wandb_kwargs:
use_wandb: true
wandb_config:
entity: turing-arc
project: selective-forgetting
log_model: "false"
group: experiment-2-relationship
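
For orientation, the combinations block in these experiment configs defines a Cartesian product over data_config, train_config, forget_config and seed. A minimal sketch of how a launcher could expand it into individual runs; the expansion code below is illustrative, not this repository's actual launcher:

import itertools
import yaml

with open("configs/experiment/experiment_2_relationships_phi.yaml") as f:
    cfg = yaml.safe_load(f)

combos = cfg["combinations"]
runs = [
    {"data_config": d, "train_config": t, "forget_config": fc, "seed": s}
    for d, t, fc, s in itertools.product(
        combos["data_config"],
        combos["train_config"],
        combos["forget_config"],
        combos["seed"],
    )
]
print(len(runs))  # 3 datasets x 1 train config x 4 forget configs x 4 seeds = 48 runs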
@@ -4,7 +4,7 @@ trainer_kwargs: # passed to TrainingArguments

# Batch size
per_device_train_batch_size: 8
-per_device_eval_batch_size: 8
+per_device_eval_batch_size: 16
gradient_accumulation_steps: 1

# Core hyperparameters
@@ -0,0 +1,35 @@
trainer_kwargs: # passed to TrainingArguments
# Memory optimization
bf16: True

# Batch size
per_device_train_batch_size: 8
per_device_eval_batch_size: 16
gradient_accumulation_steps: 1

# Core hyperparameters
learning_rate: 1.e-4
num_train_epochs: 5
weight_decay: 0.01
warmup_ratio: 0.2 # 0.2 for 1 epoch (when running for 5 epochs)

# Evaluation
eval_strategy: steps
eval_steps: 0.2

# Logging
logging_strategy: steps
logging_steps: 0.2


# Early stopping
save_strategy: epoch
save_total_limit: 1

# Outputs
output_dir: output

peft_kwargs: # passed to LoraConfig
r: 8
lora_alpha: 32
lora_dropout: 0.05
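
The comments in these hyperparameter files indicate where each section ends up: trainer_kwargs is forwarded to Hugging Face TrainingArguments and peft_kwargs to LoraConfig. A minimal sketch of that wiring, assuming a plain yaml.safe_load; the file path is a placeholder and the repository's training script may differ:

import yaml
from peft import LoraConfig
from transformers import TrainingArguments

with open("hyperparameters.yaml") as f:  # placeholder path for one of the files above
    hp = yaml.safe_load(f)

# Each YAML section unpacks directly into the corresponding Hugging Face config object.
training_args = TrainingArguments(**hp["trainer_kwargs"])  # bf16, batch sizes, LR schedule, eval/log/save cadence
lora_config = LoraConfig(**hp["peft_kwargs"])  # r, lora_alpha, lora_dropout (and target_modules where given)
print(training_args.learning_rate, lora_config.r)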
@@ -4,7 +4,7 @@ trainer_kwargs: # passed to TrainingArguments

# Batch size
per_device_train_batch_size: 8
-per_device_eval_batch_size: 16
+per_device_eval_batch_size: 32
gradient_accumulation_steps: 2

# Core hyperparameters
@@ -4,7 +4,7 @@ trainer_kwargs: # passed to TrainingArguments

# Batch size
per_device_train_batch_size: 16
-per_device_eval_batch_size: 16
+per_device_eval_batch_size: 32
gradient_accumulation_steps: 1

# Core hyperparameters
36 changes: 36 additions & 0 deletions configs/model/Phi-3-mini-4k-instruct/hyperparameters/shorter.yaml
@@ -0,0 +1,36 @@
trainer_kwargs: # passed to TrainingArguments
# Memory optimization
bf16: True

# Batch size
per_device_train_batch_size: 16
per_device_eval_batch_size: 32
gradient_accumulation_steps: 1

# Core hyperparameters
learning_rate: 1.e-4
num_train_epochs: 5
weight_decay: 0.01
warmup_ratio: 0.2 # 0.2 for 1 epoch (when running for 5 epochs)

# Evaluation
eval_strategy: steps
eval_steps: 0.2

# Logging
logging_strategy: steps
logging_steps: 0.2


# Early stopping
save_strategy: epoch
save_total_limit: 1

# Outputs
output_dir: output

peft_kwargs: # passed to LoraConfig
r: 8
lora_alpha: 8
lora_dropout: 0.05
target_modules: ["k_proj", "q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
@@ -0,0 +1,36 @@
trainer_kwargs: # passed to TrainingArguments
# Memory optimization
bf16: True

# Batch size
per_device_train_batch_size: 8
per_device_eval_batch_size: 32
gradient_accumulation_steps: 2

# Core hyperparameters
learning_rate: 1.e-4
num_train_epochs: 5
weight_decay: 0.01
warmup_ratio: 0.2 # 0.2 for 1 epoch (when running for 5 epochs)

# Evaluation
eval_strategy: steps
eval_steps: 0.2

# Logging
logging_strategy: steps
logging_steps: 0.2


# Early stopping
save_strategy: epoch
save_total_limit: 1

# Outputs
output_dir: output

peft_kwargs: # passed to LoraConfig
r: 8
lora_alpha: 8
lora_dropout: 0.05
target_modules: ["k_proj", "q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
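
These new configs also pin target_modules, attaching LoRA adapters to every attention and MLP projection by name rather than relying on PEFT's defaults. A minimal sketch of applying peft_kwargs like the ones above to a base model; the model id and task_type are assumptions, and the repository's training code may wire this differently:

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# The base model id is an assumption for illustration; the module names below match
# Llama-style attention (q/k/v/o_proj) and MLP (gate/up/down_proj) projections.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # not set in the YAML above; added so PEFT wraps a causal-LM model
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=["k_proj", "q_proj", "v_proj", "o_proj",
                    "gate_proj", "down_proj", "up_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA matrices are trainable; base weights stay frozen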
