Commit a96540c

do not land: local setup

Summary:
Test Plan:

1 parent 29e7250 · commit a96540c

5 files changed, +283 −18 lines changed

New file: DeepSpeed ZeRO stage 3 config with Adam optimizer (+48)

@@ -0,0 +1,48 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": "auto"
+    },
+    "optimizer": {
+        "type": "Adam",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupDecayLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto",
+            "total_num_steps": "auto"
+        }
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto"
+}
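How a config like this gets used: the "auto" entries are not valid DeepSpeed values on their own; the Hugging Face Trainer's DeepSpeed integration fills them in from TrainingArguments when the JSON path is passed as the deepspeed argument. A minimal sketch, assuming the file is saved under the ds_configs/ path that finetune.py below selects (the hyperparameter values shown are illustrative, not prescribed by this commit):

# Sketch only, not part of the commit. Assumes transformers and deepspeed are
# installed and that the JSON above is saved as ds_configs/ds_config_stage_3_adam.json,
# the path finetune.py picks when use_adam is set.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/tmp/llm_finetuning",
    per_device_train_batch_size=4,   # fills "train_micro_batch_size_per_gpu": "auto"
    gradient_accumulation_steps=1,   # fills "gradient_accumulation_steps": "auto"
    learning_rate=1e-5,              # fills the optimizer/scheduler "lr" fields
    bf16=True,                       # matches "bf16": {"enabled": "auto"}
    deepspeed="ds_configs/ds_config_stage_3_adam.json",
)
# The HF integration resolves every "auto" value from these arguments when the
# Trainer is constructed, so the JSON never needs to be edited by hand.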

blog/llm-finetuning-4/finetune.py (+148 −8)

@@ -1,15 +1,22 @@
+import functools
 import logging
 import os
+import random
 import sys
+from itertools import chain
+from typing import Dict
 
 import datasets
 import determined as det
 import evaluate
+import numpy as np
 import torch
 import transformers
+import wandb
+
 from determined.transformers import DetCallback
 from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model
-from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, get_linear_schedule_with_warmup
 from trl import DataCollatorForCompletionOnlyLM
 
 from chat_format import get_chat_format, get_response_template_ids, set_special_tokens
@@ -18,17 +25,31 @@
 logger = logging.getLogger(__name__)
 
 
-def get_tokenizer(model_name, model_commit_hash):
+def get_tokenizer(model_name, model_commit_hash, hparams):
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         padding_side="right",
         truncation_side="right",
         revision=model_commit_hash,
+        token=hparams["hf_token"],
     )
     set_special_tokens(tokenizer, model_name)
     return tokenizer
 
 
+def standardize_lora_init(lora_layer, alpha: int):
+    self_attn = lora_layer.self_attn
+    q_proj = self_attn.q_proj.lora_A.default
+    v_proj = self_attn.v_proj.lora_A.default
+    with torch.no_grad():
+        sd_q = q_proj.state_dict()
+        sd_q['weight'] = sd_q['weight'] / alpha
+        q_proj.load_state_dict(sd_q)
+        sd_v = v_proj.state_dict()
+        sd_v['weight'] = sd_v['weight'] / alpha
+        v_proj.load_state_dict(sd_v)
+
+
 def get_model_and_tokenizer(model_name, use_lora, hparams, inference=False, device_map="auto", model_commit_hash=None):
     if inference:
         if use_lora:
@@ -47,22 +68,55 @@ def get_model_and_tokenizer(model_name, use_lora, hparams, inference=False, device_map="auto", model_commit_hash=None):
         model_name,
         torch_dtype=torch.bfloat16,
         revision=model_commit_hash,
+        token=hparams["hf_token"],
     )
+    model.enable_input_require_grads()
 
     if use_lora:
         r = hparams["r"]
-        lora_alpha = r * hparams["lora_alpha_in_r"]
+        lora_alpha = hparams["lora_alpha"]
         peft_config = LoraConfig(
             task_type="CAUSAL_LM",
             inference_mode=False,
             r=r,
             lora_alpha=lora_alpha,
             lora_dropout=hparams["lora_dropout"],
+            use_rslora=hparams["use_rslora"]
         )
 
         model = get_peft_model(model, peft_config)
 
-    tokenizer = get_tokenizer(model_name, model_commit_hash=model_commit_hash)
+        lora_a = model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default
+        print("LoRA a at initialization, before rescaling, layer 0, q_proj:")
+        print(lora_a.state_dict())
+        lora_a = model.base_model.model.model.layers[31].self_attn.q_proj.lora_A.default
+        print("LoRA a at initialization, before rescaling, layer 31, q_proj:")
+        print(lora_a.state_dict())
+        lora_a = model.base_model.model.model.layers[0].self_attn.v_proj.lora_A.default
+        print("LoRA a at initialization, before rescaling, layer 0, v_proj:")
+        print(lora_a.state_dict())
+        lora_a = model.base_model.model.model.layers[31].self_attn.v_proj.lora_A.default
+        print("LoRA a at initialization, before rescaling, layer 31, v_proj:")
+        print(lora_a.state_dict())
+
+        if hparams["custom_scale_init"]:
+            for l in model.base_model.model.model.layers:
+                standardize_lora_init(l, lora_alpha)
+
+        lora_a = model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default
+        print("LoRA a at initialization, after rescaling, layer 0, q_proj:")
+        print(lora_a.state_dict())
+        lora_a = model.base_model.model.model.layers[31].self_attn.q_proj.lora_A.default
+        print("LoRA a at initialization, after rescaling, layer 31, q_proj:")
+        print(lora_a.state_dict())
+        lora_a = model.base_model.model.model.layers[0].self_attn.v_proj.lora_A.default
+        print("LoRA a at initialization, after rescaling, layer 0, v_proj:")
+        print(lora_a.state_dict())
+        lora_a = model.base_model.model.model.layers[31].self_attn.v_proj.lora_A.default
+        print("LoRA a at initialization, after rescaling, layer 31, v_proj:")
+        print(lora_a.state_dict())
+
+    tokenizer = get_tokenizer(model_name, model_commit_hash=model_commit_hash, hparams=hparams)
     return model, tokenizer
 
 
@@ -73,6 +127,23 @@ def fn(formatted):
     return fn
 
 
+def group_texts(examples, block_size) -> Dict:
+    # Concatenate all texts.
+    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder; we could pad instead if the model supported it.
+    # Customize this part to your needs.
+    if total_length >= block_size:
+        total_length = (total_length // block_size) * block_size
+    # Split into chunks of block_size.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+
 def preprocess_logits_for_metrics(logits, labels):
     if isinstance(logits, tuple):
         # Depending on the model and config, logits may contain extra tensors,
@@ -105,10 +176,18 @@ def tokenize(element):
         }
 
     dataset = load_or_create_dataset(hparams["dataset_subset"])
+    block_size = hparams["block_size"]
     column_names = list(dataset["train"].features)
     for k in dataset.keys():
         dataset[k] = dataset[k].map(tokenize, remove_columns=column_names)
-
+    if hparams["group_text"]:
+        with training_args.main_process_first(desc="grouping texts together", local=False):
+            dataset = dataset.map(
+                functools.partial(group_texts, block_size=block_size),
+                batched=True,
+                desc=f"Grouping texts in chunks of {block_size}",
+            )
+
     response_template_ids = get_response_template_ids(tokenizer, model_name)
     collator = DataCollatorForCompletionOnlyLM(
         response_template_ids, tokenizer=tokenizer
@@ -151,6 +230,18 @@ def compute_metrics(eval_preds):
 
     trainer.train()
 
+def set_seed(seed: int = 42) -> None:
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    # When running on the CuDNN backend, two further options must be set
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    # Set a fixed value for the hash seed
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    print(f"Random seed set as {seed}")
+
 
 if __name__ == "__main__":
     # Setup logging
@@ -169,12 +260,19 @@ def compute_metrics(eval_preds):
     hparams = info.trial.hparams
 
     if "hf_token" in hparams:
+        print("SWY logged flow triggered")
+        hf_token = hparams["hf_token"]
+        print(f"SWY token is {hf_token}")
         import huggingface_hub
-
         huggingface_hub.login(token=hparams["hf_token"])
 
     if hparams["training_args"]["deepspeed"]:
-        hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3.json"
+        if not hparams["use_adam"]:
+            hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3.json"
+            print("swy not using adam")
+        else:
+            hparams["training_args"]["deepspeed"] = "ds_configs/ds_config_stage_3_adam.json"
+            print("swy using adam")
 
     training_args = TrainingArguments(**hparams["training_args"])
     if training_args.deepspeed:
@@ -186,8 +284,50 @@ def compute_metrics(eval_preds):
         distributed = det.core.DistributedContext.from_deepspeed()
     else:
         distributed = det.core.DistributedContext.from_torch_distributed()
-
+
+    random_seed = 42
+
     with det.core.init(distributed=distributed) as core_context:
+        if core_context.distributed.rank == 0:
+            wandb.login(key=hparams["wandb_key"])
+            import uuid
+            # Generate a UUID
+            my_uuid = uuid.uuid4()
+            # Convert UUID to string
+            uuid_str = str(my_uuid)[:5]
+            r = hparams["r"]
+            lora_alpha = hparams["lora_alpha"]
+            lora_dropout = hparams["lora_dropout"]
+            dataset_subset = hparams["dataset_subset"]
+            lr = str(hparams["training_args"]["learning_rate"])
+            use_rslora = False
+            if "use_rslora" in hparams:
+                use_rslora = hparams["use_rslora"]
+            optimizer = "adamW"
+            if "use_adam" in hparams and hparams["use_adam"]:
+                optimizer = "adam"
+            run_name = f"test_lora_blog_{dataset_subset}_r_{r}_alpha_{lora_alpha}_dropout_{lora_dropout}_lr_{lr}_seed_{random_seed}_opt_{optimizer}"
+            if use_rslora:
+                run_name += "_rslora"
+            run_name += f"_{uuid_str}"
+            run = wandb.init(
+                project="lora-blog-v3",
+                name=run_name,
+                config={
+                    "r": hparams["r"],
+                    "lora_alpha": hparams["lora_alpha"],
+                    "dropout": hparams["lora_dropout"],
+                    "dataset_subset": hparams["dataset_subset"],
+                    "model": hparams["model"],
+                    "lr": lr,
+                    "seed": random_seed,
+                    "optimizer": optimizer,
+                    "use_rslora": use_rslora
+                }
+            )
+
+        set_seed(random_seed)
+
         det_callback = DetCallback(
             core_context,
             training_args,
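The LoRA-related additions above (lora_alpha as a standalone hyperparameter, use_rslora, custom_scale_init with standardize_lora_init) all touch the same scaling factor. A short, hedged illustration of that factor follows; the values are taken from the sweep in lora.yaml below, and the stated purpose of standardize_lora_init is an assumption rather than something documented in the commit:

# Sketch only, not part of the commit. In PEFT, a LoRA layer adds
# scaling * B(A x) to the frozen projection's output, with scaling = alpha / r
# by default and alpha / sqrt(r) when use_rslora=True (rank-stabilized LoRA).
import math

r, alpha = 8, 512                                # illustrative values from the grid below
print("standard LoRA scaling:", alpha / r)       # 64.0
print("rsLoRA scaling:", alpha / math.sqrt(r))   # ~181.02
# standardize_lora_init divides the freshly initialized lora_A weights of q_proj
# and v_proj by alpha; the apparent goal (an assumption) is to keep the initial
# update magnitude comparable while sweeping very large alpha values.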

blog/llm-finetuning-4/lora.yaml (+24 −10)

@@ -1,49 +1,63 @@
-name: mistral lora easy
+name: mistral lora hard
 debug: false
 environment:
   environment_variables:
     - NCCL_DEBUG=INFO
+    - NCCL_SOCKET_IFNAME=ens,eth,ib
   image:
     gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
     cpu: determinedai/environments:py-3.10-pytorch-2.0-cpu-03ae7d7
 resources:
   slots_per_trial: 2
+  resource_pool: a100
+workspace: swy_5
+project: lora-blog
 searcher:
   name: grid
   max_length:
-    batches: 5000
+    batches: 3000
   metric: eval_accuracy
   smaller_is_better: false
 hyperparameters:
   model: "mistralai/Mistral-7B-Instruct-v0.2"
   model_commit_hash: "99259002b41e116d28ccb2d04a9fbe22baed0c7f"
-  dataset_subset: "easy"
+  dataset_subset: "hard"
+  block_size: 2048
+  group_text: false
   lora: true
   # Tunable hyperparameters
   r:
     type: categorical
-    vals: [1, 2, 4, 8, 16, 32, 64]
-  lora_alpha_in_r:
+    vals: [2, 8, 32, 128]
+  lora_alpha:
     type: categorical
-    vals: [0.5, 1, 2]
+    vals: [0.5, 1, 2, 8, 32, 128, 256, 512]
   lora_dropout:
     type: categorical
     vals: [0.1]
   # End tunable hyperparameters
+  hf_token: <HF_TOKEN>
+  wandb_key: <WANDB_KEY>
   training_args:
     output_dir: "/tmp/llm_finetuning"
-    max_steps: 5000
-    per_device_train_batch_size: 8
+    max_steps: 3000
+    per_device_train_batch_size: 4
     per_device_eval_batch_size: 4
-    fp16: true
+    bf16: true
     evaluation_strategy: "steps"
-    eval_steps: 1000
+    eval_steps: 500
     logging_strategy: "steps"
     logging_steps: 100
    save_strategy: "steps"
     save_steps: 1000
     learning_rate: 1e-5
     deepspeed: true
+    report_to: "wandb"
+    gradient_checkpointing: true
+  # Below are settings we experimented with
+  use_adam: false
+  custom_scale_init: false
+  use_rslora: false
 entrypoint: >-
   python -m determined.launch.torch_distributed
   python finetune.py
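For reference, Determined's grid searcher takes the Cartesian product of the categorical hyperparameters above, so this one config launches 4 (r) × 8 (lora_alpha) × 1 (lora_dropout) = 32 trials. Each trial receives its combination through the cluster info object, which is how finetune.py already reads hparams; a minimal sketch:

# Sketch only, not part of the commit: how one grid trial sees its slice of
# lora.yaml. Determined injects the chosen combination into the trial's
# cluster info; finetune.py reads it via info.trial.hparams.
import determined as det

info = det.get_cluster_info()
hparams = info.trial.hparams   # e.g. {"r": 8, "lora_alpha": 32, "lora_dropout": 0.1, ...}
print(f"this trial: r={hparams['r']}, lora_alpha={hparams['lora_alpha']}, "
      f"dropout={hparams['lora_dropout']}")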
