Skip to content

Commit b348d4f

Browse files
szewaiyuen6KevinMusgrave
and
KevinMusgrave
authored
feat: lora blog (#29)
* feat: lora blog Summary: Test Plan: * readme updates * updated readme * change folder * fixed link --------- Co-authored-by: KevinMusgrave <[email protected]>
1 parent 91f8812 commit b348d4f

11 files changed

+508
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ This repository contains a variety of Determined examples that are not actively
1515
| [LLM Finetuning](blog/llm-finetuning) | Finetuning TinyLlama-1.1B on Text-to-SQL. |
1616
| [LLM Finetuning 2](blog/llm-finetuning-2) | Finetuning Mistral-7B on Text-to-SQL using LoRA and DeepSpeed. |
1717
| [LLM Finetuning 3](blog/llm-finetuning-3) | Finetuning Gemma-2B using DPO. |
18+
| [LoRA Parameters](blog/lora-parameters) | Finding the best LoRA parameters. |
1819
| [Python SDK demo](blog/python_sdk_demo) | Example usage of the Determined Python SDK to run and administer experiments. |
1920
| [Tensor Parallelism](blog/tp) | Profiling tensor parallelism in PyTorch. |
2021

blog/lora-parameters/.detignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
text-to-sql*
2+
checkpoints

blog/lora-parameters/.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
__pycache__
2+
.DS_Store
3+
text-to-sql*
4+
checkpoints
5+
*.png

blog/lora-parameters/README.md

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Finding the best LoRA parameters
2+
3+
We finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). Our experiments ran on two 40 GB A100 GPUs.
4+
5+
See our [blog post](https://www.determined.ai/blog/lora-parameters) for our experiment results.
6+
7+
To get started, first install Determined on your local machine:
8+
```bash
9+
pip install determined
10+
```
11+
12+
Then finetune with LoRA:
13+
```bash
14+
det e create lora.yaml .
15+
```
16+
17+
You can view the actual training code in `finetune.py`.
18+
19+
20+
## Configuration
21+
22+
Change configuration options in `lora.yaml`. Some important options are:
23+
- `slots_per_trial`: the number of GPUs to use.
24+
- `dataset_subset`: the difficulty subset to train on (`easy`, `medium`, or `hard`).
25+
- `per_device_train_batch_size`: the batch size per GPU.
26+
27+
28+
DeepSpeed configuration files are in the `ds_configs` folder.
29+
30+
31+
## Contributors
32+
33+
- By [Sze Wai Yuen](https://github.com/szewaiyuen6)
34+
- Built on `llm-finetuning` code by [Agnieszka Ciborowska](https://github.com/aciborowska) and [Kevin Musgrave](https://github.com/KevinMusgrave).

blog/lora-parameters/chat_format.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Jinja-style chat template: each message is rendered as
# `<|im_start|>{role}\n{content}<|im_end|>`. User and system content is
# stripped of surrounding whitespace; assistant content is emitted as-is.
CHAT_ML_TEMPLATE = """
2+
{% for message in messages %}
3+
{% if message['role'] == 'user' %}
4+
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
5+
{% elif message['role'] == 'system' %}
6+
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
7+
{% elif message['role'] == 'assistant' %}
8+
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}
9+
{% endif %}
10+
{% endfor %}
11+
"""
12+
13+
14+
# Marker that closes every message in CHAT_ML_TEMPLATE; set_special_tokens
# assigns it as the tokenizer's EOS token for the TinyLlama model.
CHAT_ML_EOS_TOKEN = "<|im_end|>"
15+
16+
17+
def get_chat_format(element, model_name, with_assistant_response=True):
    """Build the list of chat messages for one text-to-SQL example.

    Args:
        element: Mapping that provides ``instruction`` and ``input`` for the
            user prompt, and ``response`` when ``with_assistant_response`` is
            true.
        model_name: Model identifier; decides whether the system prompt gets
            its own message or is prepended to the user message.
        with_assistant_response: When true, append ``element["response"]`` as
            a final assistant message.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dictionaries.
    """
    system_prompt = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )
    task_text = "Task: {instruction}\nSQL table: {input}\nSQL query: ".format_map(
        element
    )

    if model_name != "mistralai/Mistral-7B-Instruct-v0.2":
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": task_text},
        ]
    else:
        # For this model the system instructions are folded into the user
        # message instead of being sent as a separate system message.
        messages = [{"role": "user", "content": system_prompt + "\n" + task_text}]

    if with_assistant_response:
        messages.append({"role": "assistant", "content": element["response"]})

    return messages
42+
43+
44+
def set_special_tokens(tokenizer, model_name):
    """Configure chat template, EOS token and pad token on ``tokenizer`` in place.

    Args:
        tokenizer: Tokenizer object whose ``chat_template``, ``eos_token`` and
            ``pad_token_id`` attributes may be assigned.
        model_name: Model identifier; the ChatML template and EOS token are
            only installed for the TinyLlama chat model.
    """
    uses_chat_ml = model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
    if uses_chat_ml:
        tokenizer.chat_template = CHAT_ML_TEMPLATE
        tokenizer.eos_token = CHAT_ML_EOS_TOKEN

    # NOTE(review): the original nesting of this check was not recoverable
    # from this view; it is applied for every model here — confirm.
    if tokenizer.pad_token_id is not None:
        return
    # Fall back to the EOS token id when the tokenizer has no pad token.
    tokenizer.pad_token_id = tokenizer.eos_token_id
50+
51+
52+
def get_assistant_prompt(model_name):
    """Return the text marker that precedes the assistant's reply.

    The TinyLlama chat model uses the ChatML assistant header; every other
    model name gets ``"[/INST]"``.
    """
    is_tinyllama = model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
    return "<|im_start|>assistant\n" if is_tinyllama else "[/INST]"
57+
58+
59+
def get_response_template_ids(tokenizer, model_name):
    """Encode the assistant-reply marker for ``model_name`` without special tokens.

    Returns whatever ``tokenizer.encode`` produces for the marker text.
    """
    marker = get_assistant_prompt(model_name)
    return tokenizer.encode(marker, add_special_tokens=False)
61+
62+
63+
def maybe_add_generation_prompt(x, model_name):
    """Append the assistant marker to ``x`` for the TinyLlama chat model only.

    For any other model name, ``x`` is returned unchanged.
    """
    if model_name != "TinyLlama/TinyLlama-1.1B-Chat-v0.4":
        return x
    return x + get_assistant_prompt(model_name)

blog/lora-parameters/dataset_utils.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import datasets
2+
import pandas as pd
3+
4+
5+
def add_length_column(dataset) -> pd.DataFrame:
    """Convert ``dataset`` to a DataFrame and add a ``total_length`` column.

    ``total_length`` is the sum, per row, of the whitespace-separated word
    counts of the ``instruction``, ``input`` and ``response`` columns (each
    value is converted to ``str`` before splitting).

    Args:
        dataset: Object exposing ``to_pandas()``.

    Returns:
        The resulting DataFrame, including the new ``total_length`` column.
    """
    frame = dataset.to_pandas()
    frame["total_length"] = 0
    for text_column in ("instruction", "input", "response"):
        words_per_row = frame[text_column].astype(str).str.split()
        frame["total_length"] += words_per_row.apply(len)

    return frame
13+
14+
15+
def filter_by_total_length(df, difficulty, number_of_samples):
    """Select the first rows whose ``total_length`` falls in a difficulty band.

    Bands (both ends inclusive): ``easy`` 10-100, ``medium`` 101-200,
    ``hard`` 201-800.

    Args:
        df: DataFrame with a numeric ``total_length`` column.
        difficulty: One of ``"easy"``, ``"medium"`` or ``"hard"``.
        number_of_samples: Maximum number of rows to keep.

    Returns:
        The first ``number_of_samples`` matching rows, in their original order.

    Raises:
        ValueError: If ``difficulty`` is not a known band. Previously this
            case silently returned ``None`` and only failed later, when the
            caller tried to use the result as a DataFrame.
    """
    length_bounds = {
        "easy": (10, 100),
        "medium": (101, 200),
        "hard": (201, 800),
    }
    if difficulty not in length_bounds:
        raise ValueError(
            f"Unknown difficulty {difficulty!r}; "
            f"expected one of {sorted(length_bounds)}"
        )

    lower, upper = length_bounds[difficulty]
    in_band = df["total_length"].between(lower, upper)
    return df[in_band].iloc[:number_of_samples]
22+
23+
24+
def get_dataset_subset_name(difficulty: str) -> str:
    """Return the dataset name used for the given difficulty subset."""
    return "text-to-sql-v1-{}".format(difficulty)
26+
27+
28+
def create_and_save_datasets(
    df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    """Split ``df`` into train/valid/test datasets and save them to disk.

    Args:
        df: DataFrame containing a ``total_length`` column, which is dropped
            before the dataset is built.
        difficulty: Difficulty label; determines the name the splits are
            saved under.
        train_ratio: Fraction of all rows assigned to the training split.
        val_ratio: Relative weight of the validation split within the
            held-out rows.
        test_ratio: Relative weight of the test split within the held-out
            rows.

    Returns:
        A ``datasets.DatasetDict`` with ``train``, ``valid`` and ``test`` keys.
    """
    split_seed = 123
    # total_length is not needed once the rows have been selected.
    features_only = df.drop(columns=["total_length"])
    full_dataset = datasets.Dataset.from_pandas(features_only, preserve_index=False)

    # First cut: training rows versus everything held out.
    first_split = full_dataset.train_test_split(
        train_size=train_ratio, seed=split_seed
    )
    held_out = first_split["test"]

    # Second cut: the test share is expressed relative to the held-out rows.
    test_share = test_ratio / (test_ratio + val_ratio)
    second_split = held_out.train_test_split(test_size=test_share, seed=split_seed)

    splits = datasets.DatasetDict(
        {
            "train": first_split["train"],
            "valid": second_split["train"],
            "test": second_split["test"],
        }
    )
    splits.save_to_disk(get_dataset_subset_name(difficulty))
    return splits
54+
55+
56+
def load_dataset(difficulty):
    """Load the previously saved dataset for ``difficulty`` from disk."""
    saved_path = get_dataset_subset_name(difficulty)
    return datasets.load_from_disk(saved_path)
58+
59+
60+
def load_or_create_dataset(difficulty, num_samples=10000):
    """Return the dataset for ``difficulty``, building and saving it if needed.

    Loading from disk is tried first. If that raises ``FileNotFoundError``,
    the ``Clinton/Text-to-sql-v1`` train split is loaded, its ``text`` and
    ``source`` columns are removed, rows are filtered by total length for the
    requested difficulty (at most ``num_samples`` rows), and the result is
    split and saved.

    Args:
        difficulty: Difficulty label of the subset to load or create.
        num_samples: Maximum number of rows kept when creating the subset.
    """
    try:
        cached = load_dataset(difficulty)
    except FileNotFoundError:
        raw_train = datasets.load_dataset("Clinton/Text-to-sql-v1")["train"]
        trimmed = raw_train.remove_columns(["text", "source"])
        with_lengths = add_length_column(trimmed)
        selected = filter_by_total_length(with_lengths, difficulty, num_samples)
        return create_and_save_datasets(selected, difficulty)
    else:
        return cached
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
{
2+
"fp16": {
3+
"enabled": "auto",
4+
"loss_scale": 0,
5+
"loss_scale_window": 1000,
6+
"initial_scale_power": 16,
7+
"hysteresis": 2,
8+
"min_loss_scale": 1
9+
},
10+
"bf16": {
11+
"enabled": "auto"
12+
},
13+
"optimizer": {
14+
"type": "AdamW",
15+
"params": {
16+
"lr": "auto",
17+
"betas": "auto",
18+
"eps": "auto",
19+
"weight_decay": "auto"
20+
}
21+
},
22+
"scheduler": {
23+
"type": "WarmupDecayLR",
24+
"params": {
25+
"warmup_min_lr": "auto",
26+
"warmup_max_lr": "auto",
27+
"warmup_num_steps": "auto",
28+
"total_num_steps": "auto"
29+
}
30+
},
31+
"zero_optimization": {
32+
"stage": 3,
33+
"overlap_comm": true,
34+
"contiguous_gradients": true,
35+
"sub_group_size": 1e9,
36+
"reduce_bucket_size": "auto",
37+
"stage3_prefetch_bucket_size": "auto",
38+
"stage3_param_persistence_threshold": "auto",
39+
"stage3_max_live_parameters": 1e9,
40+
"stage3_max_reuse_distance": 1e9,
41+
"stage3_gather_16bit_weights_on_model_save": true
42+
},
43+
"gradient_accumulation_steps": "auto",
44+
"gradient_clipping": "auto",
45+
"train_batch_size": "auto",
46+
"train_micro_batch_size_per_gpu": "auto"
47+
}

0 commit comments

Comments
 (0)