Add lr scheduler, weight decay and max_grad_norm (#214)
* Add weight decay and max_grad_norm

* Change to min_lr_ratio

* Update max_grad_norm and weight_decay defaults, supported vals

* Add hints for disabling max_grad_norm

* Add back learning rate hint

* Add max_grad_norm and weight_decay to FinetuneRequest

* Remove percentage from min_lr_ratio description

* Fix hints and typing

* Update version to 1.3.5

* Fix more typing

* Make min_lr_ratio optional

---------

Co-authored-by: Arsh Zahed <[email protected]>
azahed98 and Arsh Zahed authored Nov 19, 2024
1 parent 1eb7779 commit 2467de2
Showing 5 changed files with 95 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "together"
version = "1.3.4"
version = "1.3.5"
authors = [
"Together AI <[email protected]>"
]
24 changes: 24 additions & 0 deletions src/together/cli/api/finetune.py
@@ -65,12 +65,30 @@ def fine_tuning(ctx: click.Context) -> None:
)
@click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.option(
"--min-lr-ratio",
type=float,
default=0.0,
help="The ratio of the final learning rate to the peak learning rate",
)
@click.option(
"--warmup-ratio",
type=float,
default=0.0,
help="Warmup ratio for learning rate scheduler.",
)
@click.option(
"--max-grad-norm",
type=float,
default=1.0,
help="Max gradient norm to be used for gradient clipping. Set to 0 to disable.",
)
@click.option(
"--weight-decay",
type=float,
default=0.0,
help="Weight decay",
)
@click.option(
"--lora/--no-lora",
type=bool,
@@ -115,7 +133,10 @@ def create(
n_checkpoints: int,
batch_size: int | Literal["max"],
learning_rate: float,
min_lr_ratio: float,
warmup_ratio: float,
max_grad_norm: float,
weight_decay: float,
lora: bool,
lora_r: int,
lora_dropout: float,
@@ -138,7 +159,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
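The new CLI flags map one-to-one onto keyword arguments of the Python client's fine-tuning resource. Below is a minimal sketch of the equivalent programmatic call; the API key, training-file ID, and model name are placeholders, and the hyperparameter values are illustrative rather than recommendations.

import os

from together import Together

client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Roughly mirrors the new CLI flags: --min-lr-ratio, --warmup-ratio,
# --max-grad-norm and --weight-decay.
job = client.fine_tuning.create(
    training_file="file-xxxxxxxxxxxx",  # placeholder: an already-uploaded training file ID
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model name
    learning_rate=1e-5,
    min_lr_ratio=0.1,    # final LR decays to 10% of the peak LR
    warmup_ratio=0.05,   # 5% of training steps spent warming up
    max_grad_norm=0.0,   # 0 disables gradient clipping (the default is 1.0)
    weight_decay=0.01,
)
print(job.id)  # FinetuneResponse; the id can be used to monitor the job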
48 changes: 45 additions & 3 deletions src/together/resources/finetune.py
@@ -20,6 +20,8 @@
TogetherClient,
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.finetune import DownloadCheckpointType
from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
warmup_ratio: float | None = 0.0,
min_lr_ratio: float = 0.0,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -83,6 +88,20 @@ def createFinetuneRequest(
if warmup_ratio > 1 or warmup_ratio < 0:
raise ValueError("Warmup ratio should be between 0 and 1")

if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
raise ValueError("Min learning rate ratio should be between 0 and 1")

if max_grad_norm < 0:
raise ValueError("Max gradient norm should be non-negative")

if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

finetune_request = FinetuneRequest(
model=model,
training_file=training_file,
@@ -92,7 +111,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler=lrScheduler,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
training_type=training_type,
suffix=suffix,
wandb_key=wandb_api_key,
@@ -117,7 +139,10 @@ def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
warmup_ratio: float | None = 0.0,
min_lr_ratio: float = 0.0,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -143,7 +168,11 @@ def create(
batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -185,7 +214,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -436,7 +468,10 @@ async def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
warmup_ratio: float | None = 0.0,
min_lr_ratio: float = 0.0,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -462,7 +497,11 @@ async def create(
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -504,7 +543,10 @@ async def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
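The async create path shown above accepts the same new keyword arguments. A short sketch, assuming the async client is exposed as together.AsyncTogether (placeholders as above; values are illustrative only):

import asyncio
import os

from together import AsyncTogether


async def main() -> None:
    client = AsyncTogether(api_key=os.environ["TOGETHER_API_KEY"])
    # Same new hyperparameters as the sync path.
    job = await client.fine_tuning.create(
        training_file="file-xxxxxxxxxxxx",  # placeholder training file ID
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder
        min_lr_ratio=0.0,   # decay all the way to zero by the end of training
        warmup_ratio=0.03,
        max_grad_norm=1.0,  # default clipping threshold
        weight_decay=0.1,
    )
    print(job.status)


asyncio.run(main())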
4 changes: 4 additions & 0 deletions src/together/types/__init__.py
@@ -30,6 +30,8 @@
LoRATrainingType,
TrainingType,
FinetuneTrainingLimits,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.images import (
ImageRequest,
@@ -57,6 +59,8 @@
"FinetuneList",
"FinetuneListEvents",
"FinetuneDownloadResult",
"FinetuneLRScheduler",
"FinetuneLinearLRSchedulerArgs",
"FileRequest",
"FileResponse",
"FileList",
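With these re-exports in place, the new scheduler types should resolve from the package-level types module as well as from together.types.finetune. A quick sanity check, assuming an install of this branch:

from together.types import FinetuneLinearLRSchedulerArgs, FinetuneLRScheduler
from together.types.finetune import FinetuneLRScheduler as _FinetuneLRScheduler

# Both import paths should point at the same class object.
assert FinetuneLRScheduler is _FinetuneLRScheduler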
21 changes: 21 additions & 0 deletions src/together/types/finetune.py
@@ -150,8 +150,14 @@ class FinetuneRequest(BaseModel):
n_epochs: int
# training learning rate
learning_rate: float
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float
# max gradient norm
max_grad_norm: float
# weight decay
weight_decay: float
# number of checkpoints to save
n_checkpoints: int | None = None
# number of evaluation loops to run
@@ -193,8 +199,14 @@ class FinetuneResponse(BaseModel):
batch_size: int | None = None
# training learning rate
learning_rate: float | None = None
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float | None = None
# max gradient norm
max_grad_norm: float | None = None
# weight decay
weight_decay: float | None = None
# number of steps between evals
eval_steps: int | None = None
# training type
@@ -287,3 +299,12 @@ class FinetuneTrainingLimits(BaseModel):
min_learning_rate: float
full_training: FinetuneFullTrainingLimits | None = None
lora_training: FinetuneLoraTrainingLimits | None = None


class FinetuneLRScheduler(BaseModel):
lr_scheduler_type: str
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float | None = 0.0
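For completeness, a small sketch of the new scheduler models in isolation, wired up the same way createFinetuneRequest does above. Field names follow the diff; model_dump assumes Pydantic v2 (use .dict() on v1).

from together.types.finetune import (
    FinetuneLinearLRSchedulerArgs,
    FinetuneLRScheduler,
)

# A linear schedule that decays from the peak learning rate to 10% of it.
scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)

print(scheduler.model_dump())
# Expected shape: {'lr_scheduler_type': 'linear', 'lr_scheduler_args': {'min_lr_ratio': 0.1}}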
