Add lr scheduler, weight decay and max_grad_norm #214

Merged: 12 commits, Nov 19, 2024
27 changes: 27 additions & 0 deletions src/together/cli/api/finetune.py
@@ -60,12 +60,30 @@ def fine_tuning(ctx: click.Context) -> None:
)
@click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.option(
"--min-lr-ratio",
type=float,
default=0.0,
help="Final learning rate as a percentage of the initial learning rate",
)
@click.option(
"--warmup-ratio",
type=float,
default=0.0,
help="Warmup ratio for learning rate scheduler.",
)
@click.option(
"--max-grad-norm",
type=float,
default=None,
help="Max gradient norm",
)
@click.option(
"--weight-decay",
type=float,
default=None,
help="Weight decay",
)
@click.option(
"--lora/--no-lora",
type=bool,
@@ -103,7 +121,10 @@ def create(
n_checkpoints: int,
batch_size: int | Literal["max"],
learning_rate: float,
min_lr_ratio: float,
warmup_ratio: float,
max_grad_norm: float,
weight_decay: float,
lora: bool,
lora_r: int,
lora_dropout: float,
@@ -125,7 +146,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -194,7 +218,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
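A quick way to sanity-check that the four new flags are registered on the `create` command is to render its `--help` text with Click's test runner. This is a hedged sketch: it assumes the `fine_tuning` group in `src/together/cli/api/finetune.py` can be invoked standalone and that `--help` does not touch a configured API client.

```python
# Hedged sketch: confirm the new options appear in `together fine-tuning create --help`.
# Assumes the `fine_tuning` Click group can be invoked directly and that --help
# never reaches the API client stored on the Click context.
from click.testing import CliRunner

from together.cli.api.finetune import fine_tuning

runner = CliRunner()
result = runner.invoke(fine_tuning, ["create", "--help"])

for flag in ("--min-lr-ratio", "--warmup-ratio", "--max-grad-norm", "--weight-decay"):
    assert flag in result.output, f"{flag} missing from --help output"
```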
46 changes: 44 additions & 2 deletions src/together/resources/finetune.py
@@ -20,6 +20,8 @@
TogetherClient,
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.finetune import DownloadCheckpointType
from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float | None = None,
weight_decay: float | None = None,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -82,6 +87,20 @@
if warmup_ratio > 1 or warmup_ratio < 0:
raise ValueError("Warmup ratio should be between 0 and 1")

if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
raise ValueError("Ending rate should be between 0 and 1")

if max_grad_norm is not None and (max_grad_norm < 0):
raise ValueError("Max gradient norm should be non-negative")

if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

finetune_request = FinetuneRequest(
model=model,
training_file=training_file,
@@ -91,7 +110,10 @@
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler=lrScheduler,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
training_type=training_type,
suffix=suffix,
wandb_key=wandb_api_key,
@@ -115,7 +137,10 @@ def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float | None = None,
weight_decay: float | None = None,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -140,7 +165,11 @@ def create(
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Ending learning rate as a percentage of the initial learning rate for
learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to None.
weight_decay (float, optional): Weight decay. Defaults to None.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
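For intuition about the two ratios documented above, here is a rough sketch of what a linear schedule with warmup typically does. The actual schedule is applied on Together's training side, so this only illustrates the parameters' meaning, not the service's implementation.

```python
# Rough sketch of a linear schedule with warmup, only to illustrate what
# min_lr_ratio and warmup_ratio mean; the real schedule runs server-side.
def linear_lr(step: int, total_steps: int, learning_rate: float,
              warmup_ratio: float = 0.0, min_lr_ratio: float = 0.0) -> float:
    warmup_steps = int(total_steps * warmup_ratio)
    if warmup_steps and step < warmup_steps:
        # Linear ramp up to the peak learning rate.
        return learning_rate * (step + 1) / warmup_steps
    # Linear decay from learning_rate down to learning_rate * min_lr_ratio.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return learning_rate * (1.0 - (1.0 - min_lr_ratio) * progress)

# Example: peak lr 1e-5, 10% warmup, decaying to 10% of the peak.
print(linear_lr(0, 1000, 1e-5, warmup_ratio=0.1, min_lr_ratio=0.1))    # 1e-07
print(linear_lr(999, 1000, 1e-5, warmup_ratio=0.1, min_lr_ratio=0.1))  # ~1.01e-06
```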
@@ -176,7 +205,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -426,7 +458,10 @@ async def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
min_lr_ratio: float | None = 0.0,
warmup_ratio: float | None = 0.0,
max_grad_norm: float | None = None,
weight_decay: float | None = None,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -449,9 +484,13 @@ async def create(
n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
Defaults to 1.
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Ending learning rate as a percentage of the initial learning rate for
learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to None.
weight_decay (float, optional): Weight decay. Defaults to None.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -487,7 +526,10 @@ async def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
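Putting the new parameters together, a hedged usage sketch through the Python client: the parameter names come from the diff above, while the model name and training file ID are placeholders.

```python
# Hypothetical usage of the new fine-tuning parameters via the Python client.
# The model name and training file ID are placeholders, not real resources.
from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

job = client.fine_tuning.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder
    training_file="file-1234",                                # placeholder
    learning_rate=1e-5,
    min_lr_ratio=0.1,     # decay to 10% of the initial learning rate
    warmup_ratio=0.05,    # spend 5% of steps warming up
    max_grad_norm=1.0,    # clip gradients at norm 1.0
    weight_decay=0.01,
)
print(job.id)
```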
4 changes: 4 additions & 0 deletions src/together/types/__init__.py
@@ -30,6 +30,8 @@
LoRATrainingType,
TrainingType,
FinetuneTrainingLimits,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.images import (
ImageRequest,
@@ -57,6 +59,8 @@
"FinetuneList",
"FinetuneListEvents",
"FinetuneDownloadResult",
"FinetuneLRScheduler",
"FinetuneLinearLRSchedulerArgs",
"FileRequest",
"FileResponse",
"FileList",
17 changes: 17 additions & 0 deletions src/together/types/finetune.py
@@ -150,6 +150,8 @@ class FinetuneRequest(BaseModel):
n_epochs: int
# training learning rate
learning_rate: float
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float
# number of checkpoints to save
@@ -192,8 +194,14 @@ class FinetuneResponse(BaseModel):
batch_size: int | None = None
# training learning rate
learning_rate: float | None = None
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float | None = None
# max gradient norm
max_grad_norm: float | None = None
# weight decay
weight_decay: float | None = None
# number of steps between evals
eval_steps: int | None = None
# training type
@@ -285,3 +293,12 @@ class FinetuneTrainingLimits(BaseModel):
min_learning_rate: float
full_training: FinetuneFullTrainingLimits | None = None
lora_training: FinetuneLoraTrainingLimits | None = None


class FinetuneLRScheduler(BaseModel):
lr_scheduler_type: str
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float
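Finally, the new scheduler models can be exercised on their own, mirroring the `lrScheduler` construction in `createFinetuneRequest`; the field values here are illustrative.

```python
# Illustrative construction of the new scheduler types, mirroring the
# lrScheduler built in createFinetuneRequest. model_dump() assumes pydantic v2.
from together.types import FinetuneLRScheduler, FinetuneLinearLRSchedulerArgs

scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)
print(scheduler.model_dump())
# -> {'lr_scheduler_type': 'linear', 'lr_scheduler_args': {'min_lr_ratio': 0.1}}
```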