Add lr scheduler, weight decay and max_grad_norm #214

Merged (12 commits) on Nov 19, 2024
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "together"
version = "1.3.4"
version = "1.3.5"
authors = [
"Together AI <[email protected]>"
]
src/together/cli/api/finetune.py (24 changes: 24 additions & 0 deletions)
@@ -65,12 +65,30 @@ def fine_tuning(ctx: click.Context) -> None:
)
@click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.option(
"--min-lr-ratio",
type=float,
default=0.0,
help="The ratio of the final learning rate to the peak learning rate",
)
@click.option(
"--warmup-ratio",
type=float,
default=0.0,
help="Warmup ratio for learning rate scheduler.",
)
@click.option(
"--max-grad-norm",
type=float,
default=1.0,
help="Max gradient norm to be used for gradient clipping. Set to 0 to disable.",
)
@click.option(
"--weight-decay",
type=float,
default=0.0,
help="Weight decay",
)
@click.option(
"--lora/--no-lora",
type=bool,
@@ -115,7 +133,10 @@ def create(
n_checkpoints: int,
batch_size: int | Literal["max"],
learning_rate: float,
min_lr_ratio: float,
warmup_ratio: float,
max_grad_norm: float,
weight_decay: float,
lora: bool,
lora_r: int,
lora_dropout: float,
@@ -138,7 +159,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
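For orientation (not part of the diff): a minimal sketch of the SDK call these CLI flags feed into, using the Together client this package already exposes. The training file ID and model name below are hypothetical placeholders.

# Sketch only: the hyperparameters exposed by the new CLI flags, passed through the
# Python SDK. The file ID and model name are hypothetical placeholders.
from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

job = client.fine_tuning.create(
    training_file="file-abc123",  # hypothetical uploaded training file ID
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # hypothetical model name
    learning_rate=1e-5,
    min_lr_ratio=0.1,    # final LR is 10% of the peak LR under the linear scheduler
    warmup_ratio=0.05,   # first 5% of training warms the LR up to its peak
    max_grad_norm=1.0,   # gradient-clipping threshold; 0 disables clipping
    weight_decay=0.01,
)
print(job.id)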
src/together/resources/finetune.py (48 changes: 45 additions & 3 deletions)
@@ -20,6 +20,8 @@
TogetherClient,
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.finetune import DownloadCheckpointType
from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
-warmup_ratio: float | None = 0.0,
+min_lr_ratio: float = 0.0,
+warmup_ratio: float = 0.0,
+max_grad_norm: float = 1.0,
+weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -83,6 +88,20 @@ def createFinetuneRequest(
if warmup_ratio > 1 or warmup_ratio < 0:
raise ValueError("Warmup ratio should be between 0 and 1")

if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
raise ValueError("Min learning rate ratio should be between 0 and 1")

if max_grad_norm < 0:
raise ValueError("Max gradient norm should be non-negative")

if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

finetune_request = FinetuneRequest(
model=model,
training_file=training_file,
@@ -92,7 +111,10 @@ def createFinetuneRequest(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler=lrScheduler,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
training_type=training_type,
suffix=suffix,
wandb_key=wandb_api_key,
@@ -117,7 +139,10 @@ def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
-warmup_ratio: float | None = 0.0,
+min_lr_ratio: float = 0.0,
+warmup_ratio: float = 0.0,
+max_grad_norm: float = 1.0,
+weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -143,7 +168,11 @@ def create(
batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): The ratio of the final learning rate to the peak learning rate
    for the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -185,7 +214,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -436,7 +468,10 @@ async def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
-warmup_ratio: float | None = 0.0,
+min_lr_ratio: float = 0.0,
+warmup_ratio: float = 0.0,
+max_grad_norm: float = 1.0,
+weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -462,7 +497,11 @@ async def create(
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): The ratio of the final learning rate to the peak learning rate
    for the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -504,7 +543,10 @@ async def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
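To make the new knobs concrete, here is an illustrative, self-contained sketch (not SDK code) of a linear schedule with warmup, under the assumption that warmup ramps from near zero to the peak learning rate and the decay phase ends at peak_lr * min_lr_ratio; the hosted scheduler may differ in details.

# Illustrative only: one plausible reading of warmup_ratio and min_lr_ratio for a
# linear scheduler; the server-side implementation is not shown in this PR.
def linear_lr(step: int, total_steps: int, peak_lr: float,
              warmup_ratio: float = 0.0, min_lr_ratio: float = 0.0) -> float:
    warmup_steps = int(total_steps * warmup_ratio)
    if warmup_steps and step < warmup_steps:
        # Linear ramp from (near) zero up to the peak learning rate.
        return peak_lr * (step + 1) / warmup_steps
    decay_steps = max(total_steps - warmup_steps, 1)
    progress = min((step - warmup_steps) / decay_steps, 1.0)
    # Linear decay from peak_lr down to peak_lr * min_lr_ratio.
    return peak_lr * (1.0 - progress * (1.0 - min_lr_ratio))

# Example: 1000 steps, peak LR 1e-5, 10% warmup, final LR at 10% of the peak.
print(linear_lr(0, 1000, 1e-5, warmup_ratio=0.1, min_lr_ratio=0.1))    # ~1e-7, warming up
print(linear_lr(999, 1000, 1e-5, warmup_ratio=0.1, min_lr_ratio=0.1))  # ~1e-6, peak * min_lr_ratio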
src/together/types/__init__.py (4 changes: 4 additions & 0 deletions)
@@ -30,6 +30,8 @@
LoRATrainingType,
TrainingType,
FinetuneTrainingLimits,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.images import (
ImageRequest,
@@ -57,6 +59,8 @@
"FinetuneList",
"FinetuneListEvents",
"FinetuneDownloadResult",
"FinetuneLRScheduler",
"FinetuneLinearLRSchedulerArgs",
"FileRequest",
"FileResponse",
"FileList",
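As a quick sanity check (sketch, not from the PR), the re-exports above make the new scheduler types importable from the package-level types module:

# Sketch: with the new __all__ entries, both classes resolve via together.types.
from together.types import FinetuneLRScheduler, FinetuneLinearLRSchedulerArgs

print(FinetuneLRScheduler.__name__, FinetuneLinearLRSchedulerArgs.__name__)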
src/together/types/finetune.py (21 changes: 21 additions & 0 deletions)
@@ -150,8 +150,14 @@ class FinetuneRequest(BaseModel):
n_epochs: int
# training learning rate
learning_rate: float
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float
# max gradient norm
max_grad_norm: float
# weight decay
weight_decay: float
# number of checkpoints to save
n_checkpoints: int | None = None
# number of evaluation loops to run
@@ -193,8 +199,14 @@ class FinetuneResponse(BaseModel):
batch_size: int | None = None
# training learning rate
learning_rate: float | None = None
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float | None = None
# max gradient norm
max_grad_norm: float | None = None
# weight decay
weight_decay: float | None = None
# number of steps between evals
eval_steps: int | None = None
# training type
@@ -287,3 +299,12 @@ class FinetuneTrainingLimits(BaseModel):
min_learning_rate: float
full_training: FinetuneFullTrainingLimits | None = None
lora_training: FinetuneLoraTrainingLimits | None = None


class FinetuneLRScheduler(BaseModel):
lr_scheduler_type: str
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float | None = 0.0
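As a usage sketch for the new models (assuming the Pydantic v2 API used by the rest of the SDK), the scheduler payload that createFinetuneRequest builds for a 10% final-to-peak ratio would serialize like this:

# Sketch: the lr_scheduler object attached to FinetuneRequest. model_dump() assumes
# Pydantic v2; the expected output in the comment below is approximate.
from together.types.finetune import FinetuneLRScheduler, FinetuneLinearLRSchedulerArgs

scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)
print(scheduler.model_dump())
# Expected roughly: {'lr_scheduler_type': 'linear', 'lr_scheduler_args': {'min_lr_ratio': 0.1}}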