Add lr scheduler, weight decay and max_grad_norm (#214)
* Add weight decay and max_grad_norm

* Change to min_lr_ratio

* Update max_grad_norm and weight_decay defaults, supported vals

* Add hints for disabling max_grad_norm

* Add back learning rate hint

* Add max_grad_norm and weight_decay to FinetuneRequest

* Remove percentage from min_lr_ratio description

* Fix hints and typing

* Update version to 1.3.5

* Fix more typing

* Make min_lr_ratio optional

---------

Co-authored-by: Arsh Zahed <[email protected]>
azahed98 and Arsh Zahed authored Nov 19, 2024
1 parent 1eb7779 commit 2467de2
Showing 5 changed files with 95 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "together"
version = "1.3.4"
version = "1.3.5"
authors = [
"Together AI <[email protected]>"
]
24 changes: 24 additions & 0 deletions src/together/cli/api/finetune.py
@@ -65,12 +65,30 @@ def fine_tuning(ctx: click.Context) -> None:
)
@click.option("--batch-size", type=INT_WITH_MAX, default="max", help="Train batch size")
@click.option("--learning-rate", type=float, default=1e-5, help="Learning rate")
@click.option(
"--min-lr-ratio",
type=float,
default=0.0,
help="The ratio of the final learning rate to the peak learning rate",
)
@click.option(
"--warmup-ratio",
type=float,
default=0.0,
help="Warmup ratio for learning rate scheduler.",
)
@click.option(
"--max-grad-norm",
type=float,
default=1.0,
help="Max gradient norm to be used for gradient clipping. Set to 0 to disable.",
)
@click.option(
"--weight-decay",
type=float,
default=0.0,
help="Weight decay",
)
@click.option(
"--lora/--no-lora",
type=bool,
@@ -115,7 +133,10 @@ def create(
n_checkpoints: int,
batch_size: int | Literal["max"],
learning_rate: float,
min_lr_ratio: float,
warmup_ratio: float,
max_grad_norm: float,
weight_decay: float,
lora: bool,
lora_r: int,
lora_dropout: float,
@@ -138,7 +159,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
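The new CLI flags map one-to-one onto keyword arguments of the Python client's fine-tuning resource. Below is a minimal sketch of the equivalent programmatic call; the API key, training-file ID, and model name are placeholders, and the hyperparameter values are illustrative rather than recommendations.

import os

from together import Together

client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Roughly mirrors the new CLI flags: --min-lr-ratio, --warmup-ratio,
# --max-grad-norm and --weight-decay.
job = client.fine_tuning.create(
    training_file="file-xxxxxxxxxxxx",  # placeholder: an already-uploaded training file ID
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model name
    learning_rate=1e-5,
    min_lr_ratio=0.1,    # final LR decays to 10% of the peak LR
    warmup_ratio=0.05,   # 5% of training steps spent warming up
    max_grad_norm=0.0,   # 0 disables gradient clipping (the default is 1.0)
    weight_decay=0.01,
)
print(job.id)  # FinetuneResponse; the id can be used to monitor the job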
48 changes: 45 additions & 3 deletions src/together/resources/finetune.py
@@ -20,6 +20,8 @@
TogetherClient,
TogetherRequest,
TrainingType,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.finetune import DownloadCheckpointType
from together.utils import log_warn_once, normalize_key
@@ -35,7 +37,10 @@ def createFinetuneRequest(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
warmup_ratio: float | None = 0.0,
min_lr_ratio: float = 0.0,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -83,6 +88,20 @@ def createFinetuneRequest(
if warmup_ratio > 1 or warmup_ratio < 0:
raise ValueError("Warmup ratio should be between 0 and 1")

if min_lr_ratio is not None and (min_lr_ratio > 1 or min_lr_ratio < 0):
raise ValueError("Min learning rate ratio should be between 0 and 1")

if max_grad_norm < 0:
raise ValueError("Max gradient norm should be non-negative")

if weight_decay is not None and (weight_decay < 0):
raise ValueError("Weight decay should be non-negative")

lrScheduler = FinetuneLRScheduler(
lr_scheduler_type="linear",
lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=min_lr_ratio),
)

finetune_request = FinetuneRequest(
model=model,
training_file=training_file,
@@ -92,7 +111,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
lr_scheduler=lrScheduler,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
training_type=training_type,
suffix=suffix,
wandb_key=wandb_api_key,
@@ -117,7 +139,10 @@ def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
warmup_ratio: float | None = 0.0,
min_lr_ratio: float = 0.0,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -143,7 +168,11 @@ def create(
batch_size (int or "max"): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -185,7 +214,10 @@ def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
@@ -436,7 +468,10 @@ async def create(
n_checkpoints: int | None = 1,
batch_size: int | Literal["max"] = "max",
learning_rate: float | None = 0.00001,
warmup_ratio: float | None = 0.0,
min_lr_ratio: float = 0.0,
warmup_ratio: float = 0.0,
max_grad_norm: float = 1.0,
weight_decay: float = 0.0,
lora: bool = False,
lora_r: int | None = None,
lora_dropout: float | None = 0,
@@ -462,7 +497,11 @@ async def create(
batch_size (int, optional): Batch size for fine-tuning. Defaults to max.
learning_rate (float, optional): Learning rate multiplier to use for training
Defaults to 0.00001.
min_lr_ratio (float, optional): Min learning rate ratio of the initial learning rate for
the learning rate scheduler. Defaults to 0.0.
warmup_ratio (float, optional): Warmup ratio for learning rate scheduler.
max_grad_norm (float, optional): Max gradient norm. Defaults to 1.0, set to 0 to disable.
weight_decay (float, optional): Weight decay. Defaults to 0.0.
lora (bool, optional): Whether to use LoRA adapters. Defaults to True.
lora_r (int, optional): Rank of LoRA adapters. Defaults to 8.
lora_dropout (float, optional): Dropout rate for LoRA adapters. Defaults to 0.
@@ -504,7 +543,10 @@ async def create(
n_checkpoints=n_checkpoints,
batch_size=batch_size,
learning_rate=learning_rate,
min_lr_ratio=min_lr_ratio,
warmup_ratio=warmup_ratio,
max_grad_norm=max_grad_norm,
weight_decay=weight_decay,
lora=lora,
lora_r=lora_r,
lora_dropout=lora_dropout,
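The async create path shown above accepts the same new keyword arguments. A short sketch, assuming the async client is exposed as together.AsyncTogether (placeholders as above; values are illustrative only):

import asyncio
import os

from together import AsyncTogether


async def main() -> None:
    client = AsyncTogether(api_key=os.environ["TOGETHER_API_KEY"])
    # Same new hyperparameters as the sync path.
    job = await client.fine_tuning.create(
        training_file="file-xxxxxxxxxxxx",  # placeholder training file ID
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder
        min_lr_ratio=0.0,   # decay all the way to zero by the end of training
        warmup_ratio=0.03,
        max_grad_norm=1.0,  # default clipping threshold
        weight_decay=0.1,
    )
    print(job.status)


asyncio.run(main())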
4 changes: 4 additions & 0 deletions src/together/types/__init__.py
@@ -30,6 +30,8 @@
LoRATrainingType,
TrainingType,
FinetuneTrainingLimits,
FinetuneLRScheduler,
FinetuneLinearLRSchedulerArgs,
)
from together.types.images import (
ImageRequest,
@@ -57,6 +59,8 @@
"FinetuneList",
"FinetuneListEvents",
"FinetuneDownloadResult",
"FinetuneLRScheduler",
"FinetuneLinearLRSchedulerArgs",
"FileRequest",
"FileResponse",
"FileList",
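With these re-exports in place, the new scheduler types should resolve from the package-level types module as well as from together.types.finetune. A quick sanity check, assuming an install of this branch:

from together.types import FinetuneLinearLRSchedulerArgs, FinetuneLRScheduler
from together.types.finetune import FinetuneLRScheduler as _FinetuneLRScheduler

# Both import paths should point at the same class object.
assert FinetuneLRScheduler is _FinetuneLRScheduler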
21 changes: 21 additions & 0 deletions src/together/types/finetune.py
@@ -150,8 +150,14 @@ class FinetuneRequest(BaseModel):
n_epochs: int
# training learning rate
learning_rate: float
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float
# max gradient norm
max_grad_norm: float
# weight decay
weight_decay: float
# number of checkpoints to save
n_checkpoints: int | None = None
# number of evaluation loops to run
@@ -193,8 +199,14 @@ class FinetuneResponse(BaseModel):
batch_size: int | None = None
# training learning rate
learning_rate: float | None = None
# learning rate scheduler type and args
lr_scheduler: FinetuneLRScheduler | None = None
# learning rate warmup ratio
warmup_ratio: float | None = None
# max gradient norm
max_grad_norm: float | None = None
# weight decay
weight_decay: float | None = None
# number of steps between evals
eval_steps: int | None = None
# training type
@@ -287,3 +299,12 @@ class FinetuneTrainingLimits(BaseModel):
min_learning_rate: float
full_training: FinetuneFullTrainingLimits | None = None
lora_training: FinetuneLoraTrainingLimits | None = None


class FinetuneLRScheduler(BaseModel):
lr_scheduler_type: str
lr_scheduler_args: FinetuneLinearLRSchedulerArgs | None = None


class FinetuneLinearLRSchedulerArgs(BaseModel):
min_lr_ratio: float | None = 0.0
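For completeness, a small sketch of the new scheduler models in isolation, wired up the same way createFinetuneRequest does above. Field names follow the diff; model_dump assumes Pydantic v2 (use .dict() on v1).

from together.types.finetune import (
    FinetuneLinearLRSchedulerArgs,
    FinetuneLRScheduler,
)

# A linear schedule that decays from the peak learning rate to 10% of it.
scheduler = FinetuneLRScheduler(
    lr_scheduler_type="linear",
    lr_scheduler_args=FinetuneLinearLRSchedulerArgs(min_lr_ratio=0.1),
)

print(scheduler.model_dump())
# Expected shape: {'lr_scheduler_type': 'linear', 'lr_scheduler_args': {'min_lr_ratio': 0.1}}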
