
Commit

Should be merging the LoRA adapters back into the model for the final checkpoint
jakep-allenai committed Sep 23, 2024
1 parent 5967a52 commit f78d021
Showing 2 changed files with 7 additions and 12 deletions.
2 changes: 1 addition & 1 deletion pdelfin/train/config/qwen2vl-2b.yaml
@@ -80,4 +80,4 @@ save:
   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
   save_every_steps: 100
 
-max_workers: 10
+max_workers: 30
17 changes: 6 additions & 11 deletions pdelfin/train/train.py
@@ -1,14 +1,3 @@
-# Step 1, load the data
-# Probably, we want to see just a folder with openai batch input jsonls, plus the batch output jsonls
-# TODO: Figure out hyperparameters for image sizing
-# Step 2. Load those prompts through and do a forward pass to calculate the loss
-
-# Step 3. Add hugging face accelerate for training
-
-# Step 4. Checkpointing code, both saving and reloading to restart
-
-# Step 5. Move over from interactive session to gantry launch script
-
 import os
 import json
 import base64
@@ -217,9 +206,15 @@ def run_train(config: TrainConfig):
     trainer.train()  # pyright: ignore
 
     with get_local_dir(join_path("", save_path, "best")) as best_dir:
+        if config.lora is not None:
+            logger.info("Merging LoRA adapters into the base model...")
+            model = model.merge_and_unload()
+            logger.info("LoRA adapters merged successfully.")
+
         model.save_pretrained(best_dir)
+        logger.info("Saved best model to %s", best_dir)
 
 
 # Uncomment to test speed of data loader
 # train_dataloader = DataLoader(formatted_dataset["train"], batch_size=1, num_workers=4, shuffle=False)
 # for entry in tqdm(train_dataloader):
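The merged checkpoint produced by `merge_and_unload()` is equivalent to the base model with each LoRA update folded into its frozen weight, so no separate adapter tensors are needed at inference time. A minimal numpy sketch of the arithmetic for a single linear layer (dimensions, rank, and alpha below are illustrative, not taken from this repo):

```python
import numpy as np

# Conceptual sketch of what merging a LoRA adapter does for one linear layer:
# the low-rank update B @ A, scaled by alpha / r, is folded into the frozen
# base weight W, so the saved checkpoint is a plain dense weight again.
rng = np.random.default_rng(0)
d_out, d_in, r, alpha = 8, 8, 2, 16  # illustrative sizes, not from the config

W = rng.standard_normal((d_out, d_in))   # frozen base weight
A = rng.standard_normal((r, d_in))       # LoRA down-projection
B = rng.standard_normal((d_out, r))      # LoRA up-projection (after training)

scale = alpha / r
W_merged = W + scale * (B @ A)           # the merge step

# The merged layer computes the same output as base path + adapter path.
x = rng.standard_normal(d_in)
assert np.allclose(W_merged @ x, W @ x + scale * (B @ (A @ x)))
```

This is why merging before `save_pretrained` matters for the "best" checkpoint: without it, only the adapter weights would be useful alongside the original base model, whereas the merged model loads as a standard standalone checkpoint.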
