Added error check to RLOO, PPOv2, OnlineDPO ensuring that ref_policy and policy have different identities #2057

Merged: 9 commits, Sep 17, 2024
20 changes: 20 additions & 0 deletions tests/test_bco_trainer.py
@@ -103,6 +103,26 @@ def test_bco_trainer(self, name, pre_compute, eval_dataset):
if param.sum() != 0:
self.assertFalse(torch.equal(param.cpu(), new_param.cpu()))

def test_bco_trainer_with_ref_model_is_model(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = BCOConfig(
output_dir=tmp_dir,
per_device_train_batch_size=2,
max_steps=3,
report_to="none",
)

dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference")

with self.assertRaises(ValueError):
BCOTrainer(
model=self.model,
ref_model=self.model, # ref_model can't be the same as model
args=training_args,
tokenizer=self.tokenizer,
train_dataset=dummy_dataset["train"],
)

def test_tokenize_and_process_tokens(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = BCOConfig(
22 changes: 21 additions & 1 deletion tests/test_dpo_trainer.py
@@ -327,6 +327,26 @@ def test_dpo_trainer_without_providing_ref_model(self, rpo_alpha, _):
if param.sum() != 0:
assert not torch.equal(param, new_param)

def test_dpo_trainer_with_ref_model_is_model(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = DPOConfig(
output_dir=tmp_dir,
per_device_train_batch_size=2,
max_steps=3,
report_to="none",
)

dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

with self.assertRaises(ValueError):
DPOTrainer(
model=self.model,
ref_model=self.model, # ref_model can't be the same as model
args=training_args,
tokenizer=self.tokenizer,
train_dataset=dummy_dataset["train"],
)

@require_peft
def test_dpo_trainer_without_providing_ref_model_with_lora(self):
from peft import LoraConfig
@@ -473,7 +493,7 @@ def test_tr_dpo_trainer(self):

trainer = DPOTrainer(
model=self.model,
ref_model=self.model,
ref_model=self.ref_model,
beta=0.1,
args=training_args,
tokenizer=self.tokenizer,
20 changes: 20 additions & 0 deletions tests/test_kto_trainer.py
@@ -101,6 +101,26 @@ def test_kto_trainer(self, name, loss_type, pre_compute, eval_dataset):
if param.sum() != 0:
self.assertFalse(torch.equal(param, new_param))

def test_kto_trainer_with_ref_model_is_model(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = KTOConfig(
output_dir=tmp_dir,
per_device_train_batch_size=2,
max_steps=3,
report_to="none",
)

dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference")

with self.assertRaises(ValueError):
KTOTrainer(
model=self.model,
ref_model=self.model, # ref_model can't be the same as model
args=training_args,
tokenizer=self.tokenizer,
train_dataset=dummy_dataset["train"],
)

def test_tokenize_and_process_tokens(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = KTOConfig(
20 changes: 20 additions & 0 deletions tests/test_online_dpo_trainer.py
@@ -86,6 +86,26 @@ def test_training_with_ref_model(self):
# Check if training loss is available
self.assertIn("train_loss", trainer.state.log_history[-1])

def test_ref_model_is_model(self):
with tempfile.TemporaryDirectory() as tmp_dir:
training_args = OnlineDPOConfig(
output_dir=tmp_dir,
per_device_train_batch_size=2,
max_steps=3,
report_to="none",
)

dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only")

with self.assertRaises(ValueError):
OnlineDPOTrainer(
model=self.model,
ref_model=self.model, # ref_model can't be the same as model
args=training_args,
tokenizer=self.tokenizer,
train_dataset=dummy_dataset["train"],
)

@require_peft
def test_training_with_peft(self):
lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
6 changes: 6 additions & 0 deletions trl/trainer/bco_trainer.py
@@ -336,6 +336,12 @@ def __init__(
if type(args) is TrainingArguments:
raise ValueError("Please use `BCOConfig` instead `TrainingArguments`.")

if ref_model is model:
raise ValueError(
"`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
"same as `model`, you must mass a copy of it, or `None` if you use peft."
)

if args.model_init_kwargs is None:
model_init_kwargs = {}
elif not isinstance(model, str):
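Reviewer note: the new guard relies on Python object identity (`is`), so it only rejects the case where the exact same module instance is passed for both arguments; an independently constructed copy with identical weights still passes. A minimal sketch of the distinction, using `gpt2` purely as a stand-in checkpoint:

```python
import copy

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # stand-in checkpoint

alias = model                 # same object: `alias is model` is True -> rejected
clone = copy.deepcopy(model)  # independent copy: `clone is model` is False -> accepted

assert alias is model
assert clone is not model  # identical weights, but a distinct object, so the guard passes
```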
6 changes: 6 additions & 0 deletions trl/trainer/dpo_trainer.py
@@ -452,6 +452,12 @@ def __init__(
reference_free: bool = False,
force_use_ref_model: bool = False,
):
if not isinstance(model, str) and ref_model is model:
raise ValueError(
"`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
"same as `model`, you must mass a copy of it, or `None` if you use peft."
)

if model_init_kwargs is not None:
warnings.warn(
"You passed `model_init_kwargs` to the DPOTrainer, the value you passed will override the one in the `DPOConfig`."
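For the offline trainers (BCO, DPO, KTO) the error message points at the two accepted patterns: pass an independent copy, or pass `None` when training with PEFT. A hedged sketch of both, using TRL's `create_reference_model` helper to build the frozen copy; the `gpt2` checkpoint and output dir are placeholders, and the dataset and kwargs mirror the tests above:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer, create_reference_model

model = AutoModelForCausalLM.from_pretrained("gpt2")  # stand-in checkpoint
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset("trl-internal-testing/zen", "standard_preference")
training_args = DPOConfig(output_dir="/tmp/dpo-out", report_to="none")

# Pattern 1: a detached, frozen copy of the policy as the reference model.
trainer = DPOTrainer(
    model=model,
    ref_model=create_reference_model(model),  # a deep copy, never `model` itself
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
)

# Pattern 2: ref_model=None (required when training a PEFT adapter);
# the trainer then handles the reference side itself.
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
)
```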
6 changes: 6 additions & 0 deletions trl/trainer/kto_trainer.py
@@ -319,6 +319,12 @@ def __init__(
if type(args) is TrainingArguments:
raise ValueError("Please use `KTOConfig` instead TrainingArguments.")

if not isinstance(model, str) and ref_model is model:
raise ValueError(
"`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
"same as `model`, you must mass a copy of it, or `None` if you use peft."
)

if args.model_init_kwargs is None:
model_init_kwargs = {}
elif not isinstance(model, str):
8 changes: 8 additions & 0 deletions trl/trainer/online_dpo_trainer.py
@@ -128,6 +128,14 @@ def __init__(
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
) -> None:
if ref_model is model:
raise ValueError(
"`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
"same as `model`, either omit the `ref_model` argument or pass `None`."
)

self.ref_model = ref_model

if reward_model is not None and judge is not None:
warnings.warn(
"Both `reward_model` and `judge` are provided. Please choose provide only one of them. "
6 changes: 6 additions & 0 deletions trl/trainer/ppov2_trainer.py
@@ -97,6 +97,12 @@ def __init__(
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
callbacks: Optional[List[TrainerCallback]] = None,
) -> None:
if ref_policy is policy:
raise ValueError(
"`policy` and `ref_policy` cannot be the same object. If you want `ref_policy` to be the "
"same as `policy`, you must mass a copy of it, or `None` if you use peft."
)

self.args = config
args = config
self.tokenizer = tokenizer
6 changes: 6 additions & 0 deletions trl/trainer/rloo_trainer.py
@@ -78,6 +78,12 @@ def __init__(
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
callbacks: Optional[List[TrainerCallback]] = None,
) -> None:
if ref_policy is policy:
raise ValueError(
"`policy` and `ref_policy` cannot be the same object. If you want `ref_policy` to be the "
"same as `policy`, you must mass a copy of it, or `None` if you use peft."
)

self.args = config
args = config
self.tokenizer = tokenizer
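Unlike the trainers above, `PPOv2Trainer` and `RLOOTrainer` take the pair as `policy`/`ref_policy`, so the straightforward way to satisfy the new guard is to load the checkpoint twice. A minimal sketch; `gpt2` is a stand-in checkpoint and the remaining trainer kwargs are elided:

```python
from transformers import AutoModelForCausalLM

# Two separate loads of the same checkpoint produce distinct objects,
# which is exactly what the `ref_policy is policy` guard requires.
policy = AutoModelForCausalLM.from_pretrained("gpt2")
ref_policy = AutoModelForCausalLM.from_pretrained("gpt2")

assert ref_policy is not policy  # distinct objects -> the check passes

# trainer = RLOOTrainer(config=config, policy=policy, ref_policy=ref_policy, ...)
```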