
Remove graph breaks for torch.compile() in padding free branch in DataCollatorForCompletionOnlyLM #2158

Merged Jan 6, 2025 · 29 commits
4472501
feat: Add info to batch in DataCollatorForCompletionOnlyLM
Abhishek-TAMU Oct 2, 2024
6cfa171
fix: formatting
Abhishek-TAMU Oct 2, 2024
a821ce0
feat: Add info to batch in DataCollatorForCompletionOnlyLM
Abhishek-TAMU Oct 2, 2024
fb669b6
fix: formatting
Abhishek-TAMU Oct 2, 2024
f4b1955
Merge branch 'huggingface:main' into collator_batch
Abhishek-TAMU Oct 14, 2024
1b7c060
Merge branch 'collator_batch' of github.com:Abhishek-TAMU/trl into co…
Abhishek-TAMU Oct 21, 2024
c3578f8
Merge branch 'main' into collator_batch
Abhishek-TAMU Oct 21, 2024
e83fc8a
fix: max_length_k to int
Abhishek-TAMU Oct 21, 2024
68554b1
fix:Added comments
Abhishek-TAMU Oct 21, 2024
2a7dd47
Merge remote-tracking branch 'trl/main' into collator_batch
Abhishek-TAMU Oct 30, 2024
b0a52e2
test cases
Abhishek-TAMU Oct 30, 2024
054a6ef
test cases
Abhishek-TAMU Oct 30, 2024
376ad21
test cases
Abhishek-TAMU Oct 30, 2024
9a08ea3
Merge remote-tracking branch 'trl/main' into collator_batch
Abhishek-TAMU Nov 12, 2024
a97045b
feat: Add info to batch in DataCollatorForCompletionOnlyLM
Abhishek-TAMU Oct 2, 2024
f31a780
fix: formatting
Abhishek-TAMU Oct 2, 2024
29ba8a3
feat: Add info to batch in DataCollatorForCompletionOnlyLM
Abhishek-TAMU Oct 2, 2024
d1441e1
test cases
Abhishek-TAMU Oct 30, 2024
d55a6e2
test cases
Abhishek-TAMU Oct 30, 2024
7dccc2d
test cases
Abhishek-TAMU Oct 30, 2024
5e5224e
unit test changes
Abhishek-TAMU Nov 12, 2024
1b434b0
unit test changes
Abhishek-TAMU Nov 12, 2024
ef1e304
Merge remote-tracking branch 'trl/main' into collator_batch
Abhishek-TAMU Nov 18, 2024
77894b1
style
qgallouedec Nov 19, 2024
911f60c
Merge branch 'main' into collator_batch
qgallouedec Nov 19, 2024
979f9f0
Merge branch 'main' into collator_batch
qgallouedec Dec 18, 2024
cebf936
Merge branch 'main' into collator_batch
qgallouedec Jan 6, 2025
ca8e153
add test
qgallouedec Jan 6, 2025
8c27e16
remove test
qgallouedec Jan 6, 2025
20 changes: 19 additions & 1 deletion tests/test_data_collator_completion_only.py
@@ -114,7 +114,7 @@ def test_padding_free(self):
inst1 = "### System: You are a helpful assistant.\n\n### User: How much is 2+2?\n\n### Assistant: 2+2 equals 4"
inst2 = "### System: You are a honest and helpful assistant.\n\n### User: What is the answer of 22x22?\n\n### Assistant: 22x22 equals 484"

- response_template = "\n### Assistant:"
+ response_template = "\n\n### Assistant:"
Review comment (Member): otherwise the template isn't found (\n\n is jointly tokenized)

collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
collator_paddingfree = DataCollatorForCompletionOnlyLM(
response_template, tokenizer=tokenizer, padding_free=True
@@ -143,3 +143,21 @@ def test_padding_free(self):
self.assertTrue((input_ids_remove_pad == batch_paddingfree["input_ids"]).all())
self.assertTrue((expected_position_ids == batch_paddingfree["position_ids"]).all())
self.assertTrue((expected_labels == batch_paddingfree["labels"]).all())

def test_data_collator_for_completion_only_lm(self):
# The tokenizer isn't used, but the collator requires one to be provided.
tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")

collator = DataCollatorForCompletionOnlyLM(tokenizer.decode(9999), tokenizer=tokenizer, padding_free=True)

tokenized_instruction = [
{"input_ids": [1, 2, 3, 9999, 4, 5], "attention_mask": [1, 1, 1, 1, 1, 1]},
{"input_ids": [6, 7, 8, 9, 9999, 10, 11], "attention_mask": [1, 1, 1, 1, 1, 1, 1]},
]
batch = collator(tokenized_instruction)

self.assertEqual(batch["position_ids"].tolist(), [[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6]]) # flat pos ids
self.assertEqual(batch["cu_seq_lens_q"].tolist(), [0, 6, 13]) # start idx of each seq + total number of tokens
self.assertEqual(batch["cu_seq_lens_k"].tolist(), [0, 6, 13]) # idem
self.assertEqual(batch["max_length_k"], 7) # max length in batch, here 7 (second sequence)
self.assertEqual(batch["max_length_q"], 7) # idem
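The template fix above (adding the second \n) matters because BPE-style tokenizers can merge adjacent newlines into a single token, so the ids produced by tokenizing "\n### Assistant:" on its own never appear as a subsequence of the full prompt's ids. A toy sketch of that failure mode, using an invented vocabulary and ids purely for illustration:

```python
# Toy "tokenizer" with an invented merge table; ids are illustrative only.
# Because "\n\n" has its own id, the ids of "\n### Assistant:" tokenized
# alone differ from the same characters inside a longer text.
def toy_encode(text):
    merges = {"\n\n": 0, "\n": 1, "### Assistant:": 2, "hello": 3}
    ids, i = [], 0
    while i < len(text):
        for piece, tid in merges.items():  # ordered longest-newline-first
            if text.startswith(piece, i):
                ids.append(tid)
                i += len(piece)
                break
        else:
            i += 1  # skip characters outside the toy vocab
    return ids

def contains(haystack, needle):
    # Subsequence search, as a collator does to locate the response template.
    return any(haystack[i:i + len(needle)] == needle
               for i in range(len(haystack) - len(needle) + 1))

full = toy_encode("hello\n\n### Assistant:")   # [3, 0, 2] — "\n\n" merged
tmpl_single = toy_encode("\n### Assistant:")    # [1, 2]
tmpl_double = toy_encode("\n\n### Assistant:")  # [0, 2]

print(contains(full, tmpl_single))  # False: single "\n" id never occurs
print(contains(full, tmpl_double))  # True: matches the merged token
```

The same mismatch happens with real subword tokenizers whenever the template's leading characters get merged with preceding text, which is why the test's template must include both newlines.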
19 changes: 19 additions & 0 deletions trl/trainer/utils.py
@@ -211,6 +211,25 @@ def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> d
batch["labels"] = batch["labels"][attn_mask.bool()].unsqueeze(0)
batch["labels"][batch["position_ids"] == 0] = self.ignore_index

# Calculate cumulative sequence lengths for queries and keys to prevent graph breaks during further computations.
flattened_position_ids = batch["position_ids"].flatten()
indices_q = torch.arange(
flattened_position_ids.size(0), device=flattened_position_ids.device, dtype=torch.int32
)
batch["cu_seq_lens_q"] = torch.cat(
(
indices_q[flattened_position_ids == 0],
torch.tensor(
flattened_position_ids.size(), device=flattened_position_ids.device, dtype=torch.int32
),
)
)
batch["cu_seq_lens_k"] = batch["cu_seq_lens_q"]

# Determine maximum sequence lengths to prevent graph breaks during further computations.
batch["max_length_k"] = flattened_position_ids.max().item() + 1
batch["max_length_q"] = batch["max_length_k"]

return batch
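The addition above derives the cumulative sequence lengths and max length directly from the packed position ids: each position-id reset to 0 marks a sequence start, and appending the total token count closes the last interval. A standalone sketch of that computation, run on the two packed sequences (lengths 6 and 7) from the test above:

```python
import torch

# Packed batch: two sequences concatenated, position ids restarting at 0
# for each sequence (as the padding-free collator produces).
position_ids = torch.tensor([[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6]])

flat = position_ids.flatten()
indices = torch.arange(flat.size(0), dtype=torch.int32)

# Sequence boundaries: every index where position_ids resets to 0,
# plus the total token count as the final boundary.
cu_seq_lens = torch.cat((indices[flat == 0],
                         torch.tensor(flat.size(), dtype=torch.int32)))

# Longest packed sequence: largest position id + 1.
max_length = flat.max().item() + 1

print(cu_seq_lens.tolist())  # [0, 6, 13]
print(max_length)            # 7
```

Precomputing these in the collator (rather than inside the model's forward pass) is what avoids the data-dependent control flow that forces torch.compile() graph breaks.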

