huggingface · ArthurZucker · Jul 23, 2024 · Jul 9, 2024 · Jul 10, 2024 · Jul 11, 2024
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -862,17 +862,28 @@ def update(
             k_out.copy_(key_states)
             v_out.copy_(value_states)
         else:
-            k_out[:, :, cache_position] = key_states
-            v_out[:, :, cache_position] = value_states
-
+            # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to
+            # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place
+            # operation, that avoids copies and uses less memory.
+            k_out.index_copy_(2, cache_position, key_states)
+            v_out.index_copy_(2, cache_position, value_states)
         return k_out, v_out
 
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
         """Returns the sequence length of the cached states that were seen by the model."""
         # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
         # limit the check to the first batch member and head dimension.
         # TODO: deprecate this function in favor of `cache_position`
-        return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum()
+        key_cache = self.key_cache[layer_idx]
+        device = key_cache.device
+
+        # index_select(dim, index) performs the same operation as item = tensor[..., index, ...]
+        # but it is used for better generality and flexibility.
+        # For more information, refer to: https://pytorch.org/cppdocs/notes/tensor_indexing.html
+        item = key_cache.index_select(0, torch.tensor(0, device=device))
+        head = item.index_select(1, torch.tensor(0, device=device))
+
+        return head.any(dim=-1).sum()
 
     def get_max_length(self) -> Optional[int]:
         """Returns the maximum sequence length of the cached states."""