[Bugfix] Add kv_scale input parameter to CPU backend #3840

Merged 3 commits on Apr 4, 2024
Changes from 2 commits
14 changes: 8 additions & 6 deletions vllm/attention/ops/paged_attn.py
@@ -73,16 +73,17 @@ def write_to_paged_cache(
         value_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
         kv_cache_dtype: str,
-        kv_scale: float,
+        kv_scale: Optional[float] = None,
     ) -> None:
+        optional_args = [x for x in (kv_scale, ) if x is not None]
         cache_ops.reshape_and_cache(
             key,
             value,
             key_cache,
             value_cache,
             slot_mapping.flatten(),
             kv_cache_dtype,
-            kv_scale,
+            *optional_args,
         )
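The change above makes `kv_scale` optional and forwards it via a splat: when no value is supplied, `optional_args` is empty and the argument is simply not passed, so the same Python call site works whether or not the underlying op's signature accepts `kv_scale`. A minimal, self-contained sketch of this pattern, using hypothetical stand-in functions rather than the real `cache_ops` bindings:

```python
from typing import Optional

def reshape_and_cache_with_scale(key, value, kv_cache_dtype: str, kv_scale: float) -> None:
    # Stand-in for a binding whose signature includes kv_scale.
    print(f"op with kv_scale: dtype={kv_cache_dtype}, kv_scale={kv_scale}")

def reshape_and_cache_without_scale(key, value, kv_cache_dtype: str) -> None:
    # Stand-in for a binding compiled without a kv_scale parameter.
    print(f"op without kv_scale: dtype={kv_cache_dtype}")

def write_to_cache_compat(op, key, value, kv_cache_dtype: str,
                          kv_scale: Optional[float] = None) -> None:
    # Forward kv_scale only when it was actually supplied, so the call
    # matches either stand-in signature above.
    optional_args = [x for x in (kv_scale, ) if x is not None]
    op(key, value, kv_cache_dtype, *optional_args)

write_to_cache_compat(reshape_and_cache_with_scale, "k", "v", "fp8_e5m2", kv_scale=1.0)
write_to_cache_compat(reshape_and_cache_without_scale, "k", "v", "auto")  # kv_scale omitted
```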

@@ -97,10 +98,11 @@ def forward_decode(
         num_kv_heads: int,
         scale: float,
         alibi_slopes: Optional[torch.Tensor],
-        kv_scale,
+        kv_scale: Optional[float] = None,
     ) -> torch.Tensor:
-        output = torch.empty_like(query)
+        optional_args = [x for x in (kv_scale, ) if x is not None]
 
+        output = torch.empty_like(query)
         block_size = value_cache.shape[3]
         num_seqs, num_heads, head_size = query.shape
         max_num_partitions = ((max_context_len + _PARTITION_SIZE - 1) //
@@ -129,7 +131,7 @@ def forward_decode(
                 max_context_len,
                 alibi_slopes,
                 kv_cache_dtype,
-                kv_scale,
+                *optional_args,
             )
         else:
             # Run PagedAttention V2.
@@ -161,7 +163,7 @@ def forward_decode(
                 max_context_len,
                 alibi_slopes,
                 kv_cache_dtype,
-                kv_scale,
+                *optional_args,
             )
         return output

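For context on the unchanged `max_num_partitions` line shown in the second hunk: it is a plain ceiling division of the maximum context length by the kernel partition size, done in integer arithmetic. A small illustrative check; the `_PARTITION_SIZE = 512` value is an assumption for this sketch, not part of this diff:

```python
_PARTITION_SIZE = 512  # assumed partition size for illustration only

def max_num_partitions(max_context_len: int) -> int:
    # Integer ceiling division: ceil(max_context_len / _PARTITION_SIZE).
    return (max_context_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE

assert max_num_partitions(1) == 1
assert max_num_partitions(512) == 1
assert max_num_partitions(513) == 2
assert max_num_partitions(8192) == 16
```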