Merge branch 'mlperf_features' into dev/madamczyk/polynomial_bucketing

HabanaAI · Feb 21, 2025 · 37d74e0 · 37d74e0
2 parents f021c30 + 6eeefdd
commit 37d74e0
Showing 1 changed file with 6 additions and 1 deletion.
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
@@ -2934,6 +2934,11 @@ def _patch_prev_output(self):
         model_input = self.cached_step_inputs.pop(0)
         delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(-1).tolist()
         ctx = model_input.async_callback.keywords["ctx"]
+        # If there is no output to patch (usually because a new request is
+        # starting after all in-flight requests have completed), return early.
+        # The cached step input and output have already been popped above.
+        if len(ctx.output_queue) == 0: 
+            return
         assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!'
         output_data = ctx.output_queue[0]
         assert len(output_data.outputs) == 1
@@ -2945,4 +2950,4 @@ def _patch_prev_output(self):
             # This is a hack. Assigning output_token_ids triggers
             # a cache recomputation and we only need to update the last token
             seq_data.output_token_ids_array[-1] = real_out
-            seq_data._cached_all_token_ids[-1] = real_out
+            seq_data._cached_all_token_ids[-1] = real_out