Skip to content

Commit

Permalink
Merge branch 'mlperf_features' into dev/madamczyk/polynomial_bucketing
Browse files Browse the repository at this point in the history
  • Loading branch information
madamczykhabana authored Feb 21, 2025
2 parents f021c30 + 6eeefdd commit 37d74e0
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion vllm/worker/hpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2934,6 +2934,11 @@ def _patch_prev_output(self):
model_input = self.cached_step_inputs.pop(0)
delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(-1).tolist()
ctx = model_input.async_callback.keywords["ctx"]
# If there is no output to patch (usually the case when we're starting a new request
# after all in-flight requests have completed), return early.
# Note that the oldest cached_step_inputs/outputs entries have already been popped above, as required.
if len(ctx.output_queue) == 0:
return
assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!'
output_data = ctx.output_queue[0]
assert len(output_data.outputs) == 1
Expand All @@ -2945,4 +2950,4 @@ def _patch_prev_output(self):
# This is a hack. Assigning output_token_ids triggers
# a cache recomputation and we only need to update the last token
seq_data.output_token_ids_array[-1] = real_out
seq_data._cached_all_token_ids[-1] = real_out
seq_data._cached_all_token_ids[-1] = real_out

0 comments on commit 37d74e0

Please sign in to comment.