From 52a9d62163281052fb6532f3ecc01c4966f8e8c7 Mon Sep 17 00:00:00 2001 From: Saehanseul Date: Wed, 19 Feb 2025 03:48:52 +0200 Subject: [PATCH 1/4] fix assert error in second request to LLMEngine --- vllm/worker/hpu_model_runner.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index a11cd318d8949..b870583a4389f 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2783,13 +2783,14 @@ def _patch_prev_output(self): ctx = model_input.async_callback.keywords["ctx"] assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' output_data = ctx.output_queue[0] - assert len(output_data.outputs) == 1 - for fake_out, real_out in zip(output_data.outputs[0], delayed_output): - fake_out.samples[0].output_token = real_out - for sg, real_out in zip(output_data.seq_group_metadata_list, delayed_output): - assert len(sg.seq_data) == 1 - seq_data = list(sg.seq_data.values())[0] - # This is a hack. Assigning output_token_ids triggers - # a cache recomputation and we only need to update the last token - seq_data.output_token_ids_array[-1] = real_out - seq_data._cached_all_token_ids[-1] = real_out + #assert len(output_data.outputs) == 1 + if len(output_data.outputs) > 0: + for fake_out, real_out in zip(output_data.outputs[0], delayed_output): + fake_out.samples[0].output_token = real_out + for sg, real_out in zip(output_data.seq_group_metadata_list, delayed_output): + assert len(sg.seq_data) == 1 + seq_data = list(sg.seq_data.values())[0] + # This is a hack. 
Assigning output_token_ids triggers + # a cache recomputation and we only need to update the last token + seq_data.output_token_ids_array[-1] = real_out + seq_data._cached_all_token_ids[-1] = real_out From 931789a26289a893b523987bcd30ca6767a34a20 Mon Sep 17 00:00:00 2001 From: Saehanseul Date: Wed, 19 Feb 2025 20:01:01 +0200 Subject: [PATCH 2/4] delayed sampling new request fix --- vllm/worker/hpu_model_runner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b870583a4389f..4a72a36f2ddf2 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2781,10 +2781,10 @@ def _patch_prev_output(self): model_input = self.cached_step_inputs.pop(0) delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(-1).tolist() ctx = model_input.async_callback.keywords["ctx"] - assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' - output_data = ctx.output_queue[0] - #assert len(output_data.outputs) == 1 - if len(output_data.outputs) > 0: + #assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' 
+ if len(ctx.output_queue) > 0: + output_data = ctx.output_queue[0] + assert len(output_data.outputs) == 1 for fake_out, real_out in zip(output_data.outputs[0], delayed_output): fake_out.samples[0].output_token = real_out for sg, real_out in zip(output_data.seq_group_metadata_list, delayed_output): From 59fff8ac4d4cbf5937a9c973c72f1bdcce44c6cc Mon Sep 17 00:00:00 2001 From: Saehanseul Date: Wed, 19 Feb 2025 23:24:02 +0200 Subject: [PATCH 3/4] delayed sampling new request fix #3 --- vllm/worker/hpu_model_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4a72a36f2ddf2..c910691cee66b 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2781,8 +2781,9 @@ def _patch_prev_output(self): model_input = self.cached_step_inputs.pop(0) delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(-1).tolist() ctx = model_input.async_callback.keywords["ctx"] - #assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' - if len(ctx.output_queue) > 0: + if len(ctx.output_queue) == 0: + self.cached_step_inputs.pop(0) + elif len(ctx.output_queue) == 1: output_data = ctx.output_queue[0] assert len(output_data.outputs) == 1 for fake_out, real_out in zip(output_data.outputs[0], delayed_output): @@ -2794,3 +2795,5 @@ def _patch_prev_output(self): # a cache recomputation and we only need to update the last token seq_data.output_token_ids_array[-1] = real_out seq_data._cached_all_token_ids[-1] = real_out + else: + assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' 
From 47f56e89e0cf7657a289f90db576ccd439749a03 Mon Sep 17 00:00:00 2001 From: attafosu Date: Thu, 20 Feb 2025 00:13:44 +0200 Subject: [PATCH 4/4] Sanitize fix for delayed sampling error --- vllm/worker/hpu_model_runner.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c910691cee66b..058d765b49aab 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2781,19 +2781,20 @@ def _patch_prev_output(self): model_input = self.cached_step_inputs.pop(0) delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(-1).tolist() ctx = model_input.async_callback.keywords["ctx"] - if len(ctx.output_queue) == 0: - self.cached_step_inputs.pop(0) - elif len(ctx.output_queue) == 1: - output_data = ctx.output_queue[0] - assert len(output_data.outputs) == 1 - for fake_out, real_out in zip(output_data.outputs[0], delayed_output): - fake_out.samples[0].output_token = real_out - for sg, real_out in zip(output_data.seq_group_metadata_list, delayed_output): - assert len(sg.seq_data) == 1 - seq_data = list(sg.seq_data.values())[0] - # This is a hack. Assigning output_token_ids triggers - # a cache recomputation and we only need to update the last token - seq_data.output_token_ids_array[-1] = real_out - seq_data._cached_all_token_ids[-1] = real_out - else: - assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' + # If there is no output to patch (usually because a new request is starting after all in-flight requests have completed), + # return early. + # Note: the head entries of cached_step_inputs/outputs were already popped above, as required. + if len(ctx.output_queue) == 0: + return + assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' 
+ output_data = ctx.output_queue[0] + assert len(output_data.outputs) == 1 + for fake_out, real_out in zip(output_data.outputs[0], delayed_output): + fake_out.samples[0].output_token = real_out + for sg, real_out in zip(output_data.seq_group_metadata_list, delayed_output): + assert len(sg.seq_data) == 1 + seq_data = list(sg.seq_data.values())[0] + # This is a hack. Assigning output_token_ids triggers + # a cache recomputation and we only need to update the last token + seq_data.output_token_ids_array[-1] = real_out + seq_data._cached_all_token_ids[-1] = real_out \ No newline at end of file