Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
[Frontend] add tok/s speed metric to llm class when using tqdm (vllm-project#4400)
Browse files (browse the repository at this point in the history)

Co-authored-by: Michael Goin <[email protected]>
  • Loading branch information
2 people authored and robertgshaw2-redhat committed May 19, 2024
1 parent 32314e5 commit b0d3937
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions vllm/entrypoints/llm.py
Original file line number Diff line number Diff line change
@@ -247,22 +247,30 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
# Initialize tqdm.
if use_tqdm:
num_requests = self.llm_engine.get_num_unfinished_requests()
pbar = tqdm(total=num_requests,
desc="Processed prompts",
dynamic_ncols=True)
pbar = tqdm(
total=num_requests,
desc="Processed prompts",
dynamic_ncols=True,
postfix=f"Generation Speed: {0:.2f} toks/s",
)
# Run the engine.
outputs: List[RequestOutput] = []
total_toks = 0
while self.llm_engine.has_unfinished_requests():
step_outputs = self.llm_engine.step()
for output in step_outputs:
if output.finished:
outputs.append(output)
if use_tqdm:
total_toks += (sum(
len(stp.token_ids) for stp in output.outputs))
spd = total_toks / pbar.format_dict["elapsed"]
pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
pbar.update(1)
if use_tqdm:
pbar.close()
# Sort the outputs by request ID.
# This is necessary because some requests may be finished earlier than
# its previous requests.
outputs = sorted(outputs, key=lambda x: int(x.request_id))
return outputs
return outputs

0 comments on commit b0d3937

Please sign in to comment.