From b0d39376d18b940cbc99b7a9fcd5f1be59e59d54 Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf <hassouna97.ma@gmail.com>
Date: Thu, 9 May 2024 08:02:31 +0300
Subject: [PATCH] [Frontend] Add tok/s speed metric to LLM class when using
 tqdm (#4400)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 vllm/entrypoints/llm.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 8737a379f6f2c..e771b495d8b48 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -247,17 +247,25 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
         # Initialize tqdm.
         if use_tqdm:
             num_requests = self.llm_engine.get_num_unfinished_requests()
-            pbar = tqdm(total=num_requests,
-                        desc="Processed prompts",
-                        dynamic_ncols=True)
+            pbar = tqdm(
+                total=num_requests,
+                desc="Processed prompts",
+                dynamic_ncols=True,
+                postfix=f"Generation Speed: {0:.2f} toks/s",
+            )
         # Run the engine.
         outputs: List[RequestOutput] = []
+        total_toks = 0
         while self.llm_engine.has_unfinished_requests():
             step_outputs = self.llm_engine.step()
             for output in step_outputs:
                 if output.finished:
                     outputs.append(output)
                     if use_tqdm:
+                        total_toks += (sum(
+                            len(stp.token_ids) for stp in output.outputs))
+                        spd = total_toks / pbar.format_dict["elapsed"]
+                        pbar.postfix = f"Generation Speed: {spd:.2f} toks/s"
                         pbar.update(1)
         if use_tqdm:
             pbar.close()
@@ -265,4 +273,4 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
         # This is necessary because some requests may be finished earlier than
         # its previous requests.
         outputs = sorted(outputs, key=lambda x: int(x.request_id))
-        return outputs
\ No newline at end of file
+        return outputs