From aebf20bb766ace0de6727666f50749ad719018f6 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Mon, 11 Mar 2024 16:20:18 -0400
Subject: [PATCH] Benchmarking : Misc updates (#95)

SUMMARY:
Fixes and Quality-of-life changes
- Fix the vllm engine `temperature` to 0.0 so that text generation is
  deterministic
- Fix the time-per-output-token metric computation (see the sketch after the
  test plan)
- Add num_warmup_prompts and log_model_io options to benchmark throughput

TEST PLAN:
Manual testing
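For reviewers, a minimal sketch of the corrected time-per-output-token (TPOT)
computation. The helper name and example numbers below are illustrative and
not part of this patch; the actual change lives in `calculate_metrics` in
`benchmark_serving.py`. The idea: the first token's cost is already captured
by TTFT, so the remaining latency is averaged over the other
`output_len - 1` tokens, and single-token requests are skipped.

```python
from typing import Optional


def time_per_output_token(latency_s: float, ttft_s: float,
                          output_len: int) -> Optional[float]:
    """Per-request decode time per output token.

    TTFT already accounts for the first token, so the remaining latency
    is spread over the other (output_len - 1) tokens. Requests that
    produced a single token are skipped (nothing to average over).
    """
    if output_len <= 1:
        return None
    return (latency_s - ttft_s) / (output_len - 1)


# Example: 2.0 s total latency, 0.5 s TTFT, 11 output tokens
# -> (2.0 - 0.5) / 10 = 0.15 s/token
assert time_per_output_token(2.0, 0.5, 11) == 0.15
```

The new throughput options are additive: an existing `benchmark_throughput`
invocation should run unchanged, and appending something like
`--num-warmup-prompts 100 --log-model-io` opts into a shorter warmup and the
model input/output dump (the exact command line depends on your setup).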
---------

Co-authored-by: Varun Sundar Rabindranath
---
 .../scripts/backend_request_func.py          |  4 +-
 .../benchmarks/scripts/benchmark_serving.py  |  9 ++--
 .../scripts/benchmark_throughput.py          | 54 +++++++++++--------
 neuralmagic/benchmarks/scripts/common.py     | 28 ++++++++--
 4 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/neuralmagic/benchmarks/scripts/backend_request_func.py b/neuralmagic/benchmarks/scripts/backend_request_func.py
index 404211876bc59..078cfd1c6a7fc 100644
--- a/neuralmagic/benchmarks/scripts/backend_request_func.py
+++ b/neuralmagic/benchmarks/scripts/backend_request_func.py
@@ -111,7 +111,9 @@ async def async_request_vllm(
             "n": 1,
             "best_of": request_func_input.best_of,
             "use_beam_search": request_func_input.use_beam_search,
-            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            # TODO (varun) : Make temperature configurable
+            #"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            "temperature": 0.0,
             "top_p": 1.0,
             "max_tokens": request_func_input.output_len,
             "ignore_eos": True,
diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py
index 910d0754d2228..8d8b187e88db4 100644
--- a/neuralmagic/benchmarks/scripts/benchmark_serving.py
+++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py
@@ -33,8 +33,7 @@
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_benchmark_io
-# TODO (move this to scripts)
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_serving_request_io
 from .datasets_registry import get_dataset, DatasetArgs
 
 from neuralmagic.benchmarks.scripts.backend_request_func import (
@@ -100,7 +99,9 @@ def calculate_metrics(
             total_output += output_len
             total_input += input_requests[i][1]
             latencies.append(outputs[i].latency)
-            tpots.append((outputs[i].latency - outputs[i].ttft) / output_len)
+            if output_len > 1:
+                tpots.append(
+                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             ttfts.append(outputs[i].ttft)
             completed += 1
 
@@ -167,7 +168,7 @@ async def benchmark(backend: str, api_url: str, model_id: str,
 
     # Dump model i/o
     if log_model_io:
-        print_benchmark_io(outputs)
+        print_serving_request_io(input_requests, outputs)
 
     metrics = calculate_metrics(
         input_requests=input_requests,
diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py
index 0e3e0c36c3756..5115f63bac001 100644
--- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py
+++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py
@@ -13,7 +13,7 @@
 from pathlib import Path
 from typing import List, Optional, Tuple
 from transformers import AutoTokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs
 from .datasets_registry import get_dataset, DatasetArgs
 
 
@@ -25,21 +25,21 @@ def get_tensor_parallel_size(args: argparse.Namespace) -> int:
     return tensor_parallel_size
 
 
-def run_vllm(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
-    n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    sparsity: Optional[str],
-) -> float:
+def run_vllm(requests: List[Tuple[str, int, int]],
+             model: str,
+             tokenizer: str,
+             quantization: Optional[str],
+             tensor_parallel_size: int,
+             seed: int,
+             n: int,
+             use_beam_search: bool,
+             trust_remote_code: bool,
+             dtype: str,
+             max_model_len: Optional[int],
+             enforce_eager: bool,
+             sparsity: Optional[str],
+             num_warmup_prompts: int,
+             log_model_io: bool = False) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
         model=model,
@@ -53,13 +53,15 @@ def run_vllm(
         enforce_eager=enforce_eager,
     )
 
-    warmup_vllm_engine(engine=llm, model=model, num_prompts=1000)
+    warmup_vllm_engine(engine=llm, model=model, num_prompts=num_warmup_prompts)
 
     # Add the requests to the engine.
     for prompt, _, output_len in requests:
         sampling_params = SamplingParams(
             n=n,
-            temperature=0.0 if use_beam_search else 1.0,
+            # TODO (varun) Make temperature configurable
+            #temperature=0.0 if use_beam_search else 1.0,
+            temperature=0.0,
             top_p=1.0,
             use_beam_search=use_beam_search,
             ignore_eos=True,
@@ -74,9 +76,12 @@ def run_vllm(
 
     start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
-    llm._run_engine(use_tqdm=True)
+    outputs = llm._run_engine(use_tqdm=True)
     end = time.perf_counter()
 
+    if log_model_io:
+        print_request_outputs(outputs)
+
     return end - start
 
 
@@ -96,7 +101,7 @@ def main(args: argparse.Namespace):
                                    num_samples=args.num_prompts,
                                    max_len=2048,
                                    seed=42,
-                               ))
+                                   fixed_output_len=args.output_len))
     else:
         # Make a synthetic dataset.
         requests = generate_synthetic_requests(args.input_len, args.output_len,
@@ -114,7 +119,9 @@ def main(args: argparse.Namespace):
                             args.dtype,
                             args.max_model_len,
                             args.enforce_eager,
-                            sparsity=args.sparsity)
+                            sparsity=args.sparsity,
+                            num_warmup_prompts=args.num_warmup_prompts,
+                            log_model_io=args.log_model_io)
 
     total_prompt_tokens = sum(prompt_len for _, prompt_len, _ in requests)
     total_output_tokens = sum(output_len for _, _, output_len in requests)
@@ -189,10 +196,15 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1000,
                         help="Number of prompts to process.")
+    parser.add_argument("--num-warmup-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to do warmups with.")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument("--log-model-io", action="store_true")
     parser.add_argument(
         '--max-model-len',
         type=int,
diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py
index 5d5b482a0e3c9..1c756692f09ba 100644
--- a/neuralmagic/benchmarks/scripts/common.py
+++ b/neuralmagic/benchmarks/scripts/common.py
@@ -12,7 +12,7 @@
 from vllm.outputs import RequestOutput
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR
-from .backend_request_func import RequestFuncInput, async_request_vllm
+from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm
 from ...tools.call_cmd import call_cmd
 
 
@@ -204,9 +204,27 @@ def instantiate_benchmark_results_dict(benchmarking_script_name: str,
     return result_dict
 
 
-def print_benchmark_io(results: List[RequestOutput]) -> None:
+def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int,
+                  n_output_tokens: int) -> str:
+    return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n"
+
+
+def print_request_outputs(results: List[RequestOutput]) -> None:
     for result in results:
         output = result.outputs[0]
-        print(
-            f"\n\n inputs({len(result.prompt_token_ids)}): {result.prompt}\n output({len(output.token_ids)}): {output.text}"
-        )
+        io_log = format_io_log(result.prompt, output.text,
+                               len(result.prompt_token_ids),
+                               len(output.token_ids))
+        print(f"\n{io_log}")
+
+
+def print_serving_request_io(inputs: List[Tuple[str, int, int]],
+                             outputs: List[RequestFuncOutput]) -> None:
+    """
+    inputs: list of tuples where the tuple is [prompt, prompt_length, output_length],
+    outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py)
+    Format and print the inputs and outputs.
+    """
+    for i, o in zip(inputs, outputs):
+        io_log = format_io_log(i[0], o.generated_text, i[1], i[2])
+        print(f"\n{io_log}")