From aebf20bb766ace0de6727666f50749ad719018f6 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Mon, 11 Mar 2024 16:20:18 -0400
Subject: [PATCH] Benchmarking : Misc updates (#95)

SUMMARY:
Fixes and Quality-of-life changes
- Fix the vllm engine `temperature` to 0.0 so that text generation is
  deterministic
- Fix the time-per-output-token metric computation (see the sketch after the
  test plan)
- Add num_warmup_prompts and log_model_io options to benchmark throughput

TEST PLAN:
Manual testing
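For reviewers, a minimal sketch of the corrected time-per-output-token (TPOT)
computation. The helper name and example numbers below are illustrative and
not part of this patch; the actual change lives in `calculate_metrics` in
`benchmark_serving.py`. The idea: the first token's cost is already captured
by TTFT, so the remaining latency is averaged over the other
`output_len - 1` tokens, and single-token requests are skipped.

```python
from typing import Optional


def time_per_output_token(latency_s: float, ttft_s: float,
                          output_len: int) -> Optional[float]:
    """Per-request decode time per output token.

    TTFT already accounts for the first token, so the remaining latency
    is spread over the other (output_len - 1) tokens. Requests that
    produced a single token are skipped (nothing to average over).
    """
    if output_len <= 1:
        return None
    return (latency_s - ttft_s) / (output_len - 1)


# Example: 2.0 s total latency, 0.5 s TTFT, 11 output tokens
# -> (2.0 - 0.5) / 10 = 0.15 s/token
assert time_per_output_token(2.0, 0.5, 11) == 0.15
```

The new throughput options are additive: an existing `benchmark_throughput`
invocation should run unchanged, and appending something like
`--num-warmup-prompts 100 --log-model-io` opts into a shorter warmup and the
model input/output dump (the exact command line depends on your setup).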
---------

Co-authored-by: Varun Sundar Rabindranath
---
 .../scripts/backend_request_func.py          |  4 +-
 .../benchmarks/scripts/benchmark_serving.py  |  9 ++--
 .../scripts/benchmark_throughput.py          | 54 +++++++++++--------
 neuralmagic/benchmarks/scripts/common.py     | 28 ++++++++--
 4 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/neuralmagic/benchmarks/scripts/backend_request_func.py b/neuralmagic/benchmarks/scripts/backend_request_func.py
index 404211876bc59..078cfd1c6a7fc 100644
--- a/neuralmagic/benchmarks/scripts/backend_request_func.py
+++ b/neuralmagic/benchmarks/scripts/backend_request_func.py
@@ -111,7 +111,9 @@ async def async_request_vllm(
             "n": 1,
             "best_of": request_func_input.best_of,
             "use_beam_search": request_func_input.use_beam_search,
-            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            # TODO (varun) : Make temperature configurable
+            #"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            "temperature": 0.0,
             "top_p": 1.0,
             "max_tokens": request_func_input.output_len,
             "ignore_eos": True,
diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py
index 910d0754d2228..8d8b187e88db4 100644
--- a/neuralmagic/benchmarks/scripts/benchmark_serving.py
+++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py
@@ -33,8 +33,7 @@
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_benchmark_io
-# TODO (move this to scripts)
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_serving_request_io
 from .datasets_registry import get_dataset, DatasetArgs
 
 from neuralmagic.benchmarks.scripts.backend_request_func import (
@@ -100,7 +99,9 @@ def calculate_metrics(
             total_output += output_len
             total_input += input_requests[i][1]
             latencies.append(outputs[i].latency)
-            tpots.append((outputs[i].latency - outputs[i].ttft) / output_len)
+            if output_len > 1:
+                tpots.append(
+                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             ttfts.append(outputs[i].ttft)
             completed += 1
 
@@ -167,7 +168,7 @@ async def benchmark(backend: str, api_url: str, model_id: str,
 
     # Dump model i/o
     if log_model_io:
-        print_benchmark_io(outputs)
+        print_serving_request_io(input_requests, outputs)
 
     metrics = calculate_metrics(
         input_requests=input_requests,
diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py
index 0e3e0c36c3756..5115f63bac001 100644
--- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py
+++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py
@@ -13,7 +13,7 @@
 from pathlib import Path
 from typing import List, Optional, Tuple
 from transformers import AutoTokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs
 from .datasets_registry import get_dataset, DatasetArgs
 
 
@@ -25,21 +25,21 @@ def get_tensor_parallel_size(args: argparse.Namespace) -> int:
     return tensor_parallel_size
 
 
-def run_vllm(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
-    n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    sparsity: Optional[str],
-) -> float:
+def run_vllm(requests: List[Tuple[str, int, int]],
+             model: str,
+             tokenizer: str,
+             quantization: Optional[str],
+             tensor_parallel_size: int,
+             seed: int,
+             n: int,
+             use_beam_search: bool,
+             trust_remote_code: bool,
+             dtype: str,
+             max_model_len: Optional[int],
+             enforce_eager: bool,
+             sparsity: Optional[str],
+             num_warmup_prompts: int,
+             log_model_io: bool = False) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
         model=model,
@@ -53,13 +53,15 @@ def run_vllm(
         enforce_eager=enforce_eager,
     )
 
-    warmup_vllm_engine(engine=llm, model=model, num_prompts=1000)
+    warmup_vllm_engine(engine=llm, model=model, num_prompts=num_warmup_prompts)
 
     # Add the requests to the engine.
     for prompt, _, output_len in requests:
         sampling_params = SamplingParams(
             n=n,
-            temperature=0.0 if use_beam_search else 1.0,
+            # TODO (varun) Make temperature configurable
+            #temperature=0.0 if use_beam_search else 1.0,
+            temperature=0.0,
             top_p=1.0,
             use_beam_search=use_beam_search,
             ignore_eos=True,
@@ -74,9 +76,12 @@ def run_vllm(
 
     start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
-    llm._run_engine(use_tqdm=True)
+    outputs = llm._run_engine(use_tqdm=True)
     end = time.perf_counter()
 
+    if log_model_io:
+        print_request_outputs(outputs)
+
     return end - start
 
 
@@ -96,7 +101,7 @@ def main(args: argparse.Namespace):
                                    num_samples=args.num_prompts,
                                    max_len=2048,
                                    seed=42,
-                               ))
+                                   fixed_output_len=args.output_len))
     else:
         # Make a synthetic dataset.
         requests = generate_synthetic_requests(args.input_len, args.output_len,
@@ -114,7 +119,9 @@ def main(args: argparse.Namespace):
                             args.dtype,
                             args.max_model_len,
                             args.enforce_eager,
-                            sparsity=args.sparsity)
+                            sparsity=args.sparsity,
+                            num_warmup_prompts=args.num_warmup_prompts,
+                            log_model_io=args.log_model_io)
 
     total_prompt_tokens = sum(prompt_len for _, prompt_len, _ in requests)
     total_output_tokens = sum(output_len for _, _, output_len in requests)
@@ -189,10 +196,15 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1000,
                         help="Number of prompts to process.")
+    parser.add_argument("--num-warmup-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to do warmups with.")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument("--log-model-io", action="store_true")
     parser.add_argument(
         '--max-model-len',
         type=int,
diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py
index 5d5b482a0e3c9..1c756692f09ba 100644
--- a/neuralmagic/benchmarks/scripts/common.py
+++ b/neuralmagic/benchmarks/scripts/common.py
@@ -12,7 +12,7 @@
 from vllm.outputs import RequestOutput
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR
-from .backend_request_func import RequestFuncInput, async_request_vllm
+from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm
 from ...tools.call_cmd import call_cmd
 
 
@@ -204,9 +204,27 @@ def instantiate_benchmark_results_dict(benchmarking_script_name: str,
     return result_dict
 
 
-def print_benchmark_io(results: List[RequestOutput]) -> None:
+def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int,
+                  n_output_tokens: int) -> str:
+    return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n"
+
+
+def print_request_outputs(results: List[RequestOutput]) -> None:
     for result in results:
         output = result.outputs[0]
-        print(
-            f"\n\n inputs({len(result.prompt_token_ids)}): {result.prompt}\n output({len(output.token_ids)}): {output.text}"
-        )
+        io_log = format_io_log(result.prompt, output.text,
+                               len(result.prompt_token_ids),
+                               len(output.token_ids))
+        print(f"\n{io_log}")
+
+
+def print_serving_request_io(inputs: List[Tuple[str, int, int]],
+                             outputs: List[RequestFuncOutput]) -> None:
+    """
+    inputs: list of tuples where the tuple is [prompt, prompt_length, output_length],
+    outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py)
+    Format and print the inputs and outputs.
+    """
+    for i, o in zip(inputs, outputs):
+        io_log = format_io_log(i[0], o.generated_text, i[1], i[2])
+        print(f"\n{io_log}")