Merge branch 'main' into rs/bump-main-to-v0.3.2
Showing 19 changed files with 1,969 additions and 17 deletions.
@@ -1,42 +1,68 @@

# Neural Magic vLLM

## About

[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving, to which Neural Magic regularly contributes upstream improvements. This fork is our opinionated focus on the latest LLM optimizations, such as quantization and sparsity.

## Installation

`nm-vllm` is a Python library that contains pre-compiled C++ and CUDA (12.1) binaries.

Install it using pip (coming soon):
```bash
pip install nm-vllm
```

You can also build and install `nm-vllm` from source (this will take ~10 minutes):
```bash
git clone https://github.com/neuralmagic/neuralmagic-vllm.git
cd neuralmagic-vllm
pip install -e .
```

In order to use the weight-sparsity kernels, such as through `sparsity="sparse_w16a16"`, you must also install `magic_wand`:
```bash
pip install magic_wand
```

## Quickstart

There are many sparse models already pushed up on our HF organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). You can find [this collection of SparseGPT models ready for inference](https://huggingface.co/collections/nm-testing/sparsegpt-llms-65ca6def5495933ab05cd439).

Here is a smoke test using a small test `llama2-110M` model trained on storytelling:

```python
from vllm import LLM, SamplingParams

model = LLM(
    "nm-testing/llama2.c-stories110M-pruned2.4",
    sparsity="sparse_w16a16",   # If left off, model will be loaded as dense
    enforce_eager=True,         # Does not work with cudagraphs yet
    dtype="float16",
    tensor_parallel_size=1,
)

sampling_params = SamplingParams(max_tokens=100, temperature=0)
outputs = model.generate("Hello my name is", sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```

Here is a more realistic example of running a 50% sparse OpenHermes 2.5 Mistral 7B model finetuned for instruction-following:

```python
from vllm import LLM, SamplingParams

model = LLM(
    "nm-testing/OpenHermes-2.5-Mistral-7B-pruned50",
    sparsity="sparse_w16a16",
    max_model_len=1024
)

sampling_params = SamplingParams(max_tokens=100, temperature=0)
outputs = model.generate("Hello my name is", sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```

You can also quickly use the same flow with an OpenAI-compatible model server:
```bash
python -m vllm.entrypoints.openai.api_server \
    --model nm-testing/OpenHermes-2.5-Mistral-7B-pruned50 \
    --sparsity sparse_w16a16
```
@@ -0,0 +1,64 @@
# Directory Structure

- `scripts/*.py` - Benchmark scripts that perform the metric computation.

- `configs/*.json` - Config JSON files. These JSONs define what benchmark script to run and what combination of script parameters to use.

- `*.py` - Benchmark drivers. Given a config JSON, they execute all the commands defined by the config JSON.

# Run Benchmark Scripts

All `scripts/benchmark_*.py` files can be executed on their own.

Run `python -m neuralmagic.benchmarks.scripts.<script_name> --help` for each script's description and usage.

# Benchmarking Drivers and Configs

All the benchmark driver `*.py` files take a JSON config file and an output directory path as input.

As mentioned above, the config file defines what benchmark script to run and what arguments to run it with.

The following is an example config JSON:

```json
{
    "description": "Benchmark vllm engine throughput - with dataset",
    "models": [
        "facebook/opt-125m",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    ],
    "sparsity": [],
    "script_name": "benchmark_throughput",
    "script_args": {
        "dataset": [
            "sharegpt",
            "ultrachat"
        ],
        "output-len": [
            128
        ],
        "num-prompts": [
            1000
        ]
    }
}
```
This config tells the benchmark driver to run the `benchmark_throughput` script on all the listed models with all possible script-args combinations; a short Python sketch of this expansion follows the command list below. That is, the config essentially translates to:

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model facebook/opt-125m --dataset sharegpt --output-len 128 --num-prompts 1000

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model facebook/opt-125m --dataset ultrachat --output-len 128 --num-prompts 1000

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dataset sharegpt --output-len 128 --num-prompts 1000

python -m neuralmagic.benchmarks.scripts.benchmark_throughput --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --dataset ultrachat --output-len 128 --num-prompts 1000
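
The expansion is a simple cross product over the `script_args` lists. Here is a toy sketch of the idea, not the driver's actual code:

```python
import itertools

models = ["facebook/opt-125m", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"]
script_args = {
    "dataset": ["sharegpt", "ultrachat"],
    "output-len": [128],
    "num-prompts": [1000],
}

# One command per (model, script-args combination) pair.
for model in models:
    for values in itertools.product(*script_args.values()):
        flags = " ".join(f"--{k} {v}" for k, v in zip(script_args, values))
        print("python -m neuralmagic.benchmarks.scripts.benchmark_throughput "
              f"--model {model} {flags}")
```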

# Benchmarking with the Driver

```bash
python3 -m neuralmagic.benchmarks.run_benchmarks -i <path-to-config-file> -o <output-directory-path>
```

# About Sparsity

The benchmark configs have a `sparsity` field. Populate this field with the appropriate sparsity identifiers (e.g. `"sparse_w16a16"`, as used in the examples above) to inform vllm about the model's sparsity. For the list of valid sparsity arguments, check `vllm/model_executor/layers/sparsity/*`.
@@ -0,0 +1,4 @@
from neuralmagic.benchmarks.run_benchmark_serving import run_benchmark_serving_script
from neuralmagic.benchmarks.run_benchmark_throughput import run_benchmark_throughput_script

# __all__ entries must be strings for `from ... import *` to work.
__all__ = ["run_benchmark_serving_script", "run_benchmark_throughput_script"]
@@ -0,0 +1,62 @@
import itertools
import json

from argparse import Namespace
from pathlib import Path
from typing import NamedTuple, Iterable
# from neuralmagic.tools.call_cmd import call_cmd

from vllm.model_executor.weight_utils import prepare_hf_model_weights
from vllm.transformers_utils.tokenizer import get_tokenizer


def download_model(hf_model_id: str) -> None:
    """
    Downloads a Hugging Face model (weights and tokenizer) to the local cache.
    """
    prepare_hf_model_weights(hf_model_id)
    get_tokenizer(hf_model_id)


def script_args_to_cla(config: NamedTuple) -> Iterable[list[str]]:
    # config is a NamedTuple constructed from some JSON in neuralmagic/benchmarks/configs

    kv = vars(config.script_args)

    keys = kv.keys()
    arg_lists = kv.values()
    assert all(map(lambda le: isinstance(le, list), arg_lists))

    # Empty lists are arguments without any values (e.g. boolean flags)
    key_args = []
    for k, v in zip(keys, arg_lists):
        if len(v) == 0:
            key_args.append(k)

    key_args_cla = list(map(lambda k: f"--{k}", key_args))

    # Remove empty lists from arg_lists and remove key args from keys.
    # Materialize as lists so they can be re-iterated for every combination below.
    arg_lists = [arg_list for arg_list in arg_lists if len(arg_list) != 0]
    keys = [k for k in keys if k not in key_args]

    for args in itertools.product(*arg_lists):
        # Copy, so the shared flag list is not extended across iterations.
        cla = list(key_args_cla)
        for name, value in zip(keys, args):
            cla.extend([f"--{name}", f"{value}"])
        yield cla


def benchmark_configs(config_file_path: Path) -> Iterable[NamedTuple]:
    """
    Given a path to a config file in `neuralmagic/benchmarks/configs/*`, return an
    Iterable of the (sub)configs in the file.
    """
    assert config_file_path.exists()

    configs = None
    with open(config_file_path, "r") as f:
        configs = json.load(f, object_hook=lambda d: Namespace(**d))
    assert configs is not None

    for config in configs.configs:
        yield config
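
As a usage sketch, the snippet below builds a small in-memory config mirroring the JSON structure shown in the README and prints the command-line argument lists that `script_args_to_cla` generates. The commented import path is hypothetical, since this diff does not show the module's filename:

```python
from argparse import Namespace

# Hypothetical import path -- adjust to wherever these helpers actually live.
# from neuralmagic.benchmarks.common import script_args_to_cla

config = Namespace(script_args=Namespace(**{
    "dataset": ["sharegpt", "ultrachat"],
    "output-len": [128],
    "num-prompts": [1000],
}))

for cla in script_args_to_cla(config):
    print(cla)
# ['--dataset', 'sharegpt', '--output-len', '128', '--num-prompts', '1000']
# ['--dataset', 'ultrachat', '--output-len', '128', '--num-prompts', '1000']
```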
@@ -0,0 +1,56 @@
{
    "configs": [
        {
            "description": "Benchmark vllm serving",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "sparsity": [],
            "script_name": "benchmark_serving",
            "script_args": {
                "nr-qps-pair_": [
                    "50,0.5",
                    "100,1",
                    "200,2",
                    "500,5"
                ],
                "best-of": [
                    1
                ],
                "dataset": [
                    "sharegpt"
                ]
            }
        },
        {
            "description": "Benchmark vllm serving",
            "models": [
                "facebook/opt-125m",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "mistralai/Mistral-7B-Instruct-v0.2",
                "NousResearch/Llama-2-7b-chat-hf"
            ],
            "sparsity": [],
            "script_name": "benchmark_serving",
            "script_args": {
                "num-prompts_": [
                    50,
                    100
                ],
                "request-rate_": [
                    0.5,
                    "inf"
                ],
                "best-of": [
                    1
                ],
                "dataset": [
                    "sharegpt"
                ]
            }
        }
    ]
}