From ad839831304f38c7ecce9ed4a5d6e3221df978a9 Mon Sep 17 00:00:00 2001
From: Ryan McCormick
Date: Fri, 20 Dec 2024 10:37:30 -0800
Subject: [PATCH] docs: Update OpenAI README with 24.12 vllm image, reduced
 setup instructions for vllm, and 24.11 trtllm image

---
 python/openai/README.md | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/python/openai/README.md b/python/openai/README.md
index 1898097e20..2fd5241f18 100644
--- a/python/openai/README.md
+++ b/python/openai/README.md
@@ -51,22 +51,13 @@ docker run -it --net=host --gpus all --rm \
   -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
   -e HF_TOKEN \
-  nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3
+  nvcr.io/nvidia/tritonserver:24.12-vllm-python-py3
 ```
 
-2. Install dependencies inside the container:
+2. Launch the OpenAI-compatible Triton Inference Server:
 ```bash
-# Install python bindings for tritonserver and tritonfrontend
-pip install /opt/tritonserver/python/triton*.whl
-
-# Install application requirements
-git clone https://github.com/triton-inference-server/server.git
-cd server/python/openai/
-pip install -r requirements.txt
-```
+cd /opt/tritonserver/python/openai
 
-3. Launch the OpenAI-compatible Triton Inference Server:
-```bash
 # NOTE: Adjust the --tokenizer based on the model being used
 python3 openai_frontend/main.py --model-repository tests/vllm_models --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct
 ```
 
@@ -92,7 +83,7 @@ INFO:     Uvicorn running on http://0.0.0.0:9000 (Press CTRL+C to quit) <- OpenA
 
 
 
-4. Send a `/v1/chat/completions` request:
+3. Send a `/v1/chat/completions` request:
 - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses.
 ```bash
 MODEL="llama-3.1-8b-instruct"
@@ -132,7 +123,7 @@ curl -s http://localhost:9000/v1/chat/completions -H 'Content-Type: application/
 
 
 
-5. Send a `/v1/completions` request:
+4. Send a `/v1/completions` request:
 - Note the use of `jq` is optional, but provides a nicely formatted output for JSON responses.
 ```bash
 MODEL="llama-3.1-8b-instruct"
@@ -166,7 +157,7 @@ curl -s http://localhost:9000/v1/completions -H 'Content-Type: application/json'
 
 
 
-6. Benchmark with `genai-perf`:
+5. Benchmark with `genai-perf`:
 - To install genai-perf in this container, see the instructions [here](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf#install-perf-analyzer-ubuntu-python-38)
 - Or try using genai-perf from the [SDK container](https://github.com/triton-inference-server/perf_analyzer/tree/main/genai-perf#install-perf-analyzer-ubuntu-python-38)
 
@@ -206,7 +197,7 @@ genai-perf profile \
 
 
 
-7. Use the OpenAI python client directly:
+6. Use the OpenAI python client directly:
 ```python
 from openai import OpenAI
 client = OpenAI(
@@ -231,9 +222,9 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-8. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary):
+7. Run tests (NOTE: The server should not be running, the tests will handle starting/stopping the server as necessary):
 ```bash
-cd server/python/openai/
+cd /opt/tritonserver/python/openai/
 
 pip install -r requirements-test.txt
 pytest -v tests/
@@ -255,7 +246,7 @@ docker run -it --net=host --gpus all --rm \
   -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
   -e HF_TOKEN \
   -e TRTLLM_ORCHESTRATOR=1 \
-  nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
+  nvcr.io/nvidia/tritonserver:24.11-trtllm-python-py3
 ```
 
 2. Install dependencies inside the container:
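
For reference, the client flow exercised by the updated README (step 6) looks like the following — a minimal sketch, assuming the frontend launched in step 2 is serving `llama-3.1-8b-instruct` on `localhost:9000` as in the README's curl examples; the `api_key` value is a placeholder, assumed to be ignored by the frontend:

```python
from openai import OpenAI

# Point the standard OpenAI client at Triton's OpenAI-compatible frontend.
# base_url matches the README's curl examples; api_key is a placeholder value.
client = OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")

# Model name matches the MODEL variable used in the README's curl examples.
completion = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "Say this is a test!"}],
)
print(completion.choices[0].message.content)
```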