diff --git a/build.py b/build.py index 2c95cbded3..0487636b09 100755 --- a/build.py +++ b/build.py @@ -1806,6 +1806,10 @@ def backend_clone( os.path.join(build_dir, be, "src", "model.py"), backend_dir, ) + clone_script.cpdir( + os.path.join(build_dir, be, "src", "utils"), + backend_dir, + ) clone_script.comment() clone_script.comment(f"end '{be}' backend") diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index 0a7f3cf1a3..b8fc0d8ee0 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -378,3 +378,9 @@ Further documentation can be found in the `TRITONSERVER_MetricFamily*` and The TRT-LLM backend uses the custom metrics API to track and expose specific metrics about LLMs, KV Cache, and Inflight Batching to Triton: https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#triton-metrics + +### vLLM Backend Metrics + +The vLLM backend uses the custom metrics API to track and expose specific metrics about +LLMs to Triton: +https://github.com/triton-inference-server/vllm_backend?tab=readme-ov-file#triton-metrics