diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
index 855a5ffbab..d79e0aa256 100644
--- a/docs/user_guide/metrics.md
+++ b/docs/user_guide/metrics.md
@@ -56,6 +56,7 @@ To change the interval at which metrics are polled/updated, see the `--metrics-i
 - [Inference Request Metrics](#inference-request-metrics)
 - [GPU Metrics](#gpu-metrics)
 - [CPU Metrics](#cpu-metrics)
+- [Pinned Memory Metrics](#pinned-memory-metrics)
 - [Response Cache Metrics](#response-cache-metrics)
 - [Custom Metrics](#custom-metrics)
 
@@ -282,6 +283,15 @@ If building Triton locally, the `TRITON_ENABLE_METRICS_CPU` CMake build flag can
 |CPU Memory | CPU Total Memory | `nv_cpu_memory_total_bytes` | Total CPU memory (RAM), in bytes | System-wide | Per interval |
 | | CPU Used Memory | `nv_cpu_memory_used_bytes` | Used CPU memory (RAM), in bytes | System-wide | Per interval |
 
+## Pinned Memory Metrics
+
+Starting in 24.01, Triton offers Pinned Memory metrics to monitor the utilization of the pinned memory pool.
+
+|Category |Metric |Metric Name |Description |Granularity|Frequency |
+|----------------|------------------|----------------------------|-------------------------------------------------------|-----------|-------------|
+|Pinned Memory |Total Pinned Memory |`nv_pinned_memory_pool_total_bytes` |Total pinned memory, in bytes |All models |Per interval |
+| |Used Pinned Memory |`nv_pinned_memory_pool_used_bytes` |Used pinned memory, in bytes |All models |Per interval |
+
 ## Response Cache Metrics
 
 Cache metrics can be reported in two ways:
diff --git a/qa/L0_metrics/metrics_config_test.py b/qa/L0_metrics/metrics_config_test.py
index a1324ac28e..9a5e93c24a 100755
--- a/qa/L0_metrics/metrics_config_test.py
+++ b/qa/L0_metrics/metrics_config_test.py
@@ -55,6 +55,10 @@
     "nv_cache_hit_duration_per_model",
     "nv_cache_miss_duration_per_model",
 ]
+PINNED_MEMORY_PATTERNS = [
+    "nv_pinned_memory_pool_total_bytes",
+    "nv_pinned_memory_pool_used_bytes",
+]
 CACHE_SUMMARY_PATTERNS = ["nv_cache_hit_summary", "nv_cache_miss_summary"]
 
 
@@ -65,6 +69,11 @@ def _get_metrics(self):
         r.raise_for_status()
         return r.text
 
+    def test_pinned_memory_metrics_exist(self):
+        metrics = self._get_metrics()
+        for metric in PINNED_MEMORY_PATTERNS:
+            self.assertIn(metric, metrics)
+
     # Counters
     def test_inf_counters_exist(self):
         metrics = self._get_metrics()
diff --git a/qa/L0_metrics/pinned_memory_metrics_test.py b/qa/L0_metrics/pinned_memory_metrics_test.py
new file mode 100755
index 0000000000..42f6bbe2f7
--- /dev/null
+++ b/qa/L0_metrics/pinned_memory_metrics_test.py
@@ -0,0 +1,176 @@
+#!/usr/bin/python
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import re
+import threading
+import time
+import unittest
+
+import numpy as np
+import requests
+import tritonclient.http as httpclient
+from tritonclient.utils import *
+
+# Triton server reserves 256 MB for pinned memory by default.
+DEFAULT_TOTAL_PINNED_MEMORY_SIZE = 2**28  # bytes, equivalent to 256 MB
+TOTAL_PINNED_MEMORY_SIZE = int(
+    os.environ.get("CUSTOM_PINNED_MEMORY_POOL_SIZE", DEFAULT_TOTAL_PINNED_MEMORY_SIZE)
+)
+print(f"TOTAL_PINNED_MEMORY_SIZE: {TOTAL_PINNED_MEMORY_SIZE} bytes")
+
+# Pinned memory usage when the server is idle (no inference)
+DEFAULT_USED_PINNED_MEMORY_SIZE = 0  # bytes
+
+
+def get_metrics():
+    total_bytes_pattern = re.compile(r"pool_total_bytes (\d+)")
+    used_bytes_pattern = re.compile(r"pool_used_bytes (\d+)")
+
+    r = requests.get("http://localhost:8002/metrics")
+    r.raise_for_status()
+
+    total_bytes_match = total_bytes_pattern.search(r.text)
+    total_bytes_value = total_bytes_match.group(1)
+
+    used_bytes_match = used_bytes_pattern.search(r.text)
+    used_bytes_value = used_bytes_match.group(1)
+
+    return total_bytes_value, used_bytes_value
+
+
+class TestPinnedMemoryMetrics(unittest.TestCase):
+    def setUp(self):
+        self.inference_completed = threading.Event()
+
+        shape = [1, 16]
+        self.model_name = "libtorch_float32_float32_float32"
+        input0_data = np.random.rand(*shape).astype(np.float32)
+        input1_data = np.random.rand(*shape).astype(np.float32)
+
+        self.inputs = [
+            httpclient.InferInput(
+                "INPUT0", input0_data.shape, "FP32"
+            ).set_data_from_numpy(input0_data),
+            httpclient.InferInput(
+                "INPUT1", input1_data.shape, "FP32"
+            ).set_data_from_numpy(input1_data),
+        ]
+
+        self.outputs = [
+            httpclient.InferRequestedOutput("OUTPUT__0"),
+            httpclient.InferRequestedOutput("OUTPUT__1"),
+        ]
+
+        # Before loading the model
+        self._assert_pinned_memory_utilization()
+
+    def _assert_pinned_memory_utilization(self):
+        total_bytes_value, used_bytes_value = get_metrics()
+        self.assertEqual(int(total_bytes_value), TOTAL_PINNED_MEMORY_SIZE)
+        self.assertEqual(int(used_bytes_value), DEFAULT_USED_PINNED_MEMORY_SIZE)
+
+    def _collect_metrics(self):
+        while not self.inference_completed.is_set():
+            total_bytes_value, used_bytes_value = get_metrics()
+            self.assertEqual(int(total_bytes_value), TOTAL_PINNED_MEMORY_SIZE)
+            # Assert pinned memory usage is within anticipated values
+            self.assertIn(int(used_bytes_value), [0, 64, 128, 192, 256])
+
+    def test_pinned_memory_metrics_asynchronous_requests(self):
+        with httpclient.InferenceServerClient(
+            url="localhost:8000", concurrency=10
+        ) as client:
+            if not client.is_model_ready(self.model_name):
+                client.load_model(self.model_name)
+
+            # Before starting the inference
+            self._assert_pinned_memory_utilization()
+
+            # Start a thread to collect metrics asynchronously
+            metrics_thread = threading.Thread(target=self._collect_metrics)
+            metrics_thread.start()
+
+            # Asynchronous inference requests
+            async_requests = []
+            for _ in range(100):
+                async_requests.append(
+                    client.async_infer(
+                        model_name=self.model_name,
+                        inputs=self.inputs,
+                        outputs=self.outputs,
+                    )
+                )
+
+            time.sleep(1)
+
+            # Wait for all inference requests to complete
+            for async_request in async_requests:
+                async_request.get_result()
+
+            # Set the event to indicate that inference is completed
+            self.inference_completed.set()
+
+            # Wait for the metrics thread to complete
+            metrics_thread.join()
+
+            # After completing inference, used_bytes_value should come down to 0
+            self._assert_pinned_memory_utilization()
+
+    def test_pinned_memory_metrics_synchronous_requests(self):
+        with httpclient.InferenceServerClient(url="localhost:8000") as client:
+            if not client.is_model_ready(self.model_name):
+                client.load_model(self.model_name)
+
+            # Before starting the inference
+            self._assert_pinned_memory_utilization()
+
+            # Start a thread to collect metrics asynchronously
+            metrics_thread = threading.Thread(target=self._collect_metrics)
+            metrics_thread.start()
+
+            # Synchronous inference requests
+            for _ in range(100):
+                response = client.infer(
+                    model_name=self.model_name, inputs=self.inputs, outputs=self.outputs
+                )
+                response.get_response()
+
+                time.sleep(0.1)
+
+            # Set the event to indicate that inference is completed
+            self.inference_completed.set()
+
+            # Wait for the metrics thread to complete
+            metrics_thread.join()
+
+            # After completing inference, used_bytes_value should come down to 0
+            self._assert_pinned_memory_utilization()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh
index dea1c62041..b36f8977e7 100755
--- a/qa/L0_metrics/test.sh
+++ b/qa/L0_metrics/test.sh
@@ -45,12 +45,13 @@ SERVER=${TRITON_DIR}/bin/tritonserver
 BASE_SERVER_ARGS="--model-repository=${MODELDIR}"
 SERVER_ARGS="${BASE_SERVER_ARGS}"
 SERVER_LOG="./inference_server.log"
+PYTHON_TEST="metrics_config_test.py"
 source ../common/util.sh
 
 CLIENT_LOG="client.log"
 TEST_RESULT_FILE="test_results.txt"
 function check_unit_test() {
-    if [ $? -ne 0 ]; then
+    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
         cat $CLIENT_LOG
         echo -e "\n***\n*** Test Failed\n***"
         RET=1
@@ -100,8 +101,6 @@ if [ $? -ne 0 ]; then
 fi
 set -e
 
-### GPU Metrics
-
 # Prepare a libtorch float32 model with basic config
 rm -rf $MODELDIR
 model=libtorch_float32_float32_float32
@@ -112,8 +111,41 @@ mkdir -p $MODELDIR/${model}/1 && \
     sed -i "s/label_filename:.*//" config.pbtxt && \
     echo "instance_group [{ kind: KIND_GPU }]" >> config.pbtxt)
 
+### Pinned memory metrics tests
+set +e
+CLIENT_PY="./pinned_memory_metrics_test.py"
+SERVER_LOG="pinned_memory_metrics_test_server.log"
+SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1"
+run_and_check_server
+python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG}
+check_unit_test
+
+CLIENT_LOG="pinned_memory_metrics_test_client.log"
+python3 ${CLIENT_PY} -v 2>&1 | tee ${CLIENT_LOG}
+check_unit_test
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+# Custom Pinned memory pool size
+export CUSTOM_PINNED_MEMORY_POOL_SIZE=1024 # bytes
+SERVER_LOG="custom_pinned_memory_test_server.log"
+CLIENT_LOG="custom_pinned_memory_test_client.log"
+SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1 --pinned-memory-pool-byte-size=$CUSTOM_PINNED_MEMORY_POOL_SIZE"
+run_and_check_server
+python3 ${CLIENT_PY} -v 2>&1 | tee ${CLIENT_LOG}
+check_unit_test
+
+kill $SERVER_PID
+wait $SERVER_PID
+set -e
+
+
+### GPU Metrics
 set +e
 export CUDA_VISIBLE_DEVICES=0,1,2
+SERVER_LOG="./inference_server.log"
+CLIENT_LOG="client.log"
 run_and_check_server
 
 num_gpus=`curl -s localhost:8002/metrics | grep "nv_gpu_utilization{" | wc -l`
@@ -227,7 +259,6 @@ MODELDIR="${PWD}/unit_test_models"
 mkdir -p "${MODELDIR}/identity_cache_on/1"
 mkdir -p "${MODELDIR}/identity_cache_off/1"
 BASE_SERVER_ARGS="--model-repository=${MODELDIR} --model-control-mode=explicit"
-PYTHON_TEST="metrics_config_test.py"
 
 # Check default settings: Counters should be enabled, summaries should be disabled
 SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off"
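
For reviewers who want to poke at the new gauges outside the QA harness, below is a minimal sketch of how they can be read. It assumes a locally running tritonserver exposing metrics on the default port 8002 (the same endpoint `pinned_memory_metrics_test.py` polls) and uses the full metric names from the docs table; the `read_pinned_memory_metrics` helper is illustrative only and not part of this change.

```python
# Minimal sketch: read Triton's pinned memory pool gauges from the metrics endpoint.
# Assumes the default metrics port (8002); the helper name is illustrative only.
import re

import requests


def read_pinned_memory_metrics(url="http://localhost:8002/metrics"):
    """Return (total_bytes, used_bytes) parsed from the Prometheus text output."""
    r = requests.get(url, timeout=5)
    r.raise_for_status()
    total = int(re.search(r"nv_pinned_memory_pool_total_bytes (\d+)", r.text).group(1))
    used = int(re.search(r"nv_pinned_memory_pool_used_bytes (\d+)", r.text).group(1))
    return total, used


if __name__ == "__main__":
    total, used = read_pinned_memory_metrics()
    print(f"pinned memory pool: {used}/{total} bytes in use")
```

With `--pinned-memory-pool-byte-size` left at its default, `total` should report 2**28 (256 MB), matching `DEFAULT_TOTAL_PINNED_MEMORY_SIZE` in the new test.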