
test: Refactor cpu metrics tests to make L0_metrics more stable #7476

Merged
merged 12 commits into from
Jul 29, 2024
187 changes: 187 additions & 0 deletions qa/L0_metrics/cpu_metrics_test.py
@@ -0,0 +1,187 @@
#!/usr/bin/python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import re
import threading
import time
import unittest
from collections import defaultdict

import numpy as np
import requests
import tritonclient.http as httpclient

CPU_UTILIZATION = "nv_cpu_utilization"
CPU_USED_MEMORY = "nv_cpu_memory_used_bytes"
CPU_TOTAL_MEMORY = "nv_cpu_memory_total_bytes"


def get_metrics():
utilization_pattern = re.compile(rf"{CPU_UTILIZATION} (\d+\.?\d*)")
used_bytes_pattern = re.compile(rf"{CPU_USED_MEMORY} (\d+)")
total_bytes_pattern = re.compile(rf"{CPU_TOTAL_MEMORY} (\d+)")
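    # For reference, each pattern is expected to match a single line of the
    # Prometheus-style text returned by the /metrics endpoint, e.g.
    # (values below are illustrative only):
    #   nv_cpu_utilization 0.25
    #   nv_cpu_memory_used_bytes 8589934592
    #   nv_cpu_memory_total_bytes 34359738368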

r = requests.get("http://localhost:8002/metrics")
r.raise_for_status()

utilization_match = utilization_pattern.search(r.text)
utilization_value = float(utilization_match.group(1))

used_bytes_match = used_bytes_pattern.search(r.text)
used_bytes_value = int(used_bytes_match.group(1))

total_bytes_match = total_bytes_pattern.search(r.text)
total_bytes_value = int(total_bytes_match.group(1))

return utilization_value, used_bytes_value, total_bytes_value


class TestCpuMetrics(unittest.TestCase):
def setUp(self):
self.inference_completed = threading.Event()

shape = [1, 16]
self.model_name = "libtorch_float32_float32_float32"
input0_data = np.random.rand(*shape).astype(np.float32)
input1_data = np.random.rand(*shape).astype(np.float32)

self.inputs = [
httpclient.InferInput(
"INPUT0", input0_data.shape, "FP32"
).set_data_from_numpy(input0_data),
httpclient.InferInput(
"INPUT1", input1_data.shape, "FP32"
).set_data_from_numpy(input1_data),
]

def _validate_metric_variance(self, observed_metrics: dict):
dupe_value_tolerance = 5
for metric in [CPU_UTILIZATION, CPU_USED_MEMORY]:
observed_values = observed_metrics[metric]
observed_count = len(observed_values)
print(
f"Observed {metric} count: {observed_count}, values: {observed_values}"
)

            # Must have collected at least 1 more sample than the duplicate
            # tolerance for the sequential-duplicate check below to be meaningful.
            self.assertGreater(
                observed_count,
                dupe_value_tolerance,
                f"Not enough observations of {metric} were collected. Double check the server-side --metrics-interval-ms and observation interval in this test, or consider tuning the duplicate tolerance.",
            )

# Don't allow observed metric values to be repeated sequentially
# more than a certain tolerance. The expectation is that these metrics
# will vary while the server is processing requests in the background,
# provided the server was configured with a small metrics update interval.
sequential_dupes = 0
max_sequential_dupes = 0
prev_value = observed_values[0]
for value in observed_values[1:]:
if value == prev_value:
sequential_dupes += 1
else:
# If unique value found, reset counter
sequential_dupes = 0

                # Track the longest duplicate streak for observability, to help tune the tolerance
if sequential_dupes > max_sequential_dupes:
max_sequential_dupes = sequential_dupes

                self.assertLess(
                    sequential_dupes,
                    dupe_value_tolerance,
                    f"Found too many sequential duplicate values for {metric}.",
                )
prev_value = value

print(
f"Max sequential duplicate values found for {metric}: {max_sequential_dupes}"
)

def _collect_metrics(self, observed_metrics, interval_secs=1):
"""
Collects metrics at provided 'interval_secs' and stores them in the
provided 'observed_metrics' dictionary for postprocessing.
"""
# Give the test and server some time to begin processing requests
# before beginning observation loop.
time.sleep(1)

while not self.inference_completed.is_set():
util_value, used_memory_value, _ = get_metrics()
observed_metrics[CPU_UTILIZATION].append(util_value)
observed_metrics[CPU_USED_MEMORY].append(used_memory_value)
time.sleep(interval_secs)

def test_cpu_metrics_during_inference(self):
with httpclient.InferenceServerClient(
url="localhost:8000", concurrency=10
) as client:
# Start a thread to collect metrics asynchronously while inferences are
# executing, store them in a dictionary for postprocessing validation.
observed_metrics = defaultdict(list)
metrics_thread = threading.Thread(
target=self._collect_metrics, args=(observed_metrics,)
)
metrics_thread.start()

            # Fire off many asynchronous inference requests to keep the server
            # busy while monitoring the CPU metrics. The ideal target is about
            # 20-30 seconds of inference to get a good number of metric samples.
async_requests = []
for _ in range(2000):
async_requests.append(
client.async_infer(
model_name=self.model_name,
inputs=self.inputs,
)
)

# Wait for all inference requests to complete
for async_request in async_requests:
async_request.get_result()

# Set the event to indicate that inference is completed
self.inference_completed.set()

# Wait for the metrics thread to complete
metrics_thread.join()

self._validate_metric_variance(observed_metrics)

def test_cpu_metrics_ranges(self):
        # Run some simple sanity checks on the expected ranges of values
        # for the CPU-related metrics.
utilization, used_memory, total_memory = get_metrics()
self.assertGreaterEqual(utilization, 0)
self.assertLessEqual(utilization, 1.0)
self.assertGreater(used_memory, 0)
self.assertLessEqual(used_memory, total_memory)
        # NOTE: Could be improved in the future to compare the upper bound against
        # psutil's reported system memory if we introduce that dependency into the
        # test/container.
self.assertGreater(total_memory, 0)


if __name__ == "__main__":
unittest.main()
59 changes: 17 additions & 42 deletions qa/L0_metrics/test.sh
@@ -111,6 +111,23 @@ mkdir -p $MODELDIR/${model}/1 && \
sed -i "s/label_filename:.*//" config.pbtxt && \
echo "instance_group [{ kind: KIND_GPU }]" >> config.pbtxt)

### CPU / RAM metrics tests
set +e
CLIENT_PY="./cpu_metrics_test.py"
SERVER_LOG="cpu_metrics_test_server.log"
# NOTE: CPU utilization is computed based on the metrics interval, so having
# too small of an interval can skew the results.
SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1000 --log-verbose=1"
run_and_check_server

CLIENT_LOG="cpu_metrics_test_client.log"
python3 ${CLIENT_PY} -v 2>&1 | tee ${CLIENT_LOG}
check_unit_test
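
# For a quick manual sanity check (illustrative only, not part of the test),
# the raw CPU metric values can be inspected directly while the server is
# still running, e.g.:
#   curl -s localhost:8002/metrics | grep "^nv_cpu_"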

kill $SERVER_PID
wait $SERVER_PID
set -e

### Pinned memory metrics tests
set +e
CLIENT_PY="./pinned_memory_metrics_test.py"
@@ -236,48 +253,6 @@ for (( i = 0; i < $num_iterations; ++i )); do
prev_energy=$current_energy
done

### CPU / RAM Metrics

# The underlying values for these metrics do not always update frequently,
# so give ample WAIT time to make sure they change and are being updated.
CPU_METRICS="nv_cpu_utilization nv_cpu_memory_used_bytes"
WAIT_INTERVAL_SECS=2.0
for metric in ${CPU_METRICS}; do
echo -e "\n=== Checking Metric: ${metric} ===\n"
prev_value=`curl -s localhost:8002/metrics | grep ${metric} | grep -v "HELP\|TYPE" | awk '{print $2}'`

num_not_updated=0
num_not_updated_threshold=3
for (( i = 0; i < $num_iterations; ++i )); do
sleep $WAIT_INTERVAL_SECS
current_value=`curl -s localhost:8002/metrics | grep ${metric} | grep -v "HELP\|TYPE" | awk '{print $2}'`
if [ $current_value == $prev_value ]; then
num_not_updated=$((num_not_updated+1))
fi
prev_value=$current_value
done

# Give CPU metrics some tolerance to not update, up to a threshold
# DLIS-4304: An alternative may be to run some busy work on CPU in the
# background rather than allowing a tolerance threshold
if [[ ${num_not_updated} -gt ${num_not_updated_threshold} ]]; then
cat $SERVER_LOG
echo "Metrics were not updated ${num_not_updated}/${num_iterations} times for interval of ${METRICS_INTERVAL_MS} milliseconds for metric: ${metric}"
echo -e "\n***\n*** Metric Interval test failed. \n***"
RET=1
break
fi
done

# Verify reported total memory is non-zero
total_memory=`curl -s localhost:8002/metrics | grep "nv_cpu_memory_total_bytes" | grep -v "HELP\|TYPE" | awk '{print $2}'`
test -z "${total_memory}" && total_memory=0
if [ ${total_memory} -eq 0 ]; then
echo "Found nv_cpu_memory_total_bytes had a value of zero, this should not happen."
echo -e "\n***\n*** CPU total memory test failed. \n***"
RET=1
fi

kill $SERVER_PID
wait $SERVER_PID
