Support for Context Propagation for OTel trace mode (#6785)
Added support for OTel context propagation

---------

Co-authored-by: Markus Hennerbichler <[email protected]>
Co-authored-by: Ryan McCormick <[email protected]>
3 people committed Jan 17, 2024
1 parent a18c099 commit 24c3d60
Showing 12 changed files with 999 additions and 290 deletions.
798 changes: 640 additions & 158 deletions qa/L0_trace/opentelemetry_unittest.py

Large diffs are not rendered by default.
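The new tests exercise context propagation end to end: the client attaches a W3C traceparent header to an inference request, and Triton's OpenTelemetry spans are expected to join that caller-supplied trace instead of starting a new one. Below is a minimal client-side sketch of the idea; the server address (localhost:8000), model name, and input shapes are assumptions taken from this test setup (the "simple" model with two INT32 [1, 16] inputs), so adjust them for other models.

# Hedged sketch: propagate an existing OTel context to Triton by sending a
# W3C traceparent header with the inference request. Server address, model
# name, and input shapes are assumptions based on this test's setup.
import requests

traceparent = "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"
payload = {
    "inputs": [
        {"name": "INPUT0", "shape": [1, 16], "datatype": "INT32", "data": list(range(16))},
        {"name": "INPUT1", "shape": [1, 16], "datatype": "INT32", "data": list(range(16))},
    ]
}
resp = requests.post(
    "http://localhost:8000/v2/models/simple/infer",
    json=payload,
    headers={"traceparent": traceparent},  # parent context Triton should attach its spans to
)
resp.raise_for_status()

With mode=opentelemetry enabled, the spans Triton exports for such a request should carry the trace id from the header and a parentSpanId of b7ad6b7169242424, which is what the updated unit tests assert.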

206 changes: 149 additions & 57 deletions qa/L0_trace/test.sh
@@ -698,28 +698,40 @@ set +e
# Check opentelemetry trace exporter sends proper info.
# A helper python script starts listening on $OTLP_PORT, where
# OTLP exporter sends traces.
export TRITON_OPENTELEMETRY_TEST='false'
OTLP_PORT=10000
OTEL_COLLECTOR_DIR=./opentelemetry-collector
OTEL_COLLECTOR=./opentelemetry-collector/bin/otelcorecol_*
OTEL_COLLECTOR=./otelcol
OTEL_COLLECTOR_LOG="./trace_collector_http_exporter.log"

# Building the latest version of the OpenTelemetry collector.
# Installing OpenTelemetry collector (v0.91.0).
# Ref: https://opentelemetry.io/docs/collector/getting-started/#local
if [ -d "$OTEL_COLLECTOR_DIR" ]; then rm -Rf $OTEL_COLLECTOR_DIR; fi
git clone --depth 1 --branch v0.82.0 https://github.com/open-telemetry/opentelemetry-collector.git
cd $OTEL_COLLECTOR_DIR
make install-tools
make otelcorecol
cd ..
$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!

curl --proto '=https' --tlsv1.2 -fOL https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v0.91.0/otelcol_0.91.0_linux_amd64.tar.gz
tar -xvf otelcol_0.91.0_linux_amd64.tar.gz

SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1 \
--trace-config=count=100 --trace-config=mode=opentelemetry \
rm collected_traces.json*
# Unittests then check that produced spans have expected format and events
OPENTELEMETRY_TEST=opentelemetry_unittest.py
OPENTELEMETRY_LOG="opentelemetry_unittest.log"
EXPECTED_NUM_TESTS="13"

# Set up repo and args for SageMaker
export SAGEMAKER_TRITON_DEFAULT_MODEL_NAME="simple"
MODEL_PATH="/opt/ml/models/123456789abcdefghi/model"
rm -r ${MODEL_PATH}
mkdir -p "${MODEL_PATH}"
cp -r $DATADIR/$MODELBASE/* ${MODEL_PATH} && \
rm -r ${MODEL_PATH}/2 && rm -r ${MODEL_PATH}/3 && \
sed -i "s/onnx_int32_int32_int32/simple/" ${MODEL_PATH}/config.pbtxt


SERVER_ARGS="--allow-sagemaker=true --model-control-mode=explicit \
--load-model=simple --load-model=ensemble_add_sub_int32_int32_int32 \
--load-model=bls_simple --trace-config=level=TIMESTAMPS \
--trace-config=rate=1 --trace-config=count=-1 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,resource=test.key=test.value \
--trace-config=opentelemetry,resource=service.name=test_triton \
--trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \
--model-repository=$MODELSDIR"
SERVER_LOG="./inference_server_otel_http_exporter.log"
SERVER_LOG="./inference_server_otel_otelcol_exporter.log"

run_server
if [ "$SERVER_PID" == "0" ]; then
@@ -728,38 +740,97 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

$SIMPLE_HTTP_CLIENT >>$CLIENT_LOG 2>&1
set +e

set -e
python $OPENTELEMETRY_TEST >>$OPENTELEMETRY_LOG 2>&1
if [ $? -ne 0 ]; then
cat $OPENTELEMETRY_LOG
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $OPENTELEMETRY_LOG
echo -e "\n***\n*** Test Result Verification Failed\n***"
RET=1
fi
fi

set -e
kill $SERVER_PID
wait $SERVER_PID
set +e

kill $COLLECTOR_PID
wait $COLLECTOR_PID
# Testing OTel WAR with trace rate = 0
rm collected_traces.json

OTEL_COLLECTOR=./otelcol
OTEL_COLLECTOR_LOG="./trace_collector_exporter.log"
$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!

SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=0\
--trace-config=count=-1 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \
--model-repository=$MODELSDIR"
SERVER_LOG="./inference_server_otel_WAR.log"

run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

get_trace_setting "bls_simple"
assert_curl_success "Failed to obtain trace settings for 'simple' model"

if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then
RET=1
fi
if [ `grep -c "\"trace_rate\":\"0\"" ./curl.out` != "1" ]; then
RET=1
fi
if [ `grep -c "\"trace_count\":\"-1\"" ./curl.out` != "1" ]; then
RET=1
fi

set +e
# Send bls requests to make sure bls_simple model is NOT traced
for p in {1..10}; do
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1
done

if ! [[ -s $OTEL_COLLECTOR_LOG && `grep -c 'InstrumentationScope triton-server' $OTEL_COLLECTOR_LOG` == 3 ]] ; then
echo -e "\n***\n*** HTTP exporter test failed.\n***"
cat $OTEL_COLLECTOR_LOG
if [ -s collected_traces.json ] ; then
echo -e "\n***\n*** collected_traces.json should be empty, but it is not.\n***"
exit 1
fi

# Send 1 bls request with OTel context to make sure it is traced
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32", \
headers={"traceparent": "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"} \
)' >> client_update.log 2>&1

# Unittests then check that produced spans have expected format and events
OPENTELEMETRY_TEST=opentelemetry_unittest.py
OPENTELEMETRY_LOG="opentelemetry_unittest.log"
EXPECTED_NUM_TESTS="3"
sleep 20

export TRITON_OPENTELEMETRY_TEST='true'
if ! [ -s collected_traces.json ] ; then
echo -e "\n***\n*** collected_traces.json should contain OTel trace, but it is not. \n***"
exit 1
fi

SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1 \
--trace-config=count=100 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,resource=test.key=test.value \
--trace-config=opentelemetry,resource=service.name=test_triton \
set -e
kill $COLLECTOR_PID
wait $COLLECTOR_PID
kill $SERVER_PID
wait $SERVER_PID
set +e

# Test that only traces with OTel Context are collected after count goes to 0
SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=5\
--trace-config=count=1 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,url=localhost:$OTLP_PORT/v1/traces \
--model-repository=$MODELSDIR"
SERVER_LOG="./inference_server_otel_ostream_exporter.log"
SERVER_LOG="./inference_server_otel_WAR.log"

run_server
if [ "$SERVER_PID" == "0" ]; then
@@ -768,41 +839,62 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

set +e
# Preparing traces for unittest.
# Note: running this separately, so that I could extract spans with `grep`
# from server log later.
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.prepare_traces()' >>$CLIENT_LOG 2>&1

sleep 5
rm collected_traces.json
$OTEL_COLLECTOR --config ./trace-config.yaml >> $OTEL_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!

set -e
get_trace_setting "bls_simple"
assert_curl_success "Failed to obtain trace settings for 'simple' model"

kill $SERVER_PID
wait $SERVER_PID
if [ `grep -c "\"trace_level\":\[\"TIMESTAMPS\"\]" ./curl.out` != "1" ]; then
RET=1
fi
if [ `grep -c "\"trace_rate\":\"5\"" ./curl.out` != "1" ]; then
RET=1
fi
if [ `grep -c "\"trace_count\":\"1\"" ./curl.out` != "1" ]; then
RET=1
fi

set +e
# Send bls requests to make sure bls_simple model is NOT traced
for p in {1..20}; do
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32")' >> client_update.log 2>&1
done

grep -z -o -P '({\n(?s).*}\n)' $SERVER_LOG >> trace_collector.log
sleep 20

if ! [ -s trace_collector.log ] ; then
echo -e "\n***\n*** $SERVER_LOG did not contain any OpenTelemetry spans.\n***"
if ! [[ -s collected_traces.json && `grep -c "\"name\":\"InferRequest\"" ./collected_traces.json` == 1 && `grep -c "\"parentSpanId\":\"\"" ./collected_traces.json` == 1 ]] ; then
echo -e "\n***\n*** collected_traces.json should contain only 1 trace.\n***"
cat collected_traces.json
exit 1
fi

# Unittest will not start until expected number of spans is collected.
python $OPENTELEMETRY_TEST >>$OPENTELEMETRY_LOG 2>&1
if [ $? -ne 0 ]; then
cat $OPENTELEMETRY_LOG
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $OPENTELEMETRY_LOG
echo -e "\n***\n*** Test Result Verification Failed\n***"
RET=1
fi
# Send 10 bls requests with OTel context and 10 without, to check that only the
# ones carrying a context (plus the single count-sampled one) are traced
for p in {1..10}; do
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32", \
headers={"traceparent": "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"} \
)' >> client_update.log 2>&1

python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.send_bls_request(model_name="ensemble_add_sub_int32_int32_int32" \
)' >> client_update.log 2>&1

sleep 10
done

if ! [[ -s collected_traces.json && `grep -c "\"parentSpanId\":\"\"" ./collected_traces.json` == 1 && `grep -c "\"parentSpanId\":\"b7ad6b7169242424\"" ./collected_traces.json` == 10 ]] ; then
echo -e "\n***\n*** collected_traces.json should contain 11 OTel trace, but it is not. \n***"
exit 1
fi

set -e
kill $COLLECTOR_PID
wait $COLLECTOR_PID
kill $SERVER_PID
wait $SERVER_PID
set +e

exit $RET
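Both propagation checks above inject the same fixed header value and then grep the collected spans for parentSpanId b7ad6b7169242424. For reference, the four dash-separated fields of a W3C traceparent value break down as follows (a standalone sketch, not part of the test suite):

# The fixed traceparent value used by the tests above, split into its
# W3C Trace Context fields.
traceparent = "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"
version, trace_id, parent_span_id, trace_flags = traceparent.split("-")
assert version == "00"                        # spec version
assert len(trace_id) == 32                    # 16-byte trace id, hex-encoded
assert parent_span_id == "b7ad6b7169242424"   # value the tests expect in parentSpanId
assert trace_flags == "01"                    # "sampled" flag set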
12 changes: 9 additions & 3 deletions qa/L0_trace/trace-config.yaml
@@ -34,12 +34,18 @@ receivers:
http:
endpoint: 0.0.0.0:10000

processors:
batch:
send_batch_size: 10
timeout: 10s

exporters:
logging:
verbosity: detailed
file:
path: ./collected_traces.json

service:
pipelines:
traces:
receivers: [otlp]
exporters: [logging]
processors: [batch]
exporters: [file]
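The collector config now batches spans and writes them to ./collected_traces.json through the file exporter, which is what the grep checks in test.sh read. A rough Python equivalent of those checks is sketched below; it assumes the file exporter emits one OTLP JSON object per line with the usual resourceSpans -> scopeSpans -> spans nesting, which may differ between collector versions.

# Hedged sketch of the grep-based assertions, done in Python. Assumes OTLP JSON
# output with one object per line; field names can vary across collector versions.
import json

root_spans = 0        # empty parentSpanId: traces Triton started on its own
propagated_spans = 0  # spans parented to the span id injected by the tests
with open("collected_traces.json") as f:
    for line in f:
        if not line.strip():
            continue
        for rs in json.loads(line).get("resourceSpans", []):
            for ss in rs.get("scopeSpans", []):
                for span in ss.get("spans", []):
                    parent = span.get("parentSpanId", "")
                    if parent == "":
                        root_spans += 1
                    elif parent == "b7ad6b7169242424":
                        propagated_spans += 1

print(f"root spans: {root_spans}, propagated spans: {propagated_spans}")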
4 changes: 2 additions & 2 deletions src/CMakeLists.txt
@@ -507,9 +507,9 @@ if(${TRITON_ENABLE_TRACING})
tracer.cc tracer.h
)

if (NOT WIN32)
target_compile_features(tracing-library PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})
target_compile_features(tracing-library PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})

if (NOT WIN32)
target_include_directories(
tracing-library
PRIVATE ${OPENTELEMETRY_CPP_INCLUDE_DIRS}
6 changes: 4 additions & 2 deletions src/grpc/infer_handler.cc
@@ -916,8 +916,10 @@ ModelInferHandler::Execute(InferHandler::State* state)
if (err == nullptr) {
TRITONSERVER_InferenceTrace* triton_trace = nullptr;
#ifdef TRITON_ENABLE_TRACING
state->trace_ =
std::move(trace_manager_->SampleTrace(request.model_name()));
GrpcServerCarrier carrier(state->context_->ctx_.get());
auto start_options =
trace_manager_->GetTraceStartOptions(carrier, request.model_name());
state->trace_ = std::move(trace_manager_->SampleTrace(start_options));
if (state->trace_ != nullptr) {
triton_trace = state->trace_->trace_;
}
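SampleTrace() previously took only the model name; it now receives start options built by GetTraceStartOptions(), which reads the request's propagation headers through the carrier. The tests above pin down the observable behavior: a request carrying a valid upstream context is traced even when rate/count sampling would otherwise skip it. A rough sketch of that decision, not the actual TraceManager code:

# Conceptual sketch only; the real logic lives in TraceManager and also handles
# per-model settings, counters, and trace levels.
def should_trace(has_propagated_context: bool, sampled_by_rate_and_count: bool) -> bool:
    # An upstream trace context always wins; otherwise fall back to rate/count sampling.
    return has_propagated_context or sampled_by_rate_and_count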
29 changes: 29 additions & 0 deletions src/grpc/infer_handler.h
@@ -1433,4 +1433,33 @@ class ModelInferHandler
grpc_compression_level compression_level_;
};

#if !defined(_WIN32) && defined(TRITON_ENABLE_TRACING)
class GrpcServerCarrier : public otel_cntxt::propagation::TextMapCarrier {
public:
GrpcServerCarrier(::grpc::ServerContext* context) : context_(context) {}
GrpcServerCarrier() = default;
virtual opentelemetry::nostd::string_view Get(
opentelemetry::nostd::string_view key) const noexcept override
{
auto it = context_->client_metadata().find({key.data(), key.size()});
if (it != context_->client_metadata().end()) {
return it->second.data();
}
return "";
}

// Not required on server side
virtual void Set(
opentelemetry::nostd::string_view key,
opentelemetry::nostd::string_view value) noexcept override
{
return;
}

::grpc::ServerContext* context_;
};
#else
using GrpcServerCarrier = void*;
#endif // TRITON_ENABLE_TRACING

}}} // namespace triton::server::grpc
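GrpcServerCarrier (and the HttpTextMapCarrier used on the HTTP path) only implements header lookup; the actual extraction is delegated to an OpenTelemetry propagator that calls Get("traceparent") on the carrier. The same flow in opentelemetry-python, shown only as an analogue of what the C++ propagator does rather than as server code:

# Python analogue of carrier-based context extraction (needs the opentelemetry-api
# package; a plain dict serves as the carrier here).
from opentelemetry import trace
from opentelemetry.propagate import extract

headers = {"traceparent": "00-0af7651916cd43dd8448eb211c12666c-b7ad6b7169242424-01"}
ctx = extract(headers)  # the global propagator reads "traceparent" via the carrier
parent = trace.get_current_span(ctx).get_span_context()
print(format(parent.trace_id, "032x"))  # 0af7651916cd43dd8448eb211c12666c
print(format(parent.span_id, "016x"))   # b7ad6b7169242424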
6 changes: 4 additions & 2 deletions src/grpc/stream_infer_handler.cc
@@ -309,8 +309,10 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok)
if (err == nullptr) {
TRITONSERVER_InferenceTrace* triton_trace = nullptr;
#ifdef TRITON_ENABLE_TRACING
state->trace_ =
std::move(trace_manager_->SampleTrace(request.model_name()));
GrpcServerCarrier carrier(state->context_->ctx_.get());
auto start_options =
trace_manager_->GetTraceStartOptions(carrier, request.model_name());
state->trace_ = std::move(trace_manager_->SampleTrace(start_options));
if (state->trace_ != nullptr) {
triton_trace = state->trace_->trace_;
}
6 changes: 4 additions & 2 deletions src/http_server.cc
@@ -3055,11 +3055,13 @@ HTTPAPIServer::StartTrace(
TRITONSERVER_InferenceTrace** triton_trace)
{
#ifdef TRITON_ENABLE_TRACING
HttpTextMapCarrier carrier(req->headers_in);
auto start_options =
trace_manager_->GetTraceStartOptions(carrier, model_name);
std::shared_ptr<TraceManager::Trace> trace;
trace = std::move(trace_manager_->SampleTrace(model_name));
trace = std::move(trace_manager_->SampleTrace(start_options));
if (trace != nullptr) {
*triton_trace = trace->trace_;

// Timestamps from evhtp are capture in 'req'. We record here
// since this is the first place where we have access to trace
// manager.
