From 5f10d61aeaaada99c93b31d179f5a69274af115a Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:50:14 -0700 Subject: [PATCH] [refactor]: Refactor Frontend Trace OpenTelemetry Implementation (#7390) Co-authored-by: Iman Tabrizian --- src/tracer.cc | 150 +++++++++++++++++++------------------------------- src/tracer.h | 77 ++++++++++++-------------- 2 files changed, 92 insertions(+), 135 deletions(-) diff --git a/src/tracer.cc b/src/tracer.cc index 560278e4a6..b17f5eb7e8 100644 --- a/src/tracer.cc +++ b/src/tracer.cc @@ -335,13 +335,23 @@ TraceManager::SampleTrace(const TraceStartOptions& start_options) std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) .count(); - ts->otel_context_ = start_options.propagated_context; - opentelemetry::nostd::shared_ptr root_span; - root_span = ts->StartSpan( - "InferRequest", steady_timestamp_ns, otel_trace_api::kSpanKey); + if (ts->span_stacks_.find(ts->trace_id_) == ts->span_stacks_.end()) { + std::unique_ptr< + std::stack>> + st(new std::stack< + opentelemetry::nostd::shared_ptr>()); + ts->span_stacks_.emplace(ts->trace_id_, std::move(st)); + } + auto active_span = + otel_trace_api::GetSpan(start_options.propagated_context); + if (active_span->GetContext().IsValid()) { + ts->span_stacks_[ts->trace_id_]->emplace(active_span); + } // Storing "InferRequest" span as a root span // to keep it alive for the duration of the request. - ts->otel_context_ = ts->otel_context_.SetValue(kRootSpan, root_span); + ts->root_span_ = + ts->StartSpan("InferRequest", steady_timestamp_ns, ts->trace_id_); + ts->span_stacks_[ts->trace_id_]->emplace(ts->root_span_); #else LOG_ERROR << "Unsupported trace mode: " << TraceManager::InferenceTraceModeString(ts->setting_->mode_); @@ -358,7 +368,7 @@ TraceManager::Trace::~Trace() setting_->WriteTrace(streams_); } else if (setting_->mode_ == TRACE_MODE_OPENTELEMETRY) { #ifndef _WIN32 - EndSpan(kRootSpan); + EndSpan(trace_id_); #else LOG_ERROR << "Unsupported trace mode: " << TraceManager::InferenceTraceModeString(setting_->mode_); @@ -390,7 +400,8 @@ TraceManager::Trace::CaptureTimestamp( << "{\"name\":\"" << name << "\",\"ns\":" << timestamp_ns << "}]}"; } else if (setting_->mode_ == TRACE_MODE_OPENTELEMETRY) { #ifndef _WIN32 - AddEvent(kRootSpan, name, timestamp_ns); + root_span_->AddEvent( + name, time_offset_ + std::chrono::nanoseconds{timestamp_ns}); #else LOG_ERROR << "Unsupported trace mode: " << TraceManager::InferenceTraceModeString(setting_->mode_); @@ -501,7 +512,7 @@ TraceManager::ProcessOpenTelemetryParameters( void TraceManager::Trace::StartSpan( - std::string span_key, TRITONSERVER_InferenceTrace* trace, + TRITONSERVER_InferenceTrace* trace, TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, uint64_t trace_id) { @@ -509,7 +520,7 @@ TraceManager::Trace::StartSpan( LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceTraceParentId(trace, &parent_id), "getting trace parent id"); - std::string parent_span_key = ""; + auto span_parent_id = parent_id; // Currently, only 2 types of sub-spans are supported: // request span and compute span. Compute span is a leaf span @@ -521,16 +532,9 @@ TraceManager::Trace::StartSpan( // If parent_id > 0, then this is a child trace, spawned from // the ensamble's main request. For this instance, the parent // span is the ensembles's request span. - if (parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START) { - parent_span_key = kRootSpan; - } else if (activity == TRITONSERVER_TRACE_REQUEST_START) { - // [FIXME] For BLS requests parent span for children's request spans - // should be parent model's compute span. Currently, - // this won't work, since parent's compute span will be created - // only after children's spans are created. - parent_span_key = kRequestSpan + std::to_string(parent_id); - } else if (activity == TRITONSERVER_TRACE_COMPUTE_START) { - parent_span_key = kRequestSpan + std::to_string(trace_id); + if ((parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START) || + (activity == TRITONSERVER_TRACE_COMPUTE_START)) { + span_parent_id = trace_id; } std::string display_name = "compute"; @@ -542,7 +546,7 @@ TraceManager::Trace::StartSpan( display_name = model_name; } - auto span = StartSpan(display_name, timestamp_ns, parent_span_key); + auto span = StartSpan(display_name, timestamp_ns, span_parent_id); if (activity == TRITONSERVER_TRACE_REQUEST_START) { int64_t model_version; @@ -564,14 +568,13 @@ TraceManager::Trace::StartSpan( PrepareTraceContext(span, &buffer); TRITONSERVER_InferenceTraceSetContext(trace, buffer.Contents().c_str()); } - - otel_context_ = otel_context_.SetValue(span_key, span); + span_stacks_[trace_id]->emplace(span); } opentelemetry::nostd::shared_ptr TraceManager::Trace::StartSpan( std::string display_name, const uint64_t& raw_timestamp_ns, - std::string parent_span_key) + uint64_t trace_id) { otel_trace_api::StartSpanOptions options; options.kind = otel_trace_api::SpanKind::kServer; @@ -580,45 +583,37 @@ TraceManager::Trace::StartSpan( options.start_steady_time = otel_common::SteadyTimestamp{std::chrono::nanoseconds{raw_timestamp_ns}}; - // If the new span is a child span, we need to retrieve its parent from - // the context and provide it through StartSpanOptions to the child span - if (!parent_span_key.empty() && otel_context_.HasKey(parent_span_key)) { - auto parent_span = opentelemetry::nostd::get< - opentelemetry::nostd::shared_ptr>( - otel_context_.GetValue(parent_span_key)); - options.parent = parent_span->GetContext(); + // If the new span is a child span, we need to retrieve its parent and + // provide it through StartSpanOptions to the child span + if (span_stacks_.find(trace_id) != span_stacks_.end() && + !span_stacks_[trace_id]->empty()) { + options.parent = span_stacks_[trace_id]->top()->GetContext(); } auto provider = opentelemetry::trace::Provider::GetTracerProvider(); return provider->GetTracer(kTritonTracer)->StartSpan(display_name, options); } void -TraceManager::Trace::EndSpan(std::string span_key) +TraceManager::Trace::EndSpan(uint64_t trace_id) { auto timestamp_ns = std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) .count(); - EndSpan(span_key, timestamp_ns); + EndSpan(timestamp_ns, trace_id); } void TraceManager::Trace::EndSpan( - std::string span_key, const uint64_t& raw_timestamp_ns) + const uint64_t& raw_timestamp_ns, uint64_t trace_id) { - if (otel_context_.HasKey(span_key)) { - auto span = opentelemetry::nostd::get< - opentelemetry::nostd::shared_ptr>( - otel_context_.GetValue(span_key)); - - if (span == nullptr) { - return; - } - + if (span_stacks_.find(trace_id) != span_stacks_.end() && + !span_stacks_[trace_id]->empty()) { otel_trace_api::EndSpanOptions end_options; end_options.end_steady_time = otel_common::SteadyTimestamp{ std::chrono::nanoseconds{raw_timestamp_ns}}; - span->End(end_options); + span_stacks_[trace_id]->top()->End(end_options); + span_stacks_[trace_id]->pop(); } } @@ -630,79 +625,46 @@ TraceManager::Trace::ReportToOpenTelemetry( uint64_t id; LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id"); - - auto current_span_key = GetSpanKeyForActivity(activity, id); - if (current_span_key.empty()) { - return; + if (span_stacks_.find(id) == span_stacks_.end()) { + std::unique_ptr< + std::stack>> + st(new std::stack< + opentelemetry::nostd::shared_ptr>()); + span_stacks_.emplace(id, std::move(st)); } - AddEvent(current_span_key, trace, activity, timestamp_ns, id); -} - -std::string -TraceManager::Trace::GetSpanKeyForActivity( - TRITONSERVER_InferenceTraceActivity activity, uint64_t trace_id) -{ - std::string span_name; - switch (activity) { - case TRITONSERVER_TRACE_REQUEST_START: - case TRITONSERVER_TRACE_QUEUE_START: - case TRITONSERVER_TRACE_REQUEST_END: { - span_name = kRequestSpan + std::to_string(trace_id); - break; - } - - case TRITONSERVER_TRACE_COMPUTE_START: - case TRITONSERVER_TRACE_COMPUTE_INPUT_END: - case TRITONSERVER_TRACE_COMPUTE_OUTPUT_START: - case TRITONSERVER_TRACE_COMPUTE_END: { - span_name = kComputeSpan + std::to_string(trace_id); - break; - } - case TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT: - case TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT: - case TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT: - default: { - LOG_ERROR << "Unsupported activity: " - << TRITONSERVER_InferenceTraceActivityString(activity); - span_name = ""; - break; - } - } - - return span_name; + AddEvent(trace, activity, timestamp_ns, id); } void TraceManager::Trace::AddEvent( - std::string span_key, TRITONSERVER_InferenceTrace* trace, + TRITONSERVER_InferenceTrace* trace, TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, - uint64_t id) + uint64_t trace_id) { if (activity == TRITONSERVER_TRACE_REQUEST_START || activity == TRITONSERVER_TRACE_COMPUTE_START) { - StartSpan(span_key, trace, activity, timestamp_ns, id); + StartSpan(trace, activity, timestamp_ns, trace_id); } AddEvent( - span_key, TRITONSERVER_InferenceTraceActivityString(activity), - timestamp_ns); + TRITONSERVER_InferenceTraceActivityString(activity), timestamp_ns, + trace_id); if (activity == TRITONSERVER_TRACE_REQUEST_END || activity == TRITONSERVER_TRACE_COMPUTE_END) { - EndSpan(span_key, timestamp_ns); + EndSpan(timestamp_ns, trace_id); } } void TraceManager::Trace::AddEvent( - std::string span_key, std::string event, uint64_t timestamp) + const std::string& event, uint64_t timestamp, uint64_t trace_id) { - if (otel_context_.HasKey(span_key)) { - auto span = opentelemetry::nostd::get< - opentelemetry::nostd::shared_ptr>( - otel_context_.GetValue(span_key)); - span->AddEvent(event, time_offset_ + std::chrono::nanoseconds{timestamp}); + if (span_stacks_.find(trace_id) != span_stacks_.end() && + !span_stacks_[trace_id]->empty()) { + span_stacks_[trace_id]->top()->AddEvent( + event, time_offset_ + std::chrono::nanoseconds{timestamp}); } } diff --git a/src/tracer.h b/src/tracer.h index d6e8ee65b0..e33b16dbcb 100644 --- a/src/tracer.h +++ b/src/tracer.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -237,8 +238,8 @@ class TraceManager { /// `TRITONSERVER_TRACE_COMPUTE_START`, /// it starts a new request or compute span. For the request span it /// adds some triton related attributes, and adds this span to - /// `otel_context_`. Alternatively, if activity is - /// `TRITONSERVER_TRACE_REQUEST_END` or + /// a span stack, corresponding to the current trace. Alternatively, + /// if activity is `TRITONSERVER_TRACE_REQUEST_END` or /// `TRITONSERVER_TRACE_COMPUTE_END`, it ends the corresponding span. /// /// \param trace TRITONSERVER_InferenceTrace instance. @@ -258,16 +259,25 @@ class TraceManager { /// OpenTelemetry SystemTimestamp to display span on a timeline, and /// OpenTelemetry SteadyTimestamp to calculate the duration on the span /// with better precision. - /// \param parent_span_key A span key, to find a parent span in the - /// OpenTelemetry context. If empty, a root span will be started, - /// i.e. with no parent span specified. + /// \param trace_id Trace id. /// \return A shared pointer to a newly created OpenTelemetry span. opentelemetry::nostd::shared_ptr StartSpan( std::string display_name, const uint64_t& raw_timestamp_ns, - std::string parent_span_key = ""); + uint64_t trace_id); + + // A map to hold spans. Any trace can spawn any amount of child traces, + // e.g. ensemble model and BLS. This map holds + // ( trace id, stack of started spans ) pair and for each trase keeps + // started spans alive for the duration of the traced + // event and helps to preserve parent-child relationship. + std::unordered_map< + uint64_t, std::unique_ptr>>> + span_stacks_; - // OTel context to store spans, created in the current trace - opentelemetry::context::Context otel_context_; + // Root span. Some events should be recorded in the root span, while + // request span is still alive and present in the stack. + opentelemetry::nostd::shared_ptr root_span_; /// Prepares trace context to propagate to TRITONSERVER_InferenceTrace. /// Trace context follows W3C Trace Context specification. @@ -304,8 +314,6 @@ class TraceManager { /// For request spans, it will add the following attributes to the span: /// `model_name`, `model_version`, `trace_id`, `parent_id`. /// - /// \param span_key Span's key to retrieve the corresponding span from the - /// OpenTelemetry context. /// \param trace TRITONSERVER_InferenceTrace, used to request model's name, /// version, trace parent_id from the backend. /// \param activity Trace activity. @@ -315,58 +323,45 @@ class TraceManager { /// with better precision. /// \param trace_id Trace id. void StartSpan( - std::string span_key, TRITONSERVER_InferenceTrace* trace, + TRITONSERVER_InferenceTrace* trace, TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, uint64_t trace_id); - /// Ends the provided span. + /// Ends the span on the top of the stack, related to trace with `trace_id`. /// - /// \param span_key Span's key to retrieve the corresponding span from the - /// OpenTelemetry context. - void EndSpan(std::string span_key); + /// \param trace_id Trace id. + void EndSpan(uint64_t trace_id); - /// Ends the provided span at specified steady timestamp. + /// Ends the span on the top of the stack, related to trace with `trace_id` + /// at specified steady timestamp. /// - /// \param span_key Span's key to retrieve the corresponding span from the - /// OpenTelemetry context. /// \param raw_timestamp_ns Steady timestamp to use as /// `EndSpanOptions::end_steady_time`. - void EndSpan(std::string span_key, const uint64_t& raw_timestamp_ns); - - /// Returns the span key, for which the activity belongs. - /// - /// \param activity reported activity. /// \param trace_id Trace id. - /// \return A key to identify span, stored in the OpenTelemetry context. - std::string GetSpanKeyForActivity( - TRITONSERVER_InferenceTraceActivity activity, uint64_t trace_id); - - /// Adds event to the span, which is retrieved from OpenTelemetry context - /// with the provided `span_key`. If activity is - /// TRITONSERVER_TRACE_REQUEST_START, or TRITONSERVER_TRACE_COMPUTE_START, - /// starts a new span and adds it to `otel_context_`. + void EndSpan(const uint64_t& raw_timestamp_ns, uint64_t trace_id); + + /// Adds an event to the span on the top of the stack, related to trace + /// with `trace_id`. If activity is TRITONSERVER_TRACE_REQUEST_START, + /// or TRITONSERVER_TRACE_COMPUTE_START, starts a new span and adds it + /// to the span's stack. /// - /// \param span_key Span's key to retrieve the corresponding span from the - /// OpenTelemetry context. /// \param trace TRITONSERVER_InferenceTrace, used to request model's name, /// version, trace parent_id from the backend. /// \param activity Trace activity. /// \param timestamp_ns Timestamp of the provided event. - /// \param id Trace id. + /// \param trace_id Trace id. void AddEvent( - std::string span_key, TRITONSERVER_InferenceTrace* trace, + TRITONSERVER_InferenceTrace* trace, TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns, - uint64_t id); + uint64_t trace_id); - /// Adds event to the OpenTelemetry span, retrieved from an OpenTelementry - /// context with the provided `span_key`. + /// Adds an event to the OpenTelemetry span. /// - /// \param span_key Span's key to retrieve the corresponding span from the - /// OpenTelemetry context. /// \param event An event to add to the span. /// \param timestamp_ns Timestamp of the provided event. + /// \param trace_id Trace id. void AddEvent( - std::string span_key, std::string event, uint64_t timestamp_ns); + const std::string& event, uint64_t timestamp_ns, uint64_t trace_id); #endif };