diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h
index ef3af71b5..ad04d57f8 100644
--- a/include/triton/core/tritonbackend.h
+++ b/include/triton/core/tritonbackend.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -65,6 +65,7 @@ struct TRITONBACKEND_Response;
 struct TRITONBACKEND_Backend;
 struct TRITONBACKEND_Model;
 struct TRITONBACKEND_ModelInstance;
+struct TRITONBACKEND_ModelInstanceResponseStatistics;
 struct TRITONBACKEND_BackendAttribute;
 struct TRITONBACKEND_Batcher;
 
@@ -94,7 +95,7 @@ struct TRITONBACKEND_Batcher;
 ///   }
 ///
 #define TRITONBACKEND_API_VERSION_MAJOR 1
-#define TRITONBACKEND_API_VERSION_MINOR 18
+#define TRITONBACKEND_API_VERSION_MINOR 19
 
 /// Get the TRITONBACKEND API version supported by Triton. This value
 /// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
@@ -761,8 +762,9 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput(
 /// \param send_flags Flags associated with the response. \see
 ///   TRITONSERVER_ResponseCompleteFlag. \see
 ///   TRITONSERVER_InferenceResponseCompleteFn_t.
-/// \param error The TRITONSERVER_Error to send if the response is an
-/// error, or nullptr if the response is successful.
+/// \param error The TRITONSERVER_Error to send if the response is an error, or
+/// nullptr if the response is successful. The caller retains ownership of the
+/// error object and must free it with TRITONSERVER_ErrorDelete.
 /// \return a TRITONSERVER_Error indicating success or failure.
 TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend(
     TRITONBACKEND_Response* response, const uint32_t send_flags,
@@ -1319,16 +1321,16 @@ TRITONBACKEND_ModelInstanceReportMemoryUsage(
 /// TRITONBACKEND_ModelInstanceExecute.
 ///
 ///   TRITONBACKEND_ModelInstanceExecute()
-///     CAPTURE TIMESPACE (exec_start_ns)
+///     CAPTURE TIMESTAMP (exec_start_ns)
 ///     < process input tensors to prepare them for inference
 ///       execution, including copying the tensors to/from GPU if
 ///       necessary>
-///     CAPTURE TIMESPACE (compute_start_ns)
+///     CAPTURE TIMESTAMP (compute_start_ns)
 ///     < perform inference computations to produce outputs >
-///     CAPTURE TIMESPACE (compute_end_ns)
+///     CAPTURE TIMESTAMP (compute_end_ns)
 ///     < allocate output buffers and extract output tensors, including
 ///       copying the tensors to/from GPU if necessary>
-///     CAPTURE TIMESPACE (exec_end_ns)
+///     CAPTURE TIMESTAMP (exec_end_ns)
 ///     return
 ///
 /// Note that these statistics are associated with a valid
@@ -1356,6 +1358,156 @@ TRITONBACKEND_ModelInstanceReportStatistics(
     const uint64_t compute_start_ns, const uint64_t compute_end_ns,
     const uint64_t exec_end_ns);
 
+/// Create a new inference response statistics object.
+///
+/// \param response_statistics The new response statistics object to be created.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsNew(
+    TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics);
+
+/// Delete an inference response statistics object.
+///
+/// The caller retains ownership of the objects set on the deleted response
+/// statistics object and must free them separately.
+///
+/// \param response_statistics The response statistics object to be deleted.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics);
+
+/// Set the model instance on an inference response statistics object.
+///
+/// \param response_statistics The response statistics object.
+/// \param model_instance The model instance.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ModelInstance* model_instance);
+
+/// Set the response factory on an inference response statistics object.
+///
+/// \param response_statistics The response statistics object.
+/// \param response_factory The response factory.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ResponseFactory* response_factory);
+
+/// Set the response start time on an inference response statistics object.
+///
+/// All timestamps should be reported in nanoseconds and collected using
+/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
+///
+/// For consistency of measurement across different backends, the timestamps
+/// should be collected at the following points during
+/// TRITONBACKEND_ModelInstanceExecute.
+///
+///   TRITONBACKEND_ModelInstanceExecute()
+///     < start of this response >
+///     CAPTURE TIMESTAMP (response_start)
+///     < generate this response >
+///     CAPTURE TIMESTAMP (compute_output_start)
+///     < allocate output buffers and extract output tensors, including copying
+///       the tensors to/from GPU if necessary >
+///     CAPTURE TIMESTAMP (response_end)
+///     < end of this response >
+///     return
+///
+/// \param response_statistics The response statistics object.
+/// \param response_start The response start time.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_start);
+
+/// Set the compute output start time on an inference response statistics
+/// object.
+///
+/// Do NOT set this compute output start time (or set it to 0) if reporting an
+/// empty response.
+///
+/// All timestamps should be reported in nanoseconds and collected using
+/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
+///
+/// For consistency of measurement across different backends, the timestamps
+/// should be collected at the following points during
+/// TRITONBACKEND_ModelInstanceExecute.
+///
+///   TRITONBACKEND_ModelInstanceExecute()
+///     < start of this response >
+///     CAPTURE TIMESTAMP (response_start)
+///     < generate this response >
+///     CAPTURE TIMESTAMP (compute_output_start)
+///     < allocate output buffers and extract output tensors, including copying
+///       the tensors to/from GPU if necessary >
+///     CAPTURE TIMESTAMP (response_end)
+///     < end of this response >
+///     return
+///
+/// \param response_statistics The response statistics object.
+/// \param compute_output_start The compute output start time.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t compute_output_start);
+
+/// Set the response end time on an inference response statistics object.
+///
+/// All timestamps should be reported in nanoseconds and collected using
+/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
+///
+/// For consistency of measurement across different backends, the timestamps
+/// should be collected at the following points during
+/// TRITONBACKEND_ModelInstanceExecute.
+///
+///   TRITONBACKEND_ModelInstanceExecute()
+///     < start of this response >
+///     CAPTURE TIMESTAMP (response_start)
+///     < generate this response >
+///     CAPTURE TIMESTAMP (compute_output_start)
+///     < allocate output buffers and extract output tensors, including copying
+///       the tensors to/from GPU if necessary >
+///     CAPTURE TIMESTAMP (response_end)
+///     < end of this response >
+///     return
+///
+/// \param response_statistics The response statistics object.
+/// \param response_end The response end time.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_end);
+
+/// Set the error on an inference response statistics object.
+///
+/// Use the same error object passed to TRITONBACKEND_ResponseSend. \see
+/// TRITONBACKEND_ResponseSend.
+///
+/// \param response_statistics The response statistics object.
+/// \param error The error object.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONSERVER_Error* error);
+
+/// Record statistics for an inference response.
+///
+/// The caller retains ownership of the response statistics and must free it
+/// after this function returns.
+///
+/// \param response_statistics The statistics to be recorded.
+/// \return a TRITONSERVER_Error indicating success or failure.
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceReportResponseStatistics(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics);
+
 /// Record statistics for the execution of an entire batch of
 /// inference requests.
 ///
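For illustration, here is a minimal sketch of how a backend might drive the API declared above from within TRITONBACKEND_ModelInstanceExecute. The helper name ReportOneResponse and the surrounding control flow are hypothetical, and every returned TRITONSERVER_Error is ignored for brevity; real backends should check them. The error argument is intended to be the same TRITONSERVER_Error passed to TRITONBACKEND_ResponseSend, which (per the documentation change above) the backend still frees with TRITONSERVER_ErrorDelete.

```cpp
// Illustrative sketch only; not part of this diff. Assumes the backend links
// against the Triton backend API and captures the three per-response
// timestamps at the points shown in the documentation above.
#include <chrono>
#include <cstdint>

#include "triton/core/tritonbackend.h"

// Capture a steady-clock timestamp in nanoseconds, as the docs recommend.
static uint64_t
CaptureTimeNs()
{
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}

// Hypothetical helper: report statistics for one response that was produced
// through 'factory' on 'instance'. 'error' is the error (or nullptr) that was
// sent with TRITONBACKEND_ResponseSend for this response.
static void
ReportOneResponse(
    TRITONBACKEND_ModelInstance* instance,
    TRITONBACKEND_ResponseFactory* factory, uint64_t response_start,
    uint64_t compute_output_start, uint64_t response_end,
    TRITONSERVER_Error* error)
{
  TRITONBACKEND_ModelInstanceResponseStatistics* stats = nullptr;
  TRITONBACKEND_ModelInstanceResponseStatisticsNew(&stats);
  TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(stats, instance);
  TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(stats, factory);
  TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(stats, response_start);
  // For an empty response, leave compute_output_start at 0 (or skip this call).
  TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
      stats, compute_output_start);
  TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(stats, response_end);
  if (error != nullptr) {
    TRITONBACKEND_ModelInstanceResponseStatisticsSetError(stats, error);
  }
  TRITONBACKEND_ModelInstanceReportResponseStatistics(stats);
  // Reporting does not take ownership; the backend still deletes the object.
  TRITONBACKEND_ModelInstanceResponseStatisticsDelete(stats);
}
```

In practice the backend would call CaptureTimeNs() for response_start before generating a response, again for compute_output_start before extracting and copying that response's output tensors, and again for response_end once the response is finished, then invoke the helper once per response it sends.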
diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc
index a4a9bba2e..5e43e093d 100644
--- a/src/backend_model_instance.cc
+++ b/src/backend_model_instance.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -795,6 +795,23 @@ TritonModelInstance::TritonBackendThread::BackendThread()
   LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
 }
 
+// Opaque object for the response statistics C-API
+struct ModelInstanceResponseStatistics {
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics()
+      : model_instance(nullptr), response_factory(nullptr), response_start(0),
+        compute_output_start(0), response_end(0), error(nullptr)
+  {
+  }
+  TritonModelInstance* model_instance;
+  std::shared_ptr<InferenceResponseFactory>* response_factory;
+  uint64_t response_start;
+  uint64_t compute_output_start;
+  uint64_t response_end;
+  TRITONSERVER_Error* error;
+#endif  // TRITON_ENABLE_STATS
+};
+
 extern "C" {
 
 TRITONAPI_DECLSPEC TRITONSERVER_Error*
@@ -953,6 +970,148 @@ TRITONBACKEND_ModelInstanceReportStatistics(
   return nullptr;  // success
 }
 
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsNew(
+    TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics)
+{
+#ifdef TRITON_ENABLE_STATS
+  *response_statistics =
+      reinterpret_cast<TRITONBACKEND_ModelInstanceResponseStatistics*>(
+          new ModelInstanceResponseStatistics());
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
+{
+#ifdef TRITON_ENABLE_STATS
+  delete reinterpret_cast<ModelInstanceResponseStatistics*>(
+      response_statistics);
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ModelInstance* model_instance)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->model_instance = reinterpret_cast<TritonModelInstance*>(model_instance);
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONBACKEND_ResponseFactory* response_factory)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->response_factory =
+      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
+          response_factory);
+  ;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_start)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->response_start = response_start;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t compute_output_start)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->compute_output_start = compute_output_start;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    uint64_t response_end)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->response_end = response_end;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
+    TRITONSERVER_Error* error)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+  rs->error = error;
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceReportResponseStatistics(
+    TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
+{
+#ifdef TRITON_ENABLE_STATS
+  ModelInstanceResponseStatistics* rs =
+      reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
+
+  InferenceStatsAggregator* sa =
+      rs->model_instance->Model()->MutableStatsAggregator();
+  std::string key =
+      std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex());
+
+  if (rs->error == nullptr) {
+    if (rs->compute_output_start > 0) {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseSuccess(
+          key, rs->response_start, rs->compute_output_start, rs->response_end));
+    } else {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(
+          sa->UpdateResponseEmpty(key, rs->response_start, rs->response_end));
+    }
+  } else {
+    RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseFail(
+        key, rs->response_start, rs->compute_output_start, rs->response_end));
+  }
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
 TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ModelInstanceReportBatchStatistics(
     TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
diff --git a/src/infer_response.h b/src/infer_response.h
index 75d5f9d48..612f8c1fe 100644
--- a/src/infer_response.h
+++ b/src/infer_response.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -61,6 +61,10 @@ class InferenceResponseFactory {
         alloc_userp_(alloc_userp), response_fn_(response_fn),
         response_userp_(response_userp), response_delegator_(delegator),
         is_cancelled_(false)
+#ifdef TRITON_ENABLE_STATS
+        ,
+        response_stats_index_(0)
+#endif  // TRITON_ENABLE_STATS
   {
   }
 
@@ -94,6 +98,11 @@ class InferenceResponseFactory {
   void ReleaseTrace() { trace_ = nullptr; }
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_STATS
+  // Return the current response statistics index and increment it.
+  uint64_t GetAndIncrementResponseIndex() { return response_stats_index_++; };
+#endif  // TRITON_ENABLE_STATS
+
  private:
   // The model associated with this factory. For normal
   // requests/responses this will always be defined and acts to keep
@@ -129,6 +138,11 @@ class InferenceResponseFactory {
   // Inference trace associated with this response.
   std::shared_ptr<InferenceTraceProxy> trace_;
 #endif  // TRITON_ENABLE_TRACING
+
+#ifdef TRITON_ENABLE_STATS
+  // Number of response statistics reported.
+  std::atomic<uint64_t> response_stats_index_;
+#endif  // TRITON_ENABLE_STATS
 };
 
 //
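The factory's new atomic counter is what keys the per-response aggregation that follows: each response reported through a factory gets the next index, and that index (as a string) becomes the map key, so responses at the same position across requests aggregate under the same key. A small self-contained sketch of the intended behavior (ResponseIndexDemo is illustrative, mirroring the GetAndIncrementResponseIndex member added above):

```cpp
#include <atomic>
#include <cstdint>
#include <string>

// Mirrors InferenceResponseFactory::GetAndIncrementResponseIndex(): a
// fetch-and-increment on an atomic counter, so concurrently reported
// responses still receive distinct, ordered keys ("0", "1", "2", ...).
struct ResponseIndexDemo {
  std::atomic<uint64_t> index{0};
  std::string NextKey() { return std::to_string(index++); }
};

// Usage: the first response reported through a factory is keyed "0", the
// second "1", and so on.
//   ResponseIndexDemo demo;
//   demo.NextKey();  // "0"
//   demo.NextKey();  // "1"
```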
diff --git a/src/infer_stats.cc b/src/infer_stats.cc
index a1af3b9f2..0f47485c2 100644
--- a/src/infer_stats.cc
+++ b/src/infer_stats.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -210,6 +210,132 @@ InferenceStatsAggregator::UpdateSuccessCacheMiss(
 #endif  // TRITON_ENABLE_METRICS
 }
 
+Status
+InferenceStatsAggregator::UpdateResponseSuccess(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
+{
+  if (response_start_ns > compute_output_start_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Response start cannot happen after compute output start");
+  }
+  if (compute_output_start_ns > response_end_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Compute output start cannot happen after response end");
+  }
+  const uint64_t compute_infer_duration_ns =
+      compute_output_start_ns - response_start_ns;
+  const uint64_t compute_output_duration_ns =
+      response_end_ns - compute_output_start_ns;
+  const uint64_t total_duration_ns = response_end_ns - response_start_ns;
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    it->second.compute_output_count++;
+    it->second.compute_output_duration_ns += compute_output_duration_ns;
+    it->second.success_count++;
+    it->second.success_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
+Status
+InferenceStatsAggregator::UpdateResponseFail(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
+{
+  uint64_t compute_infer_duration_ns, compute_output_duration_ns,
+      total_duration_ns;
+  if (compute_output_start_ns > 0) {
+    // output tensors copied
+    if (response_start_ns > compute_output_start_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Response start cannot happen after compute output start");
+    }
+    if (compute_output_start_ns > response_end_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Compute output start cannot happen after response end");
+    }
+    compute_infer_duration_ns = compute_output_start_ns - response_start_ns;
+    compute_output_duration_ns = response_end_ns - compute_output_start_ns;
+    total_duration_ns = response_end_ns - response_start_ns;
+  } else {
+    // no output tensors copied
+    if (response_start_ns > response_end_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Response start cannot happen after response end");
+    }
+    compute_infer_duration_ns = response_end_ns - response_start_ns;
+    compute_output_duration_ns = 0;
+    total_duration_ns = response_end_ns - response_start_ns;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    if (compute_output_duration_ns > 0) {
+      it->second.compute_output_count++;
+      it->second.compute_output_duration_ns += compute_output_duration_ns;
+    }
+    it->second.fail_count++;
+    it->second.fail_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
+Status
+InferenceStatsAggregator::UpdateResponseEmpty(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t response_end_ns)
+{
+  if (response_start_ns > response_end_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Response start cannot happen after response end");
+  }
+  const uint64_t compute_infer_duration_ns =
+      response_end_ns - response_start_ns;
+  const uint64_t total_duration_ns = response_end_ns - response_start_ns;
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    it->second.empty_response_count++;
+    it->second.empty_response_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
 void
 InferenceStatsAggregator::UpdateInferBatchStats(
     MetricModelReporter* metric_reporter, const size_t batch_size,
diff --git a/src/infer_stats.h b/src/infer_stats.h
index 86ad9a654..2ae2bc226 100644
--- a/src/infer_stats.h
+++ b/src/infer_stats.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -78,6 +78,27 @@ class InferenceStatsAggregator {
     uint64_t cache_miss_duration_ns_;
   };
 
+  struct InferResponseStats {
+    InferResponseStats()
+        : compute_infer_count(0), compute_infer_duration_ns(0),
+          compute_output_count(0), compute_output_duration_ns(0),
+          success_count(0), success_duration_ns(0), fail_count(0),
+          fail_duration_ns(0), empty_response_count(0),
+          empty_response_duration_ns(0)
+    {
+    }
+    uint64_t compute_infer_count;
+    uint64_t compute_infer_duration_ns;
+    uint64_t compute_output_count;
+    uint64_t compute_output_duration_ns;
+    uint64_t success_count;
+    uint64_t success_duration_ns;
+    uint64_t fail_count;
+    uint64_t fail_duration_ns;
+    uint64_t empty_response_count;
+    uint64_t empty_response_duration_ns;
+  };
+
   struct InferBatchStats {
     InferBatchStats()
         : count_(0), compute_input_duration_ns_(0),
@@ -100,6 +121,11 @@ class InferenceStatsAggregator {
   uint64_t InferenceCount() const { return inference_count_; }
   uint64_t ExecutionCount() const { return execution_count_; }
   const InferStats& ImmutableInferStats() const { return infer_stats_; }
+  const std::map<std::string, InferResponseStats>& ImmutableInferResponseStats()
+      const
+  {
+    return response_stats_;
+  }
   const std::map<std::string, InferBatchStats>& ImmutableInferBatchStats() const
   {
     return batch_stats_;
@@ -140,6 +166,21 @@ class InferenceStatsAggregator {
       MetricModelReporter* metric_reporter,
       const uint64_t cache_miss_duration_ns);
 
+  // Add durations to response stats for a successful response.
+  Status UpdateResponseSuccess(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t compute_output_start_ns, const uint64_t response_end_ns);
+
+  // Add durations to response stats for a failed response.
+  Status UpdateResponseFail(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t compute_output_start_ns, const uint64_t response_end_ns);
+
+  // Add durations to response stats for an empty response.
+  Status UpdateResponseEmpty(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t response_end_ns);
+
   // Add durations to batch infer stats for a batch execution.
   // 'success_request_count' is the number of success requests in the
   // batch that have infer_stats attached.
@@ -163,6 +204,7 @@ class InferenceStatsAggregator {
   uint64_t inference_count_;
   uint64_t execution_count_;
   InferStats infer_stats_;
+  std::map<std::string, InferResponseStats> response_stats_;
   std::map<std::string, InferBatchStats> batch_stats_;
 #endif  // TRITON_ENABLE_STATS
 };
diff --git a/src/tritonserver.cc b/src/tritonserver.cc
index 70bdddb75..f2d3c2624 100644
--- a/src/tritonserver.cc
+++ b/src/tritonserver.cc
@@ -1,4 +1,4 @@
-// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -2897,6 +2897,8 @@ TRITONSERVER_ServerModelStatistics(
 
     // Add infer statistic
     const auto& infer_stats = model->StatsAggregator().ImmutableInferStats();
+    const auto& infer_response_stats =
+        model->StatsAggregator().ImmutableInferResponseStats();
     const auto& infer_batch_stats =
         model->StatsAggregator().ImmutableInferBatchStats();
 
@@ -2933,6 +2935,35 @@ TRITONSERVER_ServerModelStatistics(
         metadata, inference_stats, "cache_miss", infer_stats.cache_miss_count_,
         infer_stats.cache_miss_duration_ns_);
 
+    // Add response statistics
+    triton::common::TritonJson::Value response_stats(
+        metadata, triton::common::TritonJson::ValueType::OBJECT);
+    for (const auto& res_pair : infer_response_stats) {
+      triton::common::TritonJson::Value res_stat(
+          metadata, triton::common::TritonJson::ValueType::OBJECT);
+      SetDurationStat(
+          metadata, res_stat, "compute_infer",
+          res_pair.second.compute_infer_count,
+          res_pair.second.compute_infer_duration_ns);
+      SetDurationStat(
+          metadata, res_stat, "compute_output",
+          res_pair.second.compute_output_count,
+          res_pair.second.compute_output_duration_ns);
+      SetDurationStat(
+          metadata, res_stat, "success", res_pair.second.success_count,
+          res_pair.second.success_duration_ns);
+      SetDurationStat(
+          metadata, res_stat, "fail", res_pair.second.fail_count,
+          res_pair.second.fail_duration_ns);
+      SetDurationStat(
+          metadata, res_stat, "empty_response",
+          res_pair.second.empty_response_count,
+          res_pair.second.empty_response_duration_ns);
+      RETURN_IF_STATUS_ERROR(
+          response_stats.Add(res_pair.first.c_str(), std::move(res_stat)));
+    }
+
+    // Add batch statistics
     triton::common::TritonJson::Value batch_stats(
         metadata, triton::common::TritonJson::ValueType::ARRAY);
     for (const auto& batch : infer_batch_stats) {
@@ -2967,6 +2998,8 @@ TRITONSERVER_ServerModelStatistics(
 
     RETURN_IF_STATUS_ERROR(
         model_stat.Add("inference_stats", std::move(inference_stats)));
+    RETURN_IF_STATUS_ERROR(
+        model_stat.Add("response_stats", std::move(response_stats)));
     RETURN_IF_STATUS_ERROR(
         model_stat.Add("batch_stats", std::move(batch_stats)));
     RETURN_IF_STATUS_ERROR(
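With the aggregation above wired into TRITONSERVER_ServerModelStatistics, the per-response numbers surface through the existing model statistics C API. Below is a hedged sketch of reading them in-process; the model name "my_model" is illustrative, and the JSON layout shown in the comment is what the new code suggests, assuming SetDurationStat serializes each duration stat as a count/ns pair as elsewhere in the statistics protocol.

```cpp
// Illustrative sketch only; not part of this diff. Error checks omitted.
#include <cstddef>
#include <iostream>
#include <string>

#include "triton/core/tritonserver.h"

void
PrintModelStats(TRITONSERVER_Server* server)
{
  TRITONSERVER_Message* stats = nullptr;
  // -1 requests statistics for all versions of the model.
  TRITONSERVER_ServerModelStatistics(server, "my_model", -1, &stats);

  const char* base = nullptr;
  size_t byte_size = 0;
  TRITONSERVER_MessageSerializeToJson(stats, &base, &byte_size);
  std::cout << std::string(base, byte_size) << std::endl;
  TRITONSERVER_MessageDelete(stats);

  // Expected (illustrative) shape of the new per-model section, keyed by
  // response index:
  //   "response_stats": {
  //     "0": { "compute_infer":  {"count": ..., "ns": ...},
  //            "compute_output": {"count": ..., "ns": ...},
  //            "success":        {"count": ..., "ns": ...},
  //            "fail":           {"count": ..., "ns": ...},
  //            "empty_response": {"count": ..., "ns": ...} },
  //     "1": { ... }
  //   }
}
```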
diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc
index e6efcda12..dc9bcca52 100644
--- a/src/tritonserver_stub.cc
+++ b/src/tritonserver_stub.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -947,6 +947,42 @@ TRITONBACKEND_ModelInstanceReportStatistics()
 {
 }
 TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsNew()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsSetError()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceResponseStatisticsDelete()
+{
+}
+TRITONAPI_DECLSPEC void
+TRITONBACKEND_ModelInstanceReportResponseStatistics()
+{
+}
+TRITONAPI_DECLSPEC void
 TRITONBACKEND_ModelInstanceReportBatchStatistics()
 {
 }
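As a quick sanity check of the aggregator arithmetic introduced in infer_stats.cc, suppose a backend reports one successful response with response_start = 100, compute_output_start = 150 and response_end = 180 (nanosecond timestamps). The sketch below only restates the additions UpdateResponseSuccess performs; the numbers are made up.

```cpp
#include <cassert>
#include <cstdint>

int
main()
{
  const uint64_t response_start = 100, compute_output_start = 150,
                 response_end = 180;

  // UpdateResponseSuccess splits the response into two phases plus a total.
  const uint64_t compute_infer_ns = compute_output_start - response_start;  // 50
  const uint64_t compute_output_ns = response_end - compute_output_start;   // 30
  const uint64_t total_ns = response_end - response_start;                  // 80

  assert(compute_infer_ns == 50 && compute_output_ns == 30 && total_ns == 80);

  // The aggregator then increments compute_infer_count, compute_output_count
  // and success_count by one, and adds these durations to the matching
  // *_duration_ns fields under this response's key. A failed response updates
  // fail_* instead, and an empty response (compute_output_start == 0) updates
  // only compute_infer_* and empty_response_*.
  return 0;
}
```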