Add response statistics #325

Merged (10 commits, Feb 17, 2024)
168 changes: 160 additions & 8 deletions include/triton/core/tritonbackend.h
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -65,6 +65,7 @@ struct TRITONBACKEND_Response;
struct TRITONBACKEND_Backend;
struct TRITONBACKEND_Model;
struct TRITONBACKEND_ModelInstance;
struct TRITONBACKEND_ModelInstanceResponseStatistics;
struct TRITONBACKEND_BackendAttribute;
struct TRITONBACKEND_Batcher;

@@ -94,7 +95,7 @@ struct TRITONBACKEND_Batcher;
/// }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 18
#define TRITONBACKEND_API_VERSION_MINOR 19

/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
@@ -761,8 +762,9 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput(
/// \param send_flags Flags associated with the response. \see
/// TRITONSERVER_ResponseCompleteFlag. \see
/// TRITONSERVER_InferenceResponseCompleteFn_t.
/// \param error The TRITONSERVER_Error to send if the response is an
/// error, or nullptr if the response is successful.
/// \param error The TRITONSERVER_Error to send if the response is an error, or
/// nullptr if the response is successful. The caller retains ownership of the
/// error object and must free it with TRITONSERVER_ErrorDelete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend(
TRITONBACKEND_Response* response, const uint32_t send_flags,
@@ -1319,16 +1321,16 @@ TRITONBACKEND_ModelInstanceReportMemoryUsage(
/// TRITONBACKEND_ModelInstanceExecute.
///
/// TRITONBACKEND_ModelInstanceExecute()
/// CAPTURE TIMESPACE (exec_start_ns)
/// CAPTURE TIMESTAMP (exec_start_ns)
/// < process input tensors to prepare them for inference
/// execution, including copying the tensors to/from GPU if
/// necessary>
/// CAPTURE TIMESPACE (compute_start_ns)
/// CAPTURE TIMESTAMP (compute_start_ns)
/// < perform inference computations to produce outputs >
/// CAPTURE TIMESPACE (compute_end_ns)
/// CAPTURE TIMESTAMP (compute_end_ns)
/// < allocate output buffers and extract output tensors, including
/// copying the tensors to/from GPU if necessary>
/// CAPTURE TIMESPACE (exec_end_ns)
/// CAPTURE TIMESTAMP (exec_end_ns)
/// return
///
/// Note that these statistics are associated with a valid
@@ -1356,6 +1358,156 @@ TRITONBACKEND_ModelInstanceReportStatistics(
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns);

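For orientation, here is a minimal sketch of the timestamp capture described above, assuming the leading TRITONBACKEND_ModelInstanceReportStatistics parameters (instance, request, success, exec_start_ns) that are truncated from this excerpt follow the existing header; CaptureTimeNs and ReportRequestTiming are illustrative placeholders, and error handling is omitted.

#include <chrono>
#include <cstdint>

#include "triton/core/tritonbackend.h"

// Illustrative helper: steady-clock timestamp in nanoseconds, as recommended
// by the documentation above.
static uint64_t
CaptureTimeNs()
{
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}

// Sketch of the per-request timing inside TRITONBACKEND_ModelInstanceExecute.
static void
ReportRequestTiming(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request)
{
  const uint64_t exec_start_ns = CaptureTimeNs();
  // < process input tensors, including any host/device copies >
  const uint64_t compute_start_ns = CaptureTimeNs();
  // < perform inference computations to produce outputs >
  const uint64_t compute_end_ns = CaptureTimeNs();
  // < allocate output buffers and extract output tensors >
  const uint64_t exec_end_ns = CaptureTimeNs();

  TRITONBACKEND_ModelInstanceReportStatistics(
      instance, request, true /* success */, exec_start_ns, compute_start_ns,
      compute_end_ns, exec_end_ns);
}
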
/// Create a new inference response statistics object.
///
/// \param response_statistics The new response statistics object to be created.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsNew(
TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics);

/// Delete an inference response statistics object.
///
/// The caller retains ownership of the objects set on the deleted response
/// statistics object and must free them separately.
///
/// \param response_statistics The response statistics object to be deleted.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics);

/// Set the model instance on an inference response statistics object.
///
/// \param response_statistics The response statistics object.
/// \param model_instance The model instance.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONBACKEND_ModelInstance* model_instance);

/// Set the response factory on an inference response statistics object.
///
/// \param response_statistics The response statistics object.
/// \param response_factory The response factory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONBACKEND_ResponseFactory* response_factory);

/// Set the response start time on an inference response statistics object.
///
/// All timestamps should be reported in nanoseconds and collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
///
/// For consistency of measurement across different backends, the timestamps
/// should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
/// TRITONBACKEND_ModelInstanceExecute()
/// < start of this response >
/// CAPTURE TIMESTAMP (response_start)
/// < generate this response >
/// CAPTURE TIMESTAMP (compute_output_start)
/// < allocate output buffers and extract output tensors, including copying
/// the tensors to/from GPU if necessary >
/// CAPTURE TIMESTAMP (response_end)
/// < end of this response >
/// return
///
/// \param response_statistics The response statistics object.
/// \param response_start The response start time.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t response_start);

/// Set the compute output start time on an inference response statistics
/// object.
///
/// Do NOT set this compute output start time (or set it to 0) if reporting an
/// empty response.
///
/// All timestamps should be reported in nanoseconds and collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
///
/// For consistency of measurement across different backends, the timestamps
/// should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
/// TRITONBACKEND_ModelInstanceExecute()
/// < start of this response >
/// CAPTURE TIMESTAMP (response_start)
/// < generate this response >
/// CAPTURE TIMESTAMP (compute_output_start)
/// < allocate output buffers and extract output tensors, including copying
/// the tensors to/from GPU if necessary >
/// CAPTURE TIMESTAMP (response_end)
/// < end of this response >
/// return
///
/// \param response_statistics The response statistics object.
/// \param compute_output_start The compute output start time.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t compute_output_start);

/// Set the response end time on an inference response statistics object.
///
/// All timestamps should be reported in nanoseconds and collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
///
/// For consistency of measurement across different backends, the timestamps
/// should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
/// TRITONBACKEND_ModelInstanceExecute()
/// < start of this response >
/// CAPTURE TIMESTAMP (response_start)
/// < generate this response >
/// CAPTURE TIMESTAMP (compute_output_start)
/// < allocate output buffers and extract output tensors, including copying
/// the tensors to/from GPU if necessary >
/// CAPTURE TIMESTAMP (response_end)
/// < end of this response >
/// return
///
/// \param response_statistics The response statistics object.
/// \param response_end The response end time.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t response_end);

/// Set the error on an inference response statistics object.
///
/// Use the same error object passed to TRITONBACKEND_ResponseSend. \see
/// TRITONBACKEND_ResponseSend.
///
/// \param response_statistics The response statistics object.
/// \param error The error object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONSERVER_Error* error);

/// Record statistics for an inference response.
///
/// The caller retains ownership of the response statistics object and must
/// free it after this function returns.
///
/// \param response_statistics The statistics to be recorded.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportResponseStatistics(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics);

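Putting the new calls together, below is a minimal usage sketch under the assumption that a decoupled backend records one entry per response right after TRITONBACKEND_ResponseSend; the helper name ReportOneResponse and its arguments are illustrative rather than part of this header, the timestamps are assumed to be captured as described above, and the returned TRITONSERVER_Error objects are ignored for brevity.

#include <cstdint>

#include "triton/core/tritonbackend.h"

// Illustrative sketch: record statistics for a single response. All setters
// and the report/delete calls come from the API added above.
static void
ReportOneResponse(
    TRITONBACKEND_ModelInstance* instance,
    TRITONBACKEND_ResponseFactory* factory, uint64_t response_start,
    uint64_t compute_output_start, uint64_t response_end,
    TRITONSERVER_Error* send_error)
{
  TRITONBACKEND_ModelInstanceResponseStatistics* stats = nullptr;
  TRITONBACKEND_ModelInstanceResponseStatisticsNew(&stats);

  TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
      stats, instance);
  TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
      stats, factory);
  TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
      stats, response_start);
  // Leave the compute output start unset (0) when reporting an empty response.
  if (compute_output_start > 0) {
    TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
        stats, compute_output_start);
  }
  TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
      stats, response_end);
  if (send_error != nullptr) {
    // Same error object that was passed to TRITONBACKEND_ResponseSend.
    TRITONBACKEND_ModelInstanceResponseStatisticsSetError(stats, send_error);
  }

  TRITONBACKEND_ModelInstanceReportResponseStatistics(stats);

  // The statistics object itself is freed here; the objects set on it remain
  // owned by the caller.
  TRITONBACKEND_ModelInstanceResponseStatisticsDelete(stats);
}
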
/// Record statistics for the execution of an entire batch of
/// inference requests.
///
161 changes: 160 additions & 1 deletion src/backend_model_instance.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -795,6 +795,23 @@ TritonModelInstance::TritonBackendThread::BackendThread()
LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}

// Opaque object for the response statistics C-API
struct ModelInstanceResponseStatistics {
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics()
: model_instance(nullptr), response_factory(nullptr), response_start(0),
compute_output_start(0), response_end(0), error(nullptr)
{
}
TritonModelInstance* model_instance;
std::shared_ptr<InferenceResponseFactory>* response_factory;
uint64_t response_start;
uint64_t compute_output_start;
uint64_t response_end;
TRITONSERVER_Error* error;
#endif // TRITON_ENABLE_STATS
};

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
@@ -953,6 +970,148 @@ TRITONBACKEND_ModelInstanceReportStatistics(
return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsNew(
TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics)
{
#ifdef TRITON_ENABLE_STATS
*response_statistics =
reinterpret_cast<TRITONBACKEND_ModelInstanceResponseStatistics*>(
new ModelInstanceResponseStatistics());
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsDelete(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
{
#ifdef TRITON_ENABLE_STATS
delete reinterpret_cast<ModelInstanceResponseStatistics*>(
response_statistics);
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONBACKEND_ModelInstance* model_instance)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->model_instance = reinterpret_cast<TritonModelInstance*>(model_instance);
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONBACKEND_ResponseFactory* response_factory)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
response_factory);
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t response_start)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->response_start = response_start;
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t compute_output_start)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->compute_output_start = compute_output_start;
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
uint64_t response_end)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->response_end = response_end;
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceResponseStatisticsSetError(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics,
TRITONSERVER_Error* error)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);
rs->error = error;
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportResponseStatistics(
TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics)
{
#ifdef TRITON_ENABLE_STATS
ModelInstanceResponseStatistics* rs =
reinterpret_cast<ModelInstanceResponseStatistics*>(response_statistics);

InferenceStatsAggregator* sa =
rs->model_instance->Model()->MutableStatsAggregator();
std::string key =
std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex());

if (rs->error == nullptr) {
if (rs->compute_output_start > 0) {
RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseSuccess(
key, rs->response_start, rs->compute_output_start, rs->response_end));
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
sa->UpdateResponseEmpty(key, rs->response_start, rs->response_end));
}
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseFail(
key, rs->response_start, rs->compute_output_start, rs->response_end));
}
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,