From 6f5e78b0a61125da20a58bdbd40be58610072f1a Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Mon, 7 Aug 2023 12:28:14 -0700 Subject: [PATCH 1/6] Allow non-decoupled model to send response and FINAL flag separately --- qa/L0_decoupled/decoupled_test.py | 52 +++++++++++++++++++++++++++++++ qa/L0_decoupled/test.sh | 43 ++++++++++++++++++++++++- src/grpc/infer_handler.cc | 28 ++++++++++------- src/http_server.cc | 38 +++++++++++----------- src/http_server.h | 11 ++++--- src/sagemaker_server.cc | 39 ++++++++++++----------- 6 files changed, 155 insertions(+), 56 deletions(-) diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index b78170cf63..9bab55550d 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -591,5 +591,57 @@ def test_wrong_shape(self): ) +class NonDecoupledTest(tu.TestResultCollector): + def setUp(self): + self.model_name_ = "repeat_int32" + self.input_data = { + "IN": np.array([1], dtype=np.int32), + "DELAY": np.array([0], dtype=np.uint32), + "WAIT": np.array([0], dtype=np.uint32), + } + + def test_grpc(self): + inputs = [ + grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + def test_http(self): + inputs = [ + httpclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + httpclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + httpclient.InferInput("WAIT", [1], 
"UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + if __name__ == "__main__": unittest.main() diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 90bb913b6c..22e407b719 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -129,6 +129,47 @@ for trial in $TRIALS; do wait $SERVER_PID done +# Test the server frontend can merge the responses of non-decoupled model that +# sends inference response and COMPLETE flag separately. In other words, from +# the client's perspective there will still be one response. +NON_DECOUPLED_DIR=`pwd`/non_decoupled_models +rm -rf ${NON_DECOUPLED_DIR} && mkdir -p ${NON_DECOUPLED_DIR} +cp -r `pwd`/models/repeat_int32 ${NON_DECOUPLED_DIR}/. && \ + (cd ${NON_DECOUPLED_DIR}/repeat_int32 && \ + sed -i "s/decoupled: True/decoupled: False/" config.pbtxt) + +SERVER_ARGS="--model-repository=${NON_DECOUPLED_DIR}" +SERVER_LOG="./non_decoupled_inference_server.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +CLIENT_LOG=`pwd`/non_decoupled_client.log +echo "Test: NonDecoupledTest" >>$CLIENT_LOG +set +e +python $DECOUPLED_TEST NonDecoupledTest >>$CLIENT_LOG 2>&1 +if [ $? 
-ne 0 ]; then + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 2 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 021cd3cf18..769707a5c6 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -978,20 +978,20 @@ ModelInferHandler::InferResponseComplete( // notification. std::lock_guard lock(state->step_mtx_); - // Increment the callback index - state->cb_count_++; + // Increment the callback index if received valid 'iresponse' + if (iresponse != nullptr) { + state->cb_count_++; + } LOG_VERBOSE(1) << "ModelInferHandler::InferResponseComplete, " << state->unique_id_ << " step " << state->step_; - // Defer to the callback with the final response - if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { - LOG_ERROR << "[INTERNAL] ModelInfer received a response without FINAL flag"; - return; + // Allow sending 1 response and final flag separately, only mark + // non-inflight when seeing final flag + if (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) { + state->context_->EraseInflightState(state); } - state->context_->EraseInflightState(state); - #ifdef TRITON_ENABLE_TRACING state->trace_timestamps_.emplace_back(std::make_pair( "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp())); @@ -1005,6 +1005,7 @@ ModelInferHandler::InferResponseComplete( TRITONSERVER_InferenceResponseDelete(iresponse), "deleting GRPC inference response"); + state->context_->EraseInflightState(state); state->step_ = Steps::CANCELLED; LOG_VERBOSE(1) << "ModelInferHandler::InferResponseComplete, " @@ -1041,10 +1042,7 @@ ModelInferHandler::InferResponseComplete( "expected a single 
response, got " + std::to_string(state->cb_count_)) .c_str()); - } else if (iresponse == nullptr) { - err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "received an unexpected null response"); - } else { + } else if (iresponse != nullptr) { err = InferResponseCompleteCommon( state->tritonserver_, iresponse, *response, state->alloc_payload_); } @@ -1061,6 +1059,12 @@ ModelInferHandler::InferResponseComplete( TRITONSERVER_InferenceResponseDelete(iresponse), "deleting GRPC inference response"); + // Defer sending the response until FINAL flag is seen or + // there is error + if (status.ok() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + return; + } + #ifdef TRITON_ENABLE_TRACING state->trace_timestamps_.emplace_back( std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); diff --git a/src/http_server.cc b/src/http_server.cc index 75319b9484..819a306c71 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3729,25 +3729,19 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( HTTPAPIServer::InferRequestClass* infer_request = reinterpret_cast(userp); - auto response_count = infer_request->IncrementResponseCount(); - - // Defer to the callback with the final response - if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { - LOG_ERROR << "[INTERNAL] received a response without FINAL flag"; - return; + if (response != nullptr) { + ++infer_request->response_count_; } TRITONSERVER_Error* err = nullptr; - if (response_count != 0) { + if (infer_request->response_count_ != 1) { err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, std::string( - "expected a single response, got " + - std::to_string(response_count + 1)) - .c_str()); - } else if (response == nullptr) { - err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "received an unexpected null response"); - } else { + TRITONSERVER_ERROR_INTERNAL, + std::string( + "expected a single response, got " + + std::to_string(infer_request->response_count_)) + 
.c_str()); + } else if (response != nullptr) { err = infer_request->FinalizeResponse(response); } @@ -3758,6 +3752,16 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( } #endif // TRITON_ENABLE_TRACING + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(response), + "deleting inference response"); + + // Defer sending the response until FINAL flag is seen or + // there is error + if ((err == nullptr) && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + return; + } + if (err != nullptr) { EVBufferAddErrorJson(infer_request->req_->buffer_out, err); infer_request->response_code_ = HttpCodeFromError(err); @@ -3765,10 +3769,6 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( } evthr_defer( infer_request->thread_, InferRequestClass::ReplyCallback, infer_request); - - LOG_TRITONSERVER_ERROR( - TRITONSERVER_InferenceResponseDelete(response), - "deleting inference response"); } TRITONSERVER_Error* diff --git a/src/http_server.h b/src/http_server.h index 9c0643db91..4b3ca466d3 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -293,14 +293,15 @@ class HTTPAPIServer : public HTTPServer { static void ReplyCallback(evthr_t* thr, void* arg, void* shared); protected: - TRITONSERVER_Server* server_; - evhtp_request_t* req_; - evthr_t* thread_; + TRITONSERVER_Server* server_{nullptr}; + evhtp_request_t* req_{nullptr}; + evthr_t* thread_{nullptr}; - DataCompressor::Type response_compression_type_; + DataCompressor::Type response_compression_type_{ + DataCompressor::Type::IDENTITY}; // Counter to keep track of number of responses generated. 
- std::atomic response_count_; + std::atomic response_count_{0}; // Event hook for called before request deletion static evhtp_res RequestFiniHook(evhtp_request* req, void* arg); diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc index 28c8b688d3..31f77e026d 100644 --- a/src/sagemaker_server.cc +++ b/src/sagemaker_server.cc @@ -378,25 +378,19 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( SagemakerAPIServer::SagemakeInferRequestClass* infer_request = reinterpret_cast(userp); - auto response_count = infer_request->IncrementResponseCount(); - - // Defer to the callback with the final response - if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { - LOG_ERROR << "[INTERNAL] received a response without FINAL flag"; - return; + if (response != nullptr) { + ++infer_request->response_count_; } TRITONSERVER_Error* err = nullptr; - if (response_count != 0) { - err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, std::string( - "expected a single response, got " + - std::to_string(response_count + 1)) - .c_str()); - } else if (response == nullptr) { + if (infer_request->response_count_ != 1) { err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, "received an unexpected null response"); - } else { + TRITONSERVER_ERROR_INTERNAL, + std::string( + "expected a single response, got " + + std::to_string(infer_request->response_count_)) + .c_str()); + } else if (response != nullptr) { err = infer_request->FinalizeResponse(response); } @@ -407,6 +401,17 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( } #endif // TRITON_ENABLE_TRACING + + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(response), + "deleting inference response"); + + // Defer sending the response until FINAL flag is seen or + // there is error + if ((err == nullptr) && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + return; + } + if (err != nullptr) { EVBufferAddErrorJson(infer_request->req_->buffer_out, err); // 
[FIXME] In http_server.cc, error handling is enhanced to reporting @@ -422,10 +427,6 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( TRITONSERVER_ErrorDelete(err); } evthr_defer(infer_request->thread_, ReplyCallback, infer_request); - - LOG_TRITONSERVER_ERROR( - TRITONSERVER_InferenceResponseDelete(response), - "deleting inference response"); } void From f83094d75faa6c9bc67ffba4b25a7dd284b6fd9e Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Tue, 20 Feb 2024 15:52:43 -0800 Subject: [PATCH 2/6] Update copyright --- qa/L0_decoupled/decoupled_test.py | 2 +- qa/L0_decoupled/test.sh | 2 +- src/grpc/infer_handler.cc | 2 +- src/http_server.cc | 2 +- src/http_server.h | 2 +- src/sagemaker_server.cc | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 9bab55550d..1f76f4845b 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 22e407b719..98ad134d8b 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 769707a5c6..e047de805a 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/http_server.cc b/src/http_server.cc index 819a306c71..5c6e0d3747 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1,4 +1,4 @@ -// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/http_server.h b/src/http_server.h index 4b3ca466d3..8e87405c87 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc index 31f77e026d..f756657d75 100644 --- a/src/sagemaker_server.cc +++ b/src/sagemaker_server.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions From ec8d12a9459daffcbc937c3a18f6227f64d94f0a Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Tue, 20 Feb 2024 16:17:19 -0800 Subject: [PATCH 3/6] Defer sending error until FINAL flag is seen to avoid invalid reference --- src/grpc/infer_handler.cc | 7 +++---- src/grpc/infer_handler.h | 2 ++ src/http_server.cc | 11 +++++------ src/sagemaker_server.cc | 10 +++++----- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index e047de805a..f977543896 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -1051,8 +1051,7 @@ ModelInferHandler::InferResponseComplete( response->Clear(); } - ::grpc::Status status; - GrpcStatusUtil::Create(&status, err); + GrpcStatusUtil::Create(&state->status_, err); TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( @@ -1061,7 +1060,7 @@ ModelInferHandler::InferResponseComplete( // Defer sending the response until FINAL flag is seen or // there is error - if (status.ok() && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { return; } @@ -1071,7 +1070,7 @@ ModelInferHandler::InferResponseComplete( #endif // TRITON_ENABLE_TRACING state->step_ = COMPLETE; - state->context_->responder_->Finish(*response, status, state); + state->context_->responder_->Finish(*response, state->status_, state); if (response_created) { delete response; } diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 42a9437c77..84095a188b 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -1032,6 +1032,7 @@ class InferHandlerState { unique_id_ = NEXT_UNIQUE_ID; context_ = context; step_ = start_step; + status_ = ::grpc::Status{}; cb_count_ = 0; is_decoupled_ = false; complete_ = false; @@ -1100,6 +1101,7 @@ class InferHandlerState { bool 
is_decoupled_ = false; StateParameters parameters_; + ::grpc::Status status_; std::atomic cb_count_; bool complete_; diff --git a/src/http_server.cc b/src/http_server.cc index 5c6e0d3747..2ccfdef95e 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3756,17 +3756,16 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( TRITONSERVER_InferenceResponseDelete(response), "deleting inference response"); - // Defer sending the response until FINAL flag is seen or - // there is error - if ((err == nullptr) && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { - return; - } - if (err != nullptr) { EVBufferAddErrorJson(infer_request->req_->buffer_out, err); infer_request->response_code_ = HttpCodeFromError(err); TRITONSERVER_ErrorDelete(err); } + + // Defer sending the response until FINAL flag is seen + if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + return; + } evthr_defer( infer_request->thread_, InferRequestClass::ReplyCallback, infer_request); } diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc index f756657d75..a214ff99b6 100644 --- a/src/sagemaker_server.cc +++ b/src/sagemaker_server.cc @@ -406,11 +406,6 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( TRITONSERVER_InferenceResponseDelete(response), "deleting inference response"); - // Defer sending the response until FINAL flag is seen or - // there is error - if ((err == nullptr) && (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { - return; - } if (err != nullptr) { EVBufferAddErrorJson(infer_request->req_->buffer_out, err); @@ -426,6 +421,11 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( } TRITONSERVER_ErrorDelete(err); } + + // Defer sending the response until FINAL flag is seen + if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { + return; + } evthr_defer(infer_request->thread_, ReplyCallback, infer_request); } From 1151a4d6b4a4a9e9cbcfac58271095cb48a7c3d3 Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Tue, 
5 Mar 2024 18:02:56 -0800 Subject: [PATCH 4/6] Move timestamp capture location --- src/http_server.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/http_server.cc b/src/http_server.cc index 2ccfdef95e..32c508d236 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3745,12 +3745,6 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( err = infer_request->FinalizeResponse(response); } -#ifdef TRITON_ENABLE_TRACING - if (infer_request->trace_ != nullptr) { - infer_request->trace_->CaptureTimestamp( - "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); - } -#endif // TRITON_ENABLE_TRACING LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(response), @@ -3766,6 +3760,12 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { return; } +#ifdef TRITON_ENABLE_TRACING + if (infer_request->trace_ != nullptr) { + infer_request->trace_->CaptureTimestamp( + "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); + } +#endif // TRITON_ENABLE_TRACING evthr_defer( infer_request->thread_, InferRequestClass::ReplyCallback, infer_request); } From d905aeb8f84767e20d7f1871dc16afa7c69c06ea Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Thu, 7 Mar 2024 14:48:51 -0800 Subject: [PATCH 5/6] Delay time-point of response complete timestamp in gRPC and SageMaker endpoint --- src/grpc/infer_handler.cc | 10 +++++----- src/sagemaker_server.cc | 13 ++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index f977543896..cde705cf6e 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -992,11 +992,6 @@ ModelInferHandler::InferResponseComplete( state->context_->EraseInflightState(state); } -#ifdef TRITON_ENABLE_TRACING - state->trace_timestamps_.emplace_back(std::make_pair( - "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp())); -#endif // 
TRITON_ENABLE_TRACING - // If gRPC Stream is cancelled then no need of forming and returning // a response. if (state->IsGrpcContextCancelled()) { @@ -1064,6 +1059,11 @@ ModelInferHandler::InferResponseComplete( return; } +#ifdef TRITON_ENABLE_TRACING + state->trace_timestamps_.emplace_back(std::make_pair( + "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp())); +#endif // TRITON_ENABLE_TRACING + #ifdef TRITON_ENABLE_TRACING state->trace_timestamps_.emplace_back( std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp())); diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc index a214ff99b6..daedce4f4f 100644 --- a/src/sagemaker_server.cc +++ b/src/sagemaker_server.cc @@ -394,13 +394,6 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( err = infer_request->FinalizeResponse(response); } -#ifdef TRITON_ENABLE_TRACING - if (infer_request->trace_ != nullptr) { - infer_request->trace_->CaptureTimestamp( - "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); - } -#endif // TRITON_ENABLE_TRACING - LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(response), @@ -426,6 +419,12 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { return; } +#ifdef TRITON_ENABLE_TRACING + if (infer_request->trace_ != nullptr) { + infer_request->trace_->CaptureTimestamp( + "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); + } +#endif // TRITON_ENABLE_TRACING evthr_defer(infer_request->thread_, ReplyCallback, infer_request); } From bb7efdb1a6e69b83dbd17c230d5a5a6754a16995 Mon Sep 17 00:00:00 2001 From: GuanLuo Date: Thu, 7 Mar 2024 17:59:57 -0800 Subject: [PATCH 6/6] Move location of RESPONSE_COMPLETE timestamp capture to better align with the meaning. 
--- src/grpc/infer_handler.cc | 8 ++++---- src/http_server.cc | 12 ++++++------ src/sagemaker_server.cc | 12 ++++++------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index cde705cf6e..5c60fb46ed 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -1040,6 +1040,10 @@ ModelInferHandler::InferResponseComplete( } else if (iresponse != nullptr) { err = InferResponseCompleteCommon( state->tritonserver_, iresponse, *response, state->alloc_payload_); +#ifdef TRITON_ENABLE_TRACING + state->trace_timestamps_.emplace_back(std::make_pair( + "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp())); +#endif // TRITON_ENABLE_TRACING } if (err != nullptr) { @@ -1059,10 +1063,6 @@ ModelInferHandler::InferResponseComplete( return; } -#ifdef TRITON_ENABLE_TRACING - state->trace_timestamps_.emplace_back(std::make_pair( - "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp())); -#endif // TRITON_ENABLE_TRACING #ifdef TRITON_ENABLE_TRACING state->trace_timestamps_.emplace_back( diff --git a/src/http_server.cc b/src/http_server.cc index 32c508d236..f5b791c06e 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3743,6 +3743,12 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( .c_str()); } else if (response != nullptr) { err = infer_request->FinalizeResponse(response); +#ifdef TRITON_ENABLE_TRACING + if (infer_request->trace_ != nullptr) { + infer_request->trace_->CaptureTimestamp( + "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); + } +#endif // TRITON_ENABLE_TRACING } @@ -3760,12 +3766,6 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete( if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { return; } -#ifdef TRITON_ENABLE_TRACING - if (infer_request->trace_ != nullptr) { - infer_request->trace_->CaptureTimestamp( - "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); - } -#endif // TRITON_ENABLE_TRACING evthr_defer( 
infer_request->thread_, InferRequestClass::ReplyCallback, infer_request); } diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc index daedce4f4f..b81ebe572a 100644 --- a/src/sagemaker_server.cc +++ b/src/sagemaker_server.cc @@ -392,6 +392,12 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( .c_str()); } else if (response != nullptr) { err = infer_request->FinalizeResponse(response); +#ifdef TRITON_ENABLE_TRACING + if (infer_request->trace_ != nullptr) { + infer_request->trace_->CaptureTimestamp( + "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); + } +#endif // TRITON_ENABLE_TRACING } @@ -419,12 +425,6 @@ SagemakerAPIServer::SagemakeInferRequestClass::InferResponseComplete( if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) { return; } -#ifdef TRITON_ENABLE_TRACING - if (infer_request->trace_ != nullptr) { - infer_request->trace_->CaptureTimestamp( - "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()); - } -#endif // TRITON_ENABLE_TRACING evthr_defer(infer_request->thread_, ReplyCallback, infer_request); }