triton-inference-server · GuanLuo · Mar 8, 2024 · Aug 7, 2023 · Feb 20, 2024 · Feb 21, 2024
diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -591,5 +591,57 @@ def test_wrong_shape(self):
         )
 
 
+class NonDecoupledTest(tu.TestResultCollector):
+    def setUp(self):
+        self.model_name_ = "repeat_int32"
+        self.input_data = {
+            "IN": np.array([1], dtype=np.int32),
+            "DELAY": np.array([0], dtype=np.uint32),
+            "WAIT": np.array([0], dtype=np.uint32),
+        }
+
+    def test_grpc(self):
+        inputs = [
+            grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy(
+                self.input_data["IN"]
+            ),
+            grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy(
+                self.input_data["DELAY"]
+            ),
+            grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy(
+                self.input_data["WAIT"]
+            ),
+        ]
+
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        # Expect the inference is successful
+        res = triton_client.infer(model_name=self.model_name_, inputs=inputs)
+        self.assertEqual(1, res.as_numpy("OUT")[0])
+        self.assertEqual(0, res.as_numpy("IDX")[0])
+
+    def test_http(self):
+        inputs = [
+            httpclient.InferInput("IN", [1], "INT32").set_data_from_numpy(
+                self.input_data["IN"]
+            ),
+            httpclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy(
+                self.input_data["DELAY"]
+            ),
+            httpclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy(
+                self.input_data["WAIT"]
+            ),
+        ]
+
+        triton_client = httpclient.InferenceServerClient(
+            url="localhost:8000", verbose=True
+        )
+        # Expect the inference is successful
+        res = triton_client.infer(model_name=self.model_name_, inputs=inputs)
+        self.assertEqual(1, res.as_numpy("OUT")[0])
+        self.assertEqual(0, res.as_numpy("IDX")[0])
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -129,6 +129,47 @@ for trial in $TRIALS; do
   wait $SERVER_PID
 done
 
+# Test the server frontend can merge the responses of non-decoupled model that
+# sends inference response and COMPLETE flag separately. In other words, from
+# the client's perspective there will still be one response.
+NON_DECOUPLED_DIR=`pwd`/non_decoupled_models
+rm -rf ${NON_DECOUPLED_DIR} && mkdir -p ${NON_DECOUPLED_DIR}
+cp -r `pwd`/models/repeat_int32 ${NON_DECOUPLED_DIR}/. && \
+    (cd ${NON_DECOUPLED_DIR}/repeat_int32 && \
+        sed -i "s/decoupled: True/decoupled: False/" config.pbtxt)
+
+SERVER_ARGS="--model-repository=${NON_DECOUPLED_DIR}"
+SERVER_LOG="./non_decoupled_inference_server.log"
+
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+CLIENT_LOG=`pwd`/non_decoupled_client.log
+echo "Test: NonDecoupledTest" >>$CLIENT_LOG
+set +e
+python $DECOUPLED_TEST NonDecoupledTest >>$CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" >>$CLIENT_LOG
+        echo -e "\n***\n*** Test NonDecoupledTest Failed\n***"
+        RET=1
+else
+    check_test_results $TEST_RESULT_FILE 2
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
 if [ $RET -eq 0 ]; then
   echo -e "\n***\n*** Test Passed\n***"
 else

diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc
@@ -1,4 +1,4 @@
-// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -978,25 +978,20 @@ ModelInferHandler::InferResponseComplete(
   // notification.
   std::lock_guard<std::recursive_mutex> lock(state->step_mtx_);
 
-  // Increment the callback index
-  state->cb_count_++;
+  // Increment the callback index if received valid 'iresponse'
+  if (iresponse != nullptr) {
+    state->cb_count_++;
+  }
 
   LOG_VERBOSE(1) << "ModelInferHandler::InferResponseComplete, "
                  << state->unique_id_ << " step " << state->step_;
 
-  // Defer to the callback with the final response
-  if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
-    LOG_ERROR << "[INTERNAL] ModelInfer received a response without FINAL flag";
-    return;
+  // Allow sending 1 response and final flag separately, only mark
+  // non-inflight when seeing final flag
+  if (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) {
+    state->context_->EraseInflightState(state);
   }
 
-  state->context_->EraseInflightState(state);
-
-#ifdef TRITON_ENABLE_TRACING
-  state->trace_timestamps_.emplace_back(std::make_pair(
-      "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()));
-#endif  // TRITON_ENABLE_TRACING
-
   // If gRPC Stream is cancelled then no need of forming and returning
   // a response.
   if (state->IsGrpcContextCancelled()) {
@@ -1005,6 +1000,7 @@ ModelInferHandler::InferResponseComplete(
         TRITONSERVER_InferenceResponseDelete(iresponse),
         "deleting GRPC inference response");
 
+    state->context_->EraseInflightState(state);
     state->step_ = Steps::CANCELLED;
 
     LOG_VERBOSE(1) << "ModelInferHandler::InferResponseComplete, "
@@ -1041,33 +1037,40 @@ ModelInferHandler::InferResponseComplete(
                                          "expected a single response, got " +
                                          std::to_string(state->cb_count_))
                                          .c_str());
-  } else if (iresponse == nullptr) {
-    err = TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL, "received an unexpected null response");
-  } else {
+  } else if (iresponse != nullptr) {
     err = InferResponseCompleteCommon<inference::ModelInferResponse>(
         state->tritonserver_, iresponse, *response, state->alloc_payload_);
+#ifdef TRITON_ENABLE_TRACING
+    state->trace_timestamps_.emplace_back(std::make_pair(
+        "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp()));
+#endif  // TRITON_ENABLE_TRACING
   }
 
   if (err != nullptr) {
     response->Clear();
   }
 
-  ::grpc::Status status;
-  GrpcStatusUtil::Create(&status, err);
+  GrpcStatusUtil::Create(&state->status_, err);
   TRITONSERVER_ErrorDelete(err);
 
   LOG_TRITONSERVER_ERROR(
       TRITONSERVER_InferenceResponseDelete(iresponse),
       "deleting GRPC inference response");
 
+  // Defer sending the response until FINAL flag is seen or
+  // there is error
+  if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
+    return;
+  }
+
+
 #ifdef TRITON_ENABLE_TRACING
   state->trace_timestamps_.emplace_back(
       std::make_pair("GRPC_SEND_START", TraceManager::CaptureTimestamp()));
 #endif  // TRITON_ENABLE_TRACING
 
   state->step_ = COMPLETE;
-  state->context_->responder_->Finish(*response, status, state);
+  state->context_->responder_->Finish(*response, state->status_, state);
   if (response_created) {
     delete response;
   }

diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h
@@ -1032,6 +1032,7 @@ class InferHandlerState {
     unique_id_ = NEXT_UNIQUE_ID;
     context_ = context;
     step_ = start_step;
+    status_ = ::grpc::Status{};
     cb_count_ = 0;
     is_decoupled_ = false;
     complete_ = false;
@@ -1100,6 +1101,7 @@ class InferHandlerState {
   bool is_decoupled_ = false;
   StateParameters parameters_;
 
+  ::grpc::Status status_;
   std::atomic<uint32_t> cb_count_;
   bool complete_;
 

diff --git a/src/http_server.cc b/src/http_server.cc
@@ -1,4 +1,4 @@
-// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -3729,46 +3729,45 @@ HTTPAPIServer::InferRequestClass::InferResponseComplete(
   HTTPAPIServer::InferRequestClass* infer_request =
       reinterpret_cast<HTTPAPIServer::InferRequestClass*>(userp);
 
-  auto response_count = infer_request->IncrementResponseCount();
-
-  // Defer to the callback with the final response
-  if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
-    LOG_ERROR << "[INTERNAL] received a response without FINAL flag";
-    return;
+  if (response != nullptr) {
+    ++infer_request->response_count_;
   }
 
   TRITONSERVER_Error* err = nullptr;
-  if (response_count != 0) {
+  if (infer_request->response_count_ != 1) {
     err = TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL, std::string(
-                                         "expected a single response, got " +
-                                         std::to_string(response_count + 1))
-                                         .c_str());
-  } else if (response == nullptr) {
-    err = TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL, "received an unexpected null response");
-  } else {
+        TRITONSERVER_ERROR_INTERNAL,
+        std::string(
+            "expected a single response, got " +
+            std::to_string(infer_request->response_count_))
+            .c_str());
+  } else if (response != nullptr) {
     err = infer_request->FinalizeResponse(response);
-  }
-
 #ifdef TRITON_ENABLE_TRACING
-  if (infer_request->trace_ != nullptr) {
-    infer_request->trace_->CaptureTimestamp(
-        "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp());
-  }
+    if (infer_request->trace_ != nullptr) {
+      infer_request->trace_->CaptureTimestamp(
+          "INFER_RESPONSE_COMPLETE", TraceManager::CaptureTimestamp());
+    }
 #endif  // TRITON_ENABLE_TRACING
+  }
+
+
+  LOG_TRITONSERVER_ERROR(
+      TRITONSERVER_InferenceResponseDelete(response),
+      "deleting inference response");
 
   if (err != nullptr) {
     EVBufferAddErrorJson(infer_request->req_->buffer_out, err);
     infer_request->response_code_ = HttpCodeFromError(err);
     TRITONSERVER_ErrorDelete(err);
   }
+
+  // Defer sending the response until FINAL flag is seen
+  if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
+    return;
+  }
   evthr_defer(
       infer_request->thread_, InferRequestClass::ReplyCallback, infer_request);
-
-  LOG_TRITONSERVER_ERROR(
-      TRITONSERVER_InferenceResponseDelete(response),
-      "deleting inference response");
 }
 
 TRITONSERVER_Error*

diff --git a/src/http_server.h b/src/http_server.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -293,14 +293,15 @@ class HTTPAPIServer : public HTTPServer {
     static void ReplyCallback(evthr_t* thr, void* arg, void* shared);
 
    protected:
-    TRITONSERVER_Server* server_;
-    evhtp_request_t* req_;
-    evthr_t* thread_;
+    TRITONSERVER_Server* server_{nullptr};
+    evhtp_request_t* req_{nullptr};
+    evthr_t* thread_{nullptr};
 
-    DataCompressor::Type response_compression_type_;
+    DataCompressor::Type response_compression_type_{
+        DataCompressor::Type::IDENTITY};
 
     // Counter to keep track of number of responses generated.
-    std::atomic<uint32_t> response_count_;
+    std::atomic<uint32_t> response_count_{0};
 
     // Event hook for called before request deletion
     static evhtp_res RequestFiniHook(evhtp_request* req, void* arg);