feat: Add GRPC error codes to GRPC streaming if enabled by user. #7499

Merged: 35 commits, Aug 16, 2024
Changes from 18 commits
Commits (35)
33c1e93
Park
indrajit96 Jul 25, 2024
3affd3a
Park
indrajit96 Jul 30, 2024
7f86c6a
Working Set
indrajit96 Jul 31, 2024
b65fd74
Working Set
indrajit96 Aug 1, 2024
36e461f
Working Set
indrajit96 Aug 1, 2024
bd549b1
Working Set
indrajit96 Aug 1, 2024
85ccd72
Tests Added
indrajit96 Aug 2, 2024
faa742c
Merge branch 'main' into ibhosale_grpc_streaming
indrajit96 Aug 2, 2024
37b15e8
Clean up
indrajit96 Aug 2, 2024
a65f8c3
Tests updated
indrajit96 Aug 5, 2024
080985c
Zombie request fixed
indrajit96 Aug 6, 2024
cc34d41
Pre Commit fixed
indrajit96 Aug 6, 2024
40344d5
Review Comments cleaned up, crash fixed in multi threading
indrajit96 Aug 7, 2024
f40f695
Review Comments fixed
indrajit96 Aug 7, 2024
0792bc1
Pre-Commit fixed
indrajit96 Aug 7, 2024
cdd60bf
Park
indrajit96 Aug 8, 2024
6661f21
Simpler design piggyback on NotifywhenDone()
indrajit96 Aug 9, 2024
6342524
Merge branch 'main' into ibhosale_grpc_streaming
indrajit96 Aug 9, 2024
22e5359
Cleanup unwanted states from old design
indrajit96 Aug 9, 2024
a31ba09
Improved tests around triton_grpc_error mode
indrajit96 Aug 12, 2024
af451a2
Comments resolved
indrajit96 Aug 14, 2024
0cb7db0
Comments resolved
indrajit96 Aug 14, 2024
8ea2647
New class gRPCErrorTracker created
indrajit96 Aug 14, 2024
e8c3242
Docs Updated
indrajit96 Aug 14, 2024
72097e3
Pipeline test
indrajit96 Aug 15, 2024
350af25
Resolve Unused local variable warning
indrajit96 Aug 15, 2024
e473f29
GRPC Cleanup tests updated for triton grpc error
indrajit96 Aug 16, 2024
370c449
Revert "GRPC Cleanup tests updated for triton grpc error"
indrajit96 Aug 16, 2024
b87c3fc
GRPC Cleanup tests updated for triton grpc error
indrajit96 Aug 16, 2024
cb548ff
Pre-Commit format
indrajit96 Aug 16, 2024
1b6b3a7
Devel build fix
indrajit96 Aug 16, 2024
70ce279
Streamline new tests
indrajit96 Aug 16, 2024
631b352
Merge branch 'main' into ibhosale_grpc_streaming
indrajit96 Aug 16, 2024
887aaa2
PR comments fixed and main merged
indrajit96 Aug 16, 2024
0e7670c
DockerFile fixed
indrajit96 Aug 16, 2024
64 changes: 64 additions & 0 deletions qa/L0_backend_python/lifecycle/lifecycle_test.py
@@ -241,6 +241,70 @@ def test_infer_pymodel_error(self):
initial_metrics_value,
)

def test_triton_grpc_error_error_on(self):
model_name = "execute_grpc_error"
shape = [2, 2]
number_of_requests = 2
user_data = UserData()
triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001")
metadata = {"triton_grpc_error": "true"}

rmccorm4 (Contributor), Aug 7, 2024:
  1. Can we add docs on this feature?
  2. Was there any discussion on using headers vs request parameters etc for it? I guess it's at the stream level, but just wanted to double check if headers were the desired choice

indrajit96 (Contributor, Author), Aug 7, 2024:
There is a draft TEP (not required, but I thought it would be a good idea): https://docs.google.com/document/d/1TfNAMYLsPuLrduBtAKo64YqV55IWQhcn6zayc_CBY18/edit?pli=1
Headers seemed logical after discussing all possible options with the SA.
Not sure how we would do it via request parameters.
We need the triton_grpc_error flag set only once, when the stream starts, so headers seemed logical and also have no backward-compatibility issues.
Updated the PR with the TEP link.

triton_client.start_stream(
callback=partial(callback, user_data), headers=metadata
)

with self._shm_leak_detector.Probe() as shm_probe:
input_datas = []
for i in range(number_of_requests):
input_data = np.random.randn(*shape).astype(np.float32)
input_datas.append(input_data)
inputs = [
grpcclient.InferInput(
"IN", input_data.shape, np_to_triton_dtype(input_data.dtype)
)
]
inputs[0].set_data_from_numpy(input_data)
triton_client.async_stream_infer(model_name=model_name, inputs=inputs)
result = user_data._completed_requests.get()
if i == 0:
# Stream is not killed
output_data = result.as_numpy("OUT")
self.assertIsNotNone(output_data, "error: expected 'OUT'")
elif i == 1:
# execute_grpc_error intentionally returns an error on the 2nd request; in triton_grpc_error mode it surfaces as StatusCode.INTERNAL
self.assertIsInstance(result, InferenceServerException)
self.assertEqual(str(result.status()), "StatusCode.INTERNAL")

def test_triton_grpc_error_error_off(self):
model_name = "execute_grpc_error"
shape = [2, 2]
number_of_requests = 4
user_data = UserData()
triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001")
triton_client.start_stream(callback=partial(callback, user_data))

with self._shm_leak_detector.Probe() as shm_probe:
input_datas = []
for i in range(number_of_requests):
input_data = np.random.randn(*shape).astype(np.float32)
input_datas.append(input_data)
inputs = [
grpcclient.InferInput(
"IN", input_data.shape, np_to_triton_dtype(input_data.dtype)
)
]
inputs[0].set_data_from_numpy(input_data)
triton_client.async_stream_infer(model_name=model_name, inputs=inputs)
result = user_data._completed_requests.get()
if i == 1 or i == 3:
# execute_grpc_error intentionally returns an error on every 2nd request
self.assertIsInstance(result, InferenceServerException)
# Existing behaviour: without triton_grpc_error the exception carries no gRPC status
self.assertEqual(str(result.status()), "None")
elif i == 0 or i == 2:
# Stream is not killed
output_data = result.as_numpy("OUT")
self.assertIsNotNone(output_data, "error: expected 'OUT'")


if __name__ == "__main__":
unittest.main()
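
As discussed in the review thread above, the mode is opt-in per stream: the client sends a triton_grpc_error header when the stream is opened, and once the core reports an error the server closes the stream with a real gRPC status code instead of an in-band error with no status. A minimal client sketch of that flow (illustrative only; it assumes a server listening on localhost:8001 and reuses the execute_grpc_error test model added in this PR):

import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException, np_to_triton_dtype

results = queue.Queue()

def callback(result, error):
    # The streaming client delivers either a result or an error per response.
    results.put(error if error is not None else result)

client = grpcclient.InferenceServerClient("localhost:8001")
# Opt in to triton_grpc_error mode for this stream only.
client.start_stream(callback=callback, headers={"triton_grpc_error": "true"})

data = np.random.randn(2, 2).astype(np.float32)
inputs = [grpcclient.InferInput("IN", data.shape, np_to_triton_dtype(data.dtype))]
inputs[0].set_data_from_numpy(data)
client.async_stream_infer(model_name="execute_grpc_error", inputs=inputs)

response = results.get()
if isinstance(response, InferenceServerException):
    # With the header set, errors arrive as a gRPC status (e.g. StatusCode.INTERNAL)
    # and the stream is closed by the server.
    print("stream error:", response.status(), response.message())
else:
    print("output:", response.as_numpy("OUT"))

client.stop_stream()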
8 changes: 8 additions & 0 deletions qa/L0_backend_python/lifecycle/test.sh
@@ -52,6 +52,14 @@ cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/
sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \
echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt)

mkdir -p models/execute_grpc_error/1/
cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/
cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/
(cd models/execute_grpc_error && \
sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \
sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \
echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt)

mkdir -p models/execute_return_error/1/
cp ../../python_models/execute_return_error/model.py ./models/execute_return_error/1/
cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/
51 changes: 51 additions & 0 deletions qa/python_models/execute_grpc_error/config.pbtxt
@@ -0,0 +1,51 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

Member:
Is it possible to move the model to the L0_backend_python/lifecycle folder? I think it might be easier this way to track which test the models belong to.

indrajit96 (Contributor, Author):
We don't have any models or a models folder in
https://github.com/triton-inference-server/server/tree/main/qa/L0_backend_python/lifecycle
In test.sh we copy over from python_models and create the models folder with versions inside.
I have kept this new model alongside the existing models used in L0_backend_python/lifecycle.

Contributor:

you can always create a models subfolder under L0_* tests

Member:
thanks Olga, that's what I meant.

indrajit96 (Contributor, Author):
In the existing test.sh we run rm -fr *.log ./models before we start the test:

rm -fr *.log ./models

For me to add models to L0_backend_python/lifecycle I would need to remove this.
We clean up the models folder before every test, so I would need to remove every place where it is deleted.

Not sure I should change the existing design; it might impact other tests too.

Contributor:
Name the folder differently and copy from it into models?

indrajit96 (Contributor, Author):
@tanmayv25 suggested I move all the changes from L0_lifecycle to a new L0_ test dedicated to this feature going forward.
The reason I made the changes here is that L0_lifecycle/models already had models that return errors programmatically, which let me reuse them.
Will resolve this comment along with the original comment by @tanmayv25 in a new PR after the cherry-pick.
Keeping it unresolved for now; will mark it resolved after the new PR.

#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

backend: "python"
max_batch_size: 64

input [
{
name: "IN"
data_type: TYPE_FP32
dims: [ -1 ]
}
]

output [
{
name: "OUT"
data_type: TYPE_FP32
dims: [ -1 ]
}
]

instance_group [
{
count: 1
kind : KIND_CPU
}
]
52 changes: 52 additions & 0 deletions qa/python_models/execute_grpc_error/model.py
@@ -0,0 +1,52 @@
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
def __init__(self):
# Maintain a running inference count so that every 2nd request returns an error, simulating a model failure
self.inf_count = 1

def execute(self, requests):
"""This function is called on inference request."""
responses = []

# Return an error for every even-numbered request
for request in requests:
input_tensor = pb_utils.get_input_tensor_by_name(request, "IN")
out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy())
if self.inf_count % 2:
# Every odd request succeeds
responses.append(pb_utils.InferenceResponse([out_tensor]))
else:
# Every even request fails
error = pb_utils.TritonError("An error occurred during execution")
responses.append(pb_utils.InferenceResponse([out_tensor], error))
self.inf_count += 1

return responses
11 changes: 11 additions & 0 deletions src/grpc/grpc_utils.h
@@ -76,6 +76,17 @@ typedef enum {
PARTIAL_COMPLETION
} Steps;

typedef enum {
// No error from CORE seen yet
NONE,
// Error from CORE encountered, waiting to be picked up by completion queue to
// initiate cancellation
ERROR_WAITING,
// Error from CORE encountered, stream closed
// This state is added to avoid double cancellation
ERROR_CANCELED
} Triton_grpc_error_steps;

// Debugging helper
std::ostream& operator<<(std::ostream& out, const Steps& step);

3 changes: 2 additions & 1 deletion src/grpc/infer_handler.cc
@@ -720,7 +720,8 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok)
// single thread scenario.
StartNewRequest();
}
bool resume = state->context_->HandleCancellation(state, rpc_ok, Name());
bool resume = state->context_->HandleCancellation(
state, rpc_ok, Name(), false /* is_triton_grpc_error */);
return resume;
}

87 changes: 79 additions & 8 deletions src/grpc/infer_handler.h
@@ -642,7 +642,8 @@ class InferHandlerState {
::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0)
: cq_(cq), unique_id_(unique_id), ongoing_requests_(0),
step_(Steps::START), finish_ok_(true), ongoing_write_(false),
received_notification_(false)
received_notification_(false), triton_grpc_error_(false),
grpc_stream_error_state_(Triton_grpc_error_steps::NONE)
{
ctx_.reset(new ::grpc::ServerContext());
responder_.reset(new ServerResponderType(ctx_.get()));
@@ -664,11 +665,56 @@

bool ReceivedNotification() { return received_notification_; }

// Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed.
bool IsGRPCError()
{
if (grpc_stream_error_state_ == Triton_grpc_error_steps::ERROR_WAITING) {
// Change the state to ERROR_CANCELED as we have called
// HandleCancellation
grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED;
return true;
}
return false;
}

bool IsCancelled()
{
return received_notification_ ? ctx_->IsCancelled() : false;
return received_notification_ ? (ctx_->IsCancelled() || IsGRPCError())
: false;
}

// Extracts headers from GRPC request and updates state
void ExtractStateFromHeaders(InferHandlerStateType* state)
{
const auto& metadata = state->context_->ctx_->client_metadata();
for (const auto& pair : metadata) {
auto& key = pair.first;
auto& value = pair.second;
std::string param_key = std::string(key.begin(), key.end());
std::string value_key = std::string(value.begin(), value.end());
std::string triton_grpc_error_key = "triton_grpc_error";
if (param_key == triton_grpc_error_key) {
if (value_key == "true") {
LOG_VERBOSE(2)
<< "GRPC: triton_grpc_error mode detected in new grpc stream";
state->context_->triton_grpc_error_ = true;
}
}
}
}

void SendGRPCStrictResponse(InferHandlerStateType* state)
{
std::lock_guard<std::recursive_mutex> lock(state->context_->mu_);
// Respond only if an error has not already been sent for this stream,
// to avoid closing the connection twice on multiple errors from core
if (!state->context_->IsGRPCStrictError()) {
state->step_ = Steps::COMPLETE;
state->context_->responder_->Finish(state->status_, state);
// Mark error for this stream
state->context_->MarkGRPCStrictError();
}
}
// Increments the ongoing request counter
void IncrementRequestCounter() { ongoing_requests_++; }

@@ -746,7 +792,7 @@

// Issues the cancellation for all inflight requests
// being tracked by this context.
void IssueRequestCancellation()
void IssueRequestCancellation(bool is_triton_grpc_error)
{
{
std::lock_guard<std::recursive_mutex> lock(mu_);
@@ -779,6 +825,7 @@
// The RPC is complete and no callback will be invoked to retrieve
// the object. Hence, need to explicitly place the state on the
// completion queue.
// Check for write readiness
PutTaskBackToQueue(state);
}
}
@@ -791,9 +838,11 @@
// Returns whether or not to continue cycling through the gRPC
// completion queue or not.
bool HandleCancellation(
InferHandlerStateType* state, bool rpc_ok, const std::string& name)
InferHandlerStateType* state, bool rpc_ok, const std::string& name,
bool is_triton_grpc_error)
{
if (!IsCancelled()) {
// Check to avoid early exit in case of triton_grpc_error
if (!IsCancelled() && !(is_triton_grpc_error)) {
LOG_ERROR
<< "[INTERNAL] HandleCancellation called even when the context was "
"not cancelled for "
@@ -813,10 +862,9 @@
// issue cancellation request to all the inflight
// states belonging to the context.
if (state->context_->step_ != Steps::CANCELLED) {
IssueRequestCancellation();
IssueRequestCancellation(is_triton_grpc_error);
// Mark the context as cancelled
state->context_->step_ = Steps::CANCELLED;

// The state returns true because the CancelExecution
// call above would have raised alarm objects on all
// pending inflight states objects. This state will
@@ -941,6 +989,21 @@
return false;
}

// Marks error after it has been responded to
void MarkGRPCStrictError()
{
grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_WAITING;
}

// Checks if error already responded to in triton_grpc_error mode
bool IsGRPCStrictError()
{
if (grpc_stream_error_state_ == Triton_grpc_error_steps::NONE) {
return false;
}
return true;
}

// Return true if this context has completed all reads and writes.
bool IsRequestsCompleted()
{
@@ -999,6 +1062,15 @@
// Tracks whether the async notification has been delivered by
// completion queue.
bool received_notification_;

// True if set by user via header
// Can be accessed without a lock, as set only once in startstream
std::atomic<bool> triton_grpc_error_;

// Tracks whether the stream has already encountered an error and closed the
// connection (one of Triton_grpc_error_steps). Maintained to avoid writes on a
// closed stream. Need to acquire the lock before access.
int grpc_stream_error_state_;
};

// This constructor is used to build a wrapper state object
@@ -1090,7 +1162,6 @@

void MarkAsAsyncNotifyState() { async_notify_state_ = true; }
bool IsAsyncNotifyState() { return async_notify_state_; }

// Needed in the response handle for classification outputs.
TRITONSERVER_Server* tritonserver_;

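
Taken together, ExtractStateFromHeaders, the Triton_grpc_error_steps enum, SendGRPCStrictResponse, and HandleCancellation implement a small per-stream state machine: the first error from the core finishes the responder with a gRPC status and moves the stream out of NONE; when the completion queue later observes the stream, the pending error is reported exactly once and the state advances to ERROR_CANCELED so inflight requests are cancelled without a second Finish. A toy Python sketch of those transitions (illustrative only; it mirrors the names in the C++ above but is not part of the server code, and it collapses MarkGRPCStrictError/IsGRPCStrictError into one helper):

from enum import Enum, auto

class GrpcErrorStep(Enum):
    # Mirrors Triton_grpc_error_steps in src/grpc/grpc_utils.h
    NONE = auto()            # no error from core seen yet
    ERROR_WAITING = auto()   # error sent to the client, waiting for the completion queue
    ERROR_CANCELED = auto()  # cancellation issued, stream already closed

class StreamContext:
    def __init__(self, triton_grpc_error_enabled):
        self.triton_grpc_error = triton_grpc_error_enabled
        self.state = GrpcErrorStep.NONE
        self.finished = False

    def send_grpc_strict_response(self, status):
        # Analogue of SendGRPCStrictResponse: only the first error closes the stream.
        if self.triton_grpc_error and self.state == GrpcErrorStep.NONE:
            self.finished = True  # stands in for responder_->Finish(status, state)
            self.state = GrpcErrorStep.ERROR_WAITING

    def is_grpc_error(self):
        # Analogue of IsGRPCError: report the pending error exactly once and
        # advance to ERROR_CANCELED so cancellation is not issued twice.
        if self.state == GrpcErrorStep.ERROR_WAITING:
            self.state = GrpcErrorStep.ERROR_CANCELED
            return True
        return False

# Two back-to-back core errors on the same stream close it only once.
ctx = StreamContext(triton_grpc_error_enabled=True)
ctx.send_grpc_strict_response("INTERNAL")   # closes the stream
ctx.send_grpc_strict_response("INTERNAL")   # ignored, stream already closed
assert ctx.is_grpc_error() is True          # completion queue triggers cancellation once
assert ctx.is_grpc_error() is False         # subsequent checks are no-ops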