diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md
index 8a1c004bf9..855a5ffbab 100644
--- a/docs/user_guide/metrics.md
+++ b/docs/user_guide/metrics.md
@@ -97,6 +97,71 @@ Count*. The count metrics are illustrated by the following examples:
 | |Failure Count |`nv_inference_request_failure` |Number of failed inference requests received by Triton (each request is counted as 1, even if the request contains a batch) |Per model |Per request |
 | |Inference Count |`nv_inference_count` |Number of inferences performed (a batch of "n" is counted as "n" inferences, does not include cached requests)|Per model|Per request|
 | |Execution Count |`nv_inference_exec_count` |Number of inference batch executions (see [Inference Request Metrics](#inference-request-metrics), does not include cached requests)|Per model|Per request|
+| |Pending Request Count |`nv_inference_pending_request_count` |Number of inference requests awaiting execution by a backend. This count is incremented when a request is enqueued to the server (`TRITONSERVER_ServerInferAsync`) and is decremented when a backend is about to start executing the request. More details can be found below. |Per model|Per request|
+
+#### Pending Request Count (Queue Size) Per-Model
+
+The *Pending Request Count* reflects the number of requests that have been
+received by Triton core via `TRITONSERVER_ServerInferAsync` but have not yet
+started execution by a backend model instance
+(`TRITONBACKEND_ModelInstanceExecute`).
+
+For all intents and purposes, the per-model "pending request count" and
+"queue size" can be used interchangeably, and the number reflected in the
+metric should intuitively represent the number of requests that are not
+currently being executed by any model instance. In simple terms, if you send
+100 requests to a model that can only handle 5 requests concurrently, then
+you should see a pending count of 95 for that model in most cases.
+
+For those interested in more technical details, the term "pending request count"
+is a bit more accurate than "queue size" because Triton is highly configurable,
+and there are many places in Triton where a request can be considered pending,
+rather than a single queue. Some of the most common are called out below:
+- Default Scheduler backlogs any requests not currently executing.
+  - Assume 1 available model instance with default scheduler settings, and
+    10 requests sent in rapid succession.
+  - The 1st request should be picked up for execution immediately, and the
+    remaining 9 requests should be considered pending for this model until the
+    1st request is finished. Afterwards, the next request should be picked up
+    and the pending count should be decremented to 8, and so on until all
+    requests are finished and the pending count is 0.
+- Dynamic Batcher queue for dynamically creating batches from requests.
+  - Assume 1 available model instance with the dynamic batch scheduler
+    configured with `max_batch_size: 4` and a sufficiently large
+    `max_queue_delay_microseconds` (or queue of requests), and 10 requests
+    sent in rapid succession.
+  - The first 4 requests, or as large a batch as the scheduler could form,
+    should be picked up for execution immediately, and the remaining 6 requests
+    should be considered pending. After that batch finishes, the next batch
+    should be picked up, decrementing the pending count to 2. Finally, since
+    only 2 requests remain, they will be batched and picked up by the backend,
+    decrementing the pending count to 0.
+- Sequence Batcher queues and backlogs for ongoing sequence requests; some may
+  be assigned sequence slots and some may not.
+  - Sequence Batchers of both strategies (direct and oldest) will have pending
+    counts that generally follow the same trend as the dynamic batching
+    description above. The sequence batcher will immediately execute as many
+    requests in a batch as it can based on the model/scheduler config settings,
+    and any further requests will be considered pending until the previous batch
+    finishes and the next batch can start.
+- Rate Limiter queues for prepared batches of requests.
+  - When rate limiting is enabled, requests can be held back from execution
+    to satisfy the rate limit constraints that were configured.
+
+There are some places where a request would not be considered pending:
+- Ensemble Scheduler
+  - The Ensemble Scheduler almost immediately enqueues any requests it receives
+    into the composing model schedulers at the first step in the ensemble.
+    Therefore, the requests may be considered pending by the composing models'
+    schedulers; however, from the ensemble's perspective, these requests have
+    been scheduled.
+- Frontends (HTTP/GRPC Servers)
+  - Any requests sent from a client to a frontend server in front of Triton
+    may spend some time in the corresponding server's code mapping
+    protocol-specific metadata to Triton metadata. Though this time is
+    generally brief, the request will not be considered pending from Triton's
+    perspective until Triton core has received it from the frontend.
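+
+For a quick sanity check, the pending request count can be read directly from
+Triton's metrics endpoint. The snippet below is a minimal sketch that assumes
+the default metrics port (`8002`) and a hypothetical model named `my_model`:
+
+```python
+import requests
+
+# Scrape the Prometheus-format metrics text (default metrics port is 8002).
+metrics = requests.get("http://localhost:8002/metrics").text
+
+# Print the pending request count reported for each model/version, e.g.:
+#   nv_inference_pending_request_count{model="my_model",version="1"} 95
+for line in metrics.splitlines():
+    if line.startswith("nv_inference_pending_request_count"):
+        print(line)
+```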
 
 ### Latencies
 
diff --git a/qa/L0_metrics/ensemble_delay/config.pbtxt b/qa/L0_metrics/ensemble_delay/config.pbtxt
new file mode 100644
index 0000000000..0eaa2f76f7
--- /dev/null
+++ b/qa/L0_metrics/ensemble_delay/config.pbtxt
@@ -0,0 +1,67 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +platform: "ensemble" +max_batch_size: 4 + +input [ + { + name: "ENSEMBLE_INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "ENSEMBLE_OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + }, + { + name: "ENSEMBLE_OUTPUT1" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +ensemble_scheduling +{ + step [ + { + model_name: "dynamic_composing" + model_version: -1 + input_map { key: "INPUT0", value: "ENSEMBLE_INPUT0" } + output_map { key: "OUTPUT0", value: "ENSEMBLE_OUTPUT0" } + }, + { + model_name: "default_composing" + model_version: -1 + input_map { key: "INPUT0", value: "ENSEMBLE_INPUT0" } + output_map { key: "OUTPUT0", value: "ENSEMBLE_OUTPUT1" } + } + ] +} diff --git a/qa/L0_metrics/identity_delay/config.pbtxt b/qa/L0_metrics/identity_delay/config.pbtxt new file mode 100644 index 0000000000..1062868c2b --- /dev/null +++ b/qa/L0_metrics/identity_delay/config.pbtxt @@ -0,0 +1,58 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +backend: "identity" +max_batch_size: 4 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_FP32 + dims: [ 1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] + +parameters [ + { + key: "execute_delay_ms" + value: { string_value: "2000" } + } +] diff --git a/qa/L0_metrics/metrics_test.py b/qa/L0_metrics/metrics_config_test.py similarity index 99% rename from qa/L0_metrics/metrics_test.py rename to qa/L0_metrics/metrics_config_test.py index 13efdb0d10..a1324ac28e 100755 --- a/qa/L0_metrics/metrics_test.py +++ b/qa/L0_metrics/metrics_config_test.py @@ -58,7 +58,7 @@ CACHE_SUMMARY_PATTERNS = ["nv_cache_hit_summary", "nv_cache_miss_summary"] -class MetricsTest(tu.TestResultCollector): +class MetricsConfigTest(tu.TestResultCollector): def _get_metrics(self): metrics_url = "http://localhost:8002/metrics" r = requests.get(metrics_url) diff --git a/qa/L0_metrics/metrics_queue_size_test.py b/qa/L0_metrics/metrics_queue_size_test.py new file mode 100755 index 0000000000..18a601d01b --- /dev/null +++ b/qa/L0_metrics/metrics_queue_size_test.py @@ -0,0 +1,268 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import sys + +sys.path.append("../common") + +import math +import time +import unittest +from functools import partial + +import numpy as np +import requests +import test_util as tu +import tritonclient.http +from tritonclient.utils import triton_to_np_dtype + +QUEUE_METRIC_TEMPLATE = ( + 'nv_inference_pending_request_count{{model="{model_name}",version="1"}}' +) +INFER_METRIC_TEMPLATE = 'nv_inference_count{{model="{model_name}",version="1"}}' +EXEC_METRIC_TEMPLATE = 'nv_inference_exec_count{{model="{model_name}",version="1"}}' + + +class MetricsPendingRequestCountTest(tu.TestResultCollector): + def setUp(self): + self.metrics = None + self.metrics_url = "http://localhost:8002/metrics" + self.server_url = "localhost:8000" + + # Used to verify model config is set to expected values + self.max_batch_size = 4 + self.delay_ms = 2000 + self.delay_sec = self.delay_ms // 1000 + + # Setup dummy inputs + dtype = "FP32" + shape = (1, 1) + input_np = np.ones(shape, dtype=triton_to_np_dtype(dtype)) + self.inputs = [ + tritonclient.http.InferInput("INPUT0", shape, dtype).set_data_from_numpy( + input_np + ) + ] + self.ensemble_inputs = [ + tritonclient.http.InferInput( + "ENSEMBLE_INPUT0", shape, dtype + ).set_data_from_numpy(input_np) + ] + + # Verify values for filling request queues + self.num_requests = 10 + self.concurrency = 10 + # Concurrency must be at least as high as number of async requests we intend + # to send N requests to fill request queues before blocking on any results. + self.assertGreaterEqual(self.concurrency, self.num_requests) + self.client = tritonclient.http.InferenceServerClient( + url=self.server_url, concurrency=self.concurrency + ) + + def _validate_model_config(self, model_name): + config = self.client.get_model_config(model_name) + print(config) + params = config.get("parameters", {}) + delay_ms = int(params.get("execute_delay_ms", {}).get("string_value")) + max_batch_size = config.get("max_batch_size") + self.assertEqual(delay_ms, self.delay_ms) + self.assertEqual(max_batch_size, self.max_batch_size) + return config + + def _get_metrics(self): + r = requests.get(self.metrics_url) + r.raise_for_status() + return r.text + + def _get_metric_line(self, metric, metrics): + for line in metrics.splitlines(): + if metric in line: + return line + return None + + def _get_metric_value(self, metric): + metrics = self._get_metrics() + self.assertIn(metric, metrics) + line = self._get_metric_line(metric, metrics) + print(line) + if not line: + return None + value = line.split()[1] + return float(value) + + def _assert_metric_equals(self, metric, expected_value): + value = self._get_metric_value(metric) + self.assertEqual(value, expected_value) + + def _assert_metric_greater_than(self, metric, gt_value): + value = self._get_metric_value(metric) + self.assertGreater(value, gt_value) + + def _send_async_requests(self, model_name, inputs, futures): + for _ in range(self.num_requests): + futures.append(self.client.async_infer(model_name, inputs)) + + def _send_async_requests_sequence(self, num_seq_slots, model_name, inputs, futures): + started_seqs = {} + num_sent = 0 + while num_sent < self.num_requests: + # Add requests to each sequence slot round-robin, seq_id must be > 0 + # We don't care about finishing any sequences, just need to queue up + # requests for each sequence until num_requests is hit. 
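+            # For example, with num_seq_slots=4, requests 0..9 are assigned
+            # sequence IDs 1, 2, 3, 4, 1, 2, 3, 4, 1, 2 (round-robin).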
+ seq_id = (num_sent % num_seq_slots) + 1 + # Toggle start flag to False after first request per sequence ID + start = True if seq_id not in started_seqs else False + started_seqs[seq_id] = True + futures.append( + self.client.async_infer( + model_name, + inputs, + request_id=str(num_sent), + sequence_id=seq_id, + sequence_start=start, + ) + ) + num_sent += 1 + + def _test_helper(self, model_name, batch_size, send_requests_func): + self._validate_model_config(model_name) + + queue_size = QUEUE_METRIC_TEMPLATE.format(model_name=model_name) + infer_count = INFER_METRIC_TEMPLATE.format(model_name=model_name) + exec_count = EXEC_METRIC_TEMPLATE.format(model_name=model_name) + # Metric should be zero before sending any requests + self._assert_metric_equals(queue_size, 0) + # Send N requests, letting scheduler delay queue fill up when applicable + futures = [] + send_requests_func(model_name, self.inputs, futures) + # Give Triton a second to load all requests into queues + time.sleep(1) + + starting_queue_size = self.num_requests - batch_size + # Start from (num_requests-batch_size) because 1 batch should be executing, + # and the rest of the requests should be queued. + for expected_queue_size in range(starting_queue_size, 0, -1 * batch_size): + self._assert_metric_equals(queue_size, expected_queue_size) + time.sleep(self.delay_sec) + # Queue should be empty now + self._assert_metric_equals(queue_size, 0) + # Let final batch finish + time.sleep(self.delay_sec) + + # All requests should've been executed without any batching + self._assert_metric_equals(infer_count, self.num_requests) + expected_exec_count = math.ceil(self.num_requests / batch_size) + self._assert_metric_equals(exec_count, expected_exec_count) + + # Verify no inference exceptions were raised + for future in futures: + future.get_result() + + def test_default_scheduler(self): + model_name = "default" + # Default scheduler won't do any batching + batch_size = 1 + self._test_helper(model_name, batch_size, self._send_async_requests) + + def test_dynamic_batch_scheduler(self): + model_name = "dynamic" + # With sufficient queue delay set, we expect full batches to be executed + batch_size = self.max_batch_size + self._test_helper(model_name, batch_size, self._send_async_requests) + + def test_sequence_batch_scheduler_direct(self): + model_name = "sequence_direct" + # With sufficient queue delay and minimum_slot_utilization set, we + # expect full batches to be executed. 
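+        # Each sequence occupies one batch slot at a time, so filling all
+        # sequence slots yields batches of size max_batch_size.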
+ batch_size = self.max_batch_size + num_seq_slots = batch_size + send_requests_func = partial(self._send_async_requests_sequence, num_seq_slots) + self._test_helper(model_name, batch_size, send_requests_func) + + def test_sequence_batch_scheduler_oldest(self): + model_name = "sequence_oldest" + # With sufficient queue delay set, we expect full batches to be executed + batch_size = self.max_batch_size + num_seq_slots = batch_size + send_requests_func = partial(self._send_async_requests_sequence, num_seq_slots) + self._test_helper(model_name, batch_size, send_requests_func) + + def test_ensemble_scheduler(self): + ensemble_model_name = "ensemble" + composing_model_names = ["dynamic_composing", "default_composing"] + ensemble_queue_size = QUEUE_METRIC_TEMPLATE.format( + model_name=ensemble_model_name + ) + composing_queue_sizes = [ + QUEUE_METRIC_TEMPLATE.format(model_name=name) + for name in composing_model_names + ] + ensemble_infer_count = INFER_METRIC_TEMPLATE.format( + model_name=ensemble_model_name + ) + composing_infer_counts = [ + INFER_METRIC_TEMPLATE.format(model_name=name) + for name in composing_model_names + ] + + # Metric should be zero before sending any requests + self._assert_metric_equals(ensemble_queue_size, 0) + for queue_size in composing_queue_sizes: + self._assert_metric_equals(queue_size, 0) + # Send some ensemble requests + futures = [] + self._send_async_requests(ensemble_model_name, self.ensemble_inputs, futures) + # Give Triton time to pass some requests to composing models. This test + # is less comprehensive on checking exact queue values, and just verifies + # each composing queue gets filled and ensemble's queue is empty. + time.sleep(1) + + # Top-level ensemble size should still be zero, as all pending requests should + # be scheduled and reflected in composing models, and not considered "pending" at ensemble level. + self._assert_metric_equals(ensemble_queue_size, 0) + # Composing models should be non-zero + for queue_size in composing_queue_sizes: + self._assert_metric_greater_than(queue_size, 0) + + # Verify no inference exceptions were raised and let composing models + # finish their requests + for future in futures: + future.get_result() + + # Check that all queues are empty after getting results + self._assert_metric_equals(ensemble_queue_size, 0) + for queue_size in composing_queue_sizes: + self._assert_metric_equals(queue_size, 0) + + # Sanity check infer counts on ensemble and composing models + self._assert_metric_equals(ensemble_infer_count, self.num_requests) + for infer_count in composing_infer_counts: + self._assert_metric_equals(infer_count, self.num_requests) + + +if __name__ == "__main__": + unittest.main() diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh index d0cf0193ae..9b0f634b82 100755 --- a/qa/L0_metrics/test.sh +++ b/qa/L0_metrics/test.sh @@ -55,7 +55,7 @@ function check_unit_test() { echo -e "\n***\n*** Test Failed\n***" RET=1 else - EXPECTED_NUM_TESTS=1 + EXPECTED_NUM_TESTS="${1:-1}" check_test_results ${TEST_RESULT_FILE} ${EXPECTED_NUM_TESTS} if [ $? 
-ne 0 ]; then cat $CLIENT_LOG @@ -77,6 +77,12 @@ function run_and_check_server() { rm -f $SERVER_LOG RET=0 +if [ `ps | grep -c "tritonserver"` != "0" ]; then + echo -e "Tritonserver already running" + echo -e `ps | grep tritonserver` + exit 1 +fi + ### UNIT TESTS TEST_LOG="./metrics_api_test.log" @@ -221,18 +227,18 @@ MODELDIR="${PWD}/unit_test_models" mkdir -p "${MODELDIR}/identity_cache_on/1" mkdir -p "${MODELDIR}/identity_cache_off/1" BASE_SERVER_ARGS="--model-repository=${MODELDIR} --model-control-mode=explicit" -PYTHON_TEST="metrics_test.py" +PYTHON_TEST="metrics_config_test.py" # Check default settings: Counters should be enabled, summaries should be disabled SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off" run_and_check_server -python3 ${PYTHON_TEST} MetricsTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_inf_summaries_missing 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test kill $SERVER_PID wait $SERVER_PID @@ -240,13 +246,13 @@ wait $SERVER_PID # Enable summaries, counters still enabled by default SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off --metrics-config summary_latencies=true" run_and_check_server -python3 ${PYTHON_TEST} MetricsTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_inf_summaries_exist 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_exist 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test kill $SERVER_PID wait $SERVER_PID @@ -254,13 +260,13 @@ wait $SERVER_PID # Enable summaries, disable counters SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off --metrics-config summary_latencies=true --metrics-config counter_latencies=false" run_and_check_server -python3 ${PYTHON_TEST} MetricsTest.test_inf_counters_missing 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_inf_summaries_exist 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_exist 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} 
+python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_missing 2>&1 | tee ${CLIENT_LOG} check_unit_test kill $SERVER_PID wait $SERVER_PID @@ -269,15 +275,15 @@ wait $SERVER_PID CACHE_ARGS="--cache-config local,size=1048576" SERVER_ARGS="${BASE_SERVER_ARGS} ${CACHE_ARGS} --load-model=identity_cache_on --metrics-config summary_latencies=true --metrics-config counter_latencies=true" run_and_check_server -python3 ${PYTHON_TEST} MetricsTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_counters_exist 2>&1 | tee ${CLIENT_LOG} check_unit_test # DLIS-4762: Asserts that request summary is not published when cache is # enabled for a model, until this if fixed. -python3 ${PYTHON_TEST} MetricsTest.test_inf_summaries_exist_with_cache 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_inf_summaries_exist_with_cache 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_counters_exist 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_counters_exist 2>&1 | tee ${CLIENT_LOG} check_unit_test -python3 ${PYTHON_TEST} MetricsTest.test_cache_summaries_exist 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_cache_summaries_exist 2>&1 | tee ${CLIENT_LOG} check_unit_test kill $SERVER_PID wait $SERVER_PID @@ -286,11 +292,64 @@ wait $SERVER_PID export SUMMARY_QUANTILES="0.1:0.0.1,0.7:0.01,0.75:0.01" SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off --metrics-config summary_latencies=true --metrics-config summary_quantiles=${SUMMARY_QUANTILES}" run_and_check_server -python3 ${PYTHON_TEST} MetricsTest.test_summaries_custom_quantiles 2>&1 | tee ${CLIENT_LOG} +python3 ${PYTHON_TEST} MetricsConfigTest.test_summaries_custom_quantiles 2>&1 | tee ${CLIENT_LOG} check_unit_test kill $SERVER_PID wait $SERVER_PID +### Pending Request Count (Queue Size) Metric Behavioral Tests ### +MODELDIR="${PWD}/queue_size_models" +SERVER_ARGS="--model-repository=${MODELDIR} --log-verbose=1" +PYTHON_TEST="metrics_queue_size_test.py" +rm -rf "${MODELDIR}" +mkdir -p "${MODELDIR}" + +# Re-use an identity model that sleeps during execution for N seconds for the +# batch of requests. Then we can confirm queue size behaviors for various +# scheduling/batching strategies. +BASE_MODEL="identity_delay" +# Don't use special debug env var for this, just set sufficient parameters for +# each scheduler to let them fill batches when possible. 
+unset TRITONSERVER_DELAY_SCHEDULER +export MAX_BATCH_SIZE=4 +# Delay up to 100ms to form batches up to MAX_BATCH_SIZE +export MAX_QUEUE_DELAY_US=100000 + +# Create a model per scheduler type +DEFAULT_MODEL="${MODELDIR}/default" +cp -r "${BASE_MODEL}" "${DEFAULT_MODEL}" +mkdir -p "${DEFAULT_MODEL}/1" +sed -i "s/^max_batch_size.*/max_batch_size: ${MAX_BATCH_SIZE}/" "${DEFAULT_MODEL}/config.pbtxt" + +DYNAMIC_MODEL="${MODELDIR}/dynamic" +cp -r "${DEFAULT_MODEL}" "${DYNAMIC_MODEL}" +echo -e "\ndynamic_batching { max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_US} }\n" >> "${DYNAMIC_MODEL}/config.pbtxt" + +SEQUENCE_DIRECT_MODEL="${MODELDIR}/sequence_direct" +cp -r "${DEFAULT_MODEL}" "${SEQUENCE_DIRECT_MODEL}" +echo -e "\nsequence_batching { direct { max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_US}, minimum_slot_utilization: 1.0 } }\n" >> "${SEQUENCE_DIRECT_MODEL}/config.pbtxt" + +SEQUENCE_OLDEST_MODEL="${MODELDIR}/sequence_oldest" +cp -r "${DEFAULT_MODEL}" "${SEQUENCE_OLDEST_MODEL}" +echo -e "\nsequence_batching { oldest { max_queue_delay_microseconds: ${MAX_QUEUE_DELAY_US}, max_candidate_sequences: ${MAX_BATCH_SIZE} } }\n" >> "${SEQUENCE_OLDEST_MODEL}/config.pbtxt" + +BASE_ENSEMBLE="ensemble_delay" +ENSEMBLE_MODEL="${MODELDIR}/ensemble" +cp -r "${BASE_ENSEMBLE}" "${ENSEMBLE_MODEL}" +mkdir -p "${ENSEMBLE_MODEL}/1" +# Use uniquely named composing models to avoid clashing +# metric values with individual and ensemble tests. +cp -r "${DEFAULT_MODEL}" "${MODELDIR}/default_composing" +cp -r "${DYNAMIC_MODEL}" "${MODELDIR}/dynamic_composing" + + +run_and_check_server +python3 ${PYTHON_TEST} 2>&1 | tee ${CLIENT_LOG} +kill $SERVER_PID +wait $SERVER_PID +expected_tests=5 +check_unit_test "${expected_tests}" + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else