[Serve] Group DeploymentHandle autoscaling metrics pushes by process #45957

Draft · wants to merge 9 commits into base: master
68 changes: 21 additions & 47 deletions python/ray/serve/_private/autoscaling_state.py
@@ -25,6 +25,9 @@ class HandleMetricReport:
"""Report from a deployment handle on queued and ongoing requests.

Args:
deployment_id: The identifier for the deployment that the
handle targets.
handle_id: The unique identifier for the deployment handle.
actor_id: If the deployment handle (from which this metric was
sent) lives on an actor, the actor ID of that actor.
handle_source: Describes what kind of entity holds this
@@ -39,6 +42,8 @@ class HandleMetricReport:
timestamp: The time at which this report was received.
"""

deployment_id: DeploymentID
handle_id: str
actor_id: Optional[str]
handle_source: DeploymentHandleSource
queued_requests: float
@@ -88,10 +93,10 @@ def __init__(self, deployment_id: DeploymentID):
# Map from handle ID to handle request metric report. Metrics
# are removed from this dict either when the actor on which the
# handle lived dies, or after a period of no updates.
self._handle_requests: Dict[str, HandleMetricReport] = dict()
self._handle_requests: Dict[str, HandleMetricReport] = {}
# Map from replica ID to replica request metric report. Metrics
# are removed from this dict when a replica is stopped.
self._replica_requests: Dict[ReplicaID, ReplicaMetricReport] = dict()
self._replica_requests: Dict[ReplicaID, ReplicaMetricReport] = {}

self._deployment_info = None
self._config = None
@@ -183,40 +188,22 @@ def record_request_metrics_for_replica(
if window_avg is None:
return

if (
replica_id not in self._replica_requests
or send_timestamp > self._replica_requests[replica_id].timestamp
):
previous_report = self._replica_requests.get(replica_id)

if previous_report is None or send_timestamp > previous_report.timestamp:
self._replica_requests[replica_id] = ReplicaMetricReport(
running_requests=window_avg,
timestamp=send_timestamp,
)

def record_request_metrics_for_handle(
self,
*,
handle_id: str,
actor_id: Optional[str],
handle_source: DeploymentHandleSource,
queued_requests: float,
running_requests: Dict[ReplicaID, float],
send_timestamp: float,
) -> None:
def record_request_metrics_for_handle(self, report: HandleMetricReport) -> None:
"""Records average number of queued and running requests at a handle for this
deployment.
"""
previous_report = self._handle_requests.get(report.handle_id)

if (
handle_id not in self._handle_requests
or send_timestamp > self._handle_requests[handle_id].timestamp
):
self._handle_requests[handle_id] = HandleMetricReport(
actor_id=actor_id,
handle_source=handle_source,
queued_requests=queued_requests,
running_requests=running_requests,
timestamp=send_timestamp,
)
if previous_report is None or report.timestamp > previous_report.timestamp:
self._handle_requests[report.handle_id] = report

def drop_stale_handle_metrics(self, alive_serve_actor_ids: Set[str]) -> None:
"""Drops handle metrics that are no longer valid.
@@ -394,28 +381,15 @@ def record_request_metrics_for_replica(
send_timestamp=send_timestamp,
)

def record_request_metrics_for_handle(
self,
*,
deployment_id: str,
handle_id: str,
actor_id: Optional[str],
handle_source: DeploymentHandleSource,
queued_requests: float,
running_requests: Dict[ReplicaID, float],
send_timestamp: float,
) -> None:
def record_request_metrics_for_handle(self, report: HandleMetricReport) -> None:
"""Update request metric for a specific handle."""

if deployment_id in self._autoscaling_states:
self._autoscaling_states[deployment_id].record_request_metrics_for_handle(
handle_id=handle_id,
actor_id=actor_id,
handle_source=handle_source,
queued_requests=queued_requests,
running_requests=running_requests,
send_timestamp=send_timestamp,
)
try:
autoscaling_state = self._autoscaling_states[report.deployment_id]
except KeyError:
return

autoscaling_state.record_request_metrics_for_handle(report)

def drop_stale_handle_metrics(self, alive_serve_actor_ids: Set[str]) -> None:
"""Drops handle metrics that are no longer valid.
39 changes: 16 additions & 23 deletions python/ray/serve/_private/controller.py
@@ -4,6 +4,7 @@
import os
import pickle
import time
from collections.abc import Sequence
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import ray
@@ -12,9 +13,11 @@
from ray._raylet import GcsClient
from ray.actor import ActorHandle
from ray.serve._private.application_state import ApplicationStateManager, StatusOverview
from ray.serve._private.autoscaling_state import AutoscalingStateManager
from ray.serve._private.autoscaling_state import (
AutoscalingStateManager,
HandleMetricReport,
)
from ray.serve._private.common import (
DeploymentHandleSource,
DeploymentID,
MultiplexedReplicaInfo,
NodeId,
@@ -260,29 +263,19 @@ def record_autoscaling_metrics(
replica_id, window_avg, send_timestamp
)

def record_handle_metrics(
self,
deployment_id: str,
handle_id: str,
actor_id: Optional[str],
handle_source: DeploymentHandleSource,
queued_requests: float,
running_requests: Dict[str, float],
send_timestamp: float,
):
def record_handle_metrics(self, report: HandleMetricReport) -> None:
logger.debug(
f"Received metrics from handle {handle_id} for deployment {deployment_id}: "
f"{queued_requests} queued requests and {running_requests} running requests"
)
self.autoscaling_state_manager.record_request_metrics_for_handle(
deployment_id=deployment_id,
handle_id=handle_id,
actor_id=actor_id,
handle_source=handle_source,
queued_requests=queued_requests,
running_requests=running_requests,
send_timestamp=send_timestamp,
f"Received metrics from handle {report.handle_id} "
f"for deployment {report.deployment_id}: "
f"{report.queued_requests} queued requests "
f"and {report.running_requests} running requests"
)
self.autoscaling_state_manager.record_request_metrics_for_handle(report)

def bulk_record_handle_metrics(self, reports: Sequence[HandleMetricReport]) -> None:
logger.debug(f"Received {len(reports)} bulk handle metrics reports")
for report in reports:
self.record_handle_metrics(report)

def _dump_autoscaling_metrics_for_testing(self):
return self.autoscaling_state_manager.get_metrics()
5 changes: 3 additions & 2 deletions python/ray/serve/_private/replica.py
@@ -231,12 +231,13 @@ def record_request_metrics(

def _push_autoscaling_metrics(self) -> Dict[str, Any]:
look_back_period = self._autoscaling_config.look_back_period_s
now = time.time()
self._controller_handle.record_autoscaling_metrics.remote(
replica_id=self._replica_id,
window_avg=self._metrics_store.window_average(
self._replica_id, time.time() - look_back_period
self._replica_id, now - look_back_period
),
send_timestamp=time.time(),
send_timestamp=now,
)

def _add_autoscaling_metrics_point(self) -> None:
79 changes: 63 additions & 16 deletions python/ray/serve/_private/router.py
@@ -3,15 +3,17 @@
import logging
import threading
import time
import weakref
from abc import ABC, abstractmethod
from collections import defaultdict
from contextlib import contextmanager
from functools import partial
from functools import lru_cache, partial
from typing import Any, Coroutine, DefaultDict, Dict, List, Optional, Tuple, Union

import ray
from ray.actor import ActorHandle
from ray.exceptions import ActorDiedError, ActorUnavailableError, RayError
from ray.serve._private.autoscaling_state import HandleMetricReport
from ray.serve._private.common import (
DeploymentHandleSource,
DeploymentID,
@@ -201,10 +203,12 @@ def update_deployment_config(
),
)
# Push metrics to the controller periodically.
self.metrics_pusher.register_or_update_task(
self.PUSH_METRICS_TO_CONTROLLER_TASK_NAME,
self.push_autoscaling_metrics_to_controller,
autoscaling_config.metrics_interval_s,

JoshKarpel (Contributor Author), Dec 18, 2024:
@zcin @edoakes I'm working on resurrecting this PR and I think this was the main sticking point - the shared metrics pusher wouldn't respect the autoscaling_config.metrics_interval_s in the current form of the PR. Can we deprecate that parameter? Would we need to feature-flag this PR so it can be introduced gradually? Or maybe we want to preserve it and have a different shared pusher for each metric_interval_s?

Contributor:
I am fine with making the metrics_interval_s a cluster-level option configured via env var. WDYT Cindy?

JoshKarpel (Contributor Author):
Ah! I like that option!

Contributor:
Yes I think making the metrics interval a cluster level option makes sense.

JoshKarpel (Contributor Author):
Sounds good - I will take that approach when I get back from the holidays

JoshKarpel (Contributor Author):
Oh, it turns out there already is an env var for this, HANDLE_METRIC_PUSH_INTERVAL_S:

# Handle metric push interval. (This interval will affect the cold start time period)
HANDLE_METRIC_PUSH_INTERVAL_S = float(
    os.environ.get("RAY_SERVE_HANDLE_METRIC_PUSH_INTERVAL_S", "10")
)

and I'm already using it (the shared pusher registers its push task with HANDLE_METRIC_PUSH_INTERVAL_S).

The autoscaling_config.metrics_interval_s is also used in a variety of other places, like for controlling how often metrics are recorded:

min(
    RAY_SERVE_HANDLE_AUTOSCALING_METRIC_RECORD_PERIOD_S,
    autoscaling_config.metrics_interval_s,
),

Should I remove all of those and just use the single env var, or keep the other uses and only "ignore" the setting here for the shared pusher?

I'm inclined to remove all of them for consistency, but that's a much bigger blast radius...

Contributor:
I think we should remove autoscaling_config.metrics_interval_s and just use RAY_SERVE_HANDLE_AUTOSCALING_METRIC_RECORD_PERIOD_S for recording metrics, because the interval at which we record metrics is usually more frequent than pushing metrics to the controller. And then we can use HANDLE_METRIC_PUSH_INTERVAL_S, but perhaps renaming it to include "autoscaling" would be better going forward, if it is to replace autoscaling_config.metrics_interval_s?
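
For concreteness, here is a minimal sketch of the split being discussed, assuming the env var names quoted above are kept; the record-period default and env var spelling below are placeholders for illustration, not values from this PR:

import os

# How often each handle samples queued/running requests locally.
# (Assumed env var name mirroring the constant quoted above; default is illustrative.)
RAY_SERVE_HANDLE_AUTOSCALING_METRIC_RECORD_PERIOD_S = float(
    os.environ.get("RAY_SERVE_HANDLE_AUTOSCALING_METRIC_RECORD_PERIOD_S", "0.5")
)

# How often the shared per-process pusher sends aggregated reports to the controller.
# (This env var exists today, per the snippet above.)
HANDLE_METRIC_PUSH_INTERVAL_S = float(
    os.environ.get("RAY_SERVE_HANDLE_METRIC_PUSH_INTERVAL_S", "10")
)

# Recording is expected to be more frequent than pushing: the record period keeps
# the local window fresh, while the push interval controls controller traffic.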

shared = SharedHandleMetricsPusher.get_or_create(
self._controller_handle
)
shared.register(self)
logger.info(
f"Registered {self._handle_id} with shared metrics pusher {shared}."
)
else:

JoshKarpel (Contributor Author):
I'm not sure I understand this else block - it seems like even if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE = False we still register the pushing task? It just won't have any data because the _add_autoscaling_metrics_point task isn't registered? 🤔

self.metrics_pusher.register_or_update_task(
@@ -249,21 +253,24 @@ def should_send_scaled_to_zero_optimized_push(self, curr_num_replicas: int) -> bool:
and self.num_queued_requests > 0
)

def push_autoscaling_metrics_to_controller(self):
"""Pushes queued and running request metrics to the controller.

These metrics are used by the controller for autoscaling.
"""

self._controller_handle.record_handle_metrics.remote(
send_timestamp=time.time(),
def metrics_report(self) -> HandleMetricReport:
return HandleMetricReport(
timestamp=time.time(),
deployment_id=self._deployment_id,
handle_id=self._handle_id,
actor_id=self._self_actor_id,
handle_source=self._handle_source,
**self._get_aggregated_requests(),
)

def push_autoscaling_metrics_to_controller(self):
"""Pushes queued and running request metrics to the controller.

These metrics are used by the controller for autoscaling.
"""

self._controller_handle.record_handle_metrics.remote(self.metrics_report())

def _add_autoscaling_metrics_point(self):
"""Adds metrics point for queued and running requests at replicas.

@@ -280,16 +287,17 @@ def _add_autoscaling_metrics_point(self):
)

# Prevent in memory metrics store memory from growing
start_timestamp = time.time() - self.autoscaling_config.look_back_period_s
start_timestamp = timestamp - self.autoscaling_config.look_back_period_s
self.metrics_store.prune_keys_and_compact_data(start_timestamp)

def _get_aggregated_requests(self):
running_requests = dict()
running_requests = {}
if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE and self.autoscaling_config:
look_back_period = self.autoscaling_config.look_back_period_s
window_start_time = time.time() - look_back_period
running_requests = {
replica_id: self.metrics_store.window_average(
replica_id, time.time() - look_back_period
replica_id, window_start_time
)
# If data hasn't been recorded yet, return current
# number of queued and ongoing requests.
@@ -311,6 +319,45 @@ async def shutdown(self):
self._shutdown = True


class SharedHandleMetricsPusher:
def __init__(self, controller_handle: ActorHandle):
self._controller_handler = controller_handle

self._metrics_pusher = MetricsPusher()
self._router_metrics_managers: weakref.WeakSet[
RouterMetricsManager
] = weakref.WeakSet()

@classmethod
@lru_cache(maxsize=None)
def get_or_create(
cls, controller_handle: ActorHandle
) -> "SharedHandleMetricsPusher":
pusher = cls(controller_handle=controller_handle)
pusher.start()
logger.info(f"Started {pusher}.")
return pusher

def register(self, router_metrics_manager: RouterMetricsManager) -> None:
self._router_metrics_managers.add(router_metrics_manager)

def start(self) -> None:
self._metrics_pusher.start()

self._metrics_pusher.register_or_update_task(
"push_metrics_to_controller",
self.push_metrics,
HANDLE_METRIC_PUSH_INTERVAL_S,
)

def push_metrics(self) -> None:
# TODO: gathering reports could block the event loop for a long time
logger.debug("Pushing handle metrics to controller")
self._controller_handler.bulk_record_handle_metrics.remote(
[m.metrics_report() for m in self._router_metrics_managers]
)


class Router(ABC):
@abstractmethod
def running_replicas_populated(self) -> bool:
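
As a rough, self-contained illustration of the per-process grouping pattern that the new SharedHandleMetricsPusher implements above: one pusher is memoized per controller handle with lru_cache, handles register themselves into a WeakSet, and a single bulk call carries all of their reports. FakeController and FakeMetricsManager are stand-ins invented for this sketch; the real code uses Ray actor handles, RouterMetricsManager, and MetricsPusher.

import weakref
from functools import lru_cache


class FakeMetricsManager:
    """Stand-in for RouterMetricsManager: only produces a report."""

    def __init__(self, handle_id: str):
        self.handle_id = handle_id

    def metrics_report(self) -> dict:
        return {"handle_id": self.handle_id, "queued_requests": 0}


class FakeController:
    """Stand-in for the Serve controller actor handle."""

    def bulk_record_handle_metrics(self, reports) -> None:
        print(f"controller received {len(reports)} handle metric reports")


class SharedPusher:
    """One instance per (process, controller); all handles in the process share it."""

    def __init__(self, controller: FakeController):
        self._controller = controller
        # Weak references: a handle that is garbage collected silently
        # drops out of future pushes.
        self._managers = weakref.WeakSet()

    @classmethod
    @lru_cache(maxsize=None)
    def get_or_create(cls, controller: FakeController) -> "SharedPusher":
        # Memoized per controller handle, so every handle created in this
        # process reuses the same pusher.
        return cls(controller)

    def register(self, manager: FakeMetricsManager) -> None:
        self._managers.add(manager)

    def push_once(self) -> None:
        # One bulk call per process instead of one call per handle.
        reports = [m.metrics_report() for m in self._managers]
        self._controller.bulk_record_handle_metrics(reports)


controller = FakeController()
pusher = SharedPusher.get_or_create(controller)
assert pusher is SharedPusher.get_or_create(controller)  # same shared instance
m1, m2 = FakeMetricsManager("handle-1"), FakeMetricsManager("handle-2")
pusher.register(m1)
pusher.register(m2)
pusher.push_once()  # prints: controller received 2 handle metric reports

Because the set holds only weak references, a handle that goes away stops appearing in subsequent pushes without any explicit unregistration, which is the lifecycle concern a shared, process-wide pusher otherwise introduces.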
17 changes: 10 additions & 7 deletions python/ray/serve/tests/unit/test_router.py
@@ -11,6 +11,7 @@
from ray._private.test_utils import async_wait_for_condition
from ray._private.utils import get_or_create_event_loop
from ray.exceptions import ActorDiedError, ActorUnavailableError
from ray.serve._private.autoscaling_state import HandleMetricReport
from ray.serve._private.common import (
DeploymentHandleSource,
DeploymentID,
@@ -907,13 +908,15 @@ def test_push_autoscaling_metrics_to_controller(self):
# Check metrics are pushed correctly
metrics_manager.push_autoscaling_metrics_to_controller()
mock_controller_handle.record_handle_metrics.remote.assert_called_with(
deployment_id=deployment_id,
handle_id=handle_id,
actor_id=self_actor_id,
handle_source=DeploymentHandleSource.PROXY,
queued_requests=n,
running_requests=running_requests,
send_timestamp=start,
HandleMetricReport(
deployment_id=deployment_id,
handle_id=handle_id,
actor_id=self_actor_id,
handle_source=DeploymentHandleSource.PROXY,
queued_requests=n,
running_requests=running_requests,
timestamp=start,
)
)

@pytest.mark.skipif(