diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index 13cee4961..c0af2d2aa 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -20,7 +20,7 @@ jobs:
       should_skip: ${{ steps.skip_check.outputs.should_skip }}
     steps:
       - id: skip_check
-        uses: fkirc/skip-duplicate-actions@v4
+        uses: fkirc/skip-duplicate-actions@master
        with:
          concurrent_skipping: 'same_content'
          paths_ignore: '["**/README.md", "**/docs/**", "CHANGELOG.md"]'
diff --git a/servo/cli.py b/servo/cli.py
index 4bac9fbec..4b73fc2d0 100644
--- a/servo/cli.py
+++ b/servo/cli.py
@@ -28,6 +28,7 @@
 import click
 import devtools
 import kubernetes_asyncio
+import kubernetes_asyncio.config
 import loguru
 import pydantic
 import pygments
@@ -44,6 +45,8 @@
 import servo.utilities.yaml
 import servo.utilities.strings
 
+from servo.connectors.kubernetes_helpers import DeploymentHelper
+
 
 class Section(str, enum.Enum):
     assembly = "Assembly Commands"
@@ -1557,34 +1560,21 @@ def inject_sidecar(
 
     if target.startswith("deploy"):
         deployment = run_async(
-            servo.connectors.kubernetes.Deployment.read(
-                target.split("/", 1)[1], namespace
-            )
+            DeploymentHelper.read(target.split("/", 1)[1], namespace)
         )
         run_async(
-            deployment.inject_sidecar(
-                "opsani-envoy", image, service=service, port=port
+            DeploymentHelper.inject_sidecar(
+                deployment, "opsani-envoy", image, service=service, port=port
             )
         )
         typer.echo(
-            f"Envoy sidecar injected to Deployment {deployment.name} in {namespace}"
+            f"Envoy sidecar injected to Deployment {deployment.metadata.name} in {namespace}"
         )
     elif target.startswith("rollout"):
-        rollout = run_async(
-            servo.connectors.kubernetes.Rollout.read(
-                target.split("/", 1)[1], namespace
-            )
-        )
-        run_async(
-            rollout.inject_sidecar(
-                "opsani-envoy", image, service=service, port=port
-            )
-        )
-        typer.echo(
-            f"Envoy sidecar injected to Rollout {rollout.name} in {namespace}"
+        raise typer.BadParameter(
+            "Rollout sidecar injection is not yet implemented"
         )
-
     elif target.startswith("pod"):
         raise typer.BadParameter("Pod sidecar injection is not yet implemented")
     else:
diff --git a/servo/connectors/kube_metrics.py b/servo/connectors/kube_metrics.py
index c6846b576..f48e75ae0 100644
--- a/servo/connectors/kube_metrics.py
+++ b/servo/connectors/kube_metrics.py
@@ -14,21 +14,26 @@
 import servo
 from servo.checks import CheckError
+from servo.connectors.kubernetes_helpers import (
+    dict_to_selector,
+    find_container,
+    get_containers,
+    ContainerHelper,
+    DeploymentHelper,
+    PodHelper,
+    StatefulSetHelper,
+)
 from servo.connectors.kubernetes import (
-    Container,
-    Deployment,
     DNSSubdomainName,
     Core,
     PermissionSet,
-    Pod,
-    ResourceRequirement,
-    Rollout,
-    selector_string,
     ShortByteSize,
 )
-from servo.types import DataPoint, Metric, TimeSeries
+import servo.types
+from servo.types import DataPoint, Metric, TimeSeries, Resource, ResourceRequirement
 
 import kubernetes_asyncio.client
+from kubernetes_asyncio.client import V1Container, V1Deployment, V1Pod, V1StatefulSet
 import kubernetes_asyncio.client.api_client
 import kubernetes_asyncio.client.exceptions
 import kubernetes_asyncio.config
@@ -84,8 +89,10 @@ class KubeMetricsConfiguration(servo.BaseConfiguration):
         description="Namespace of the target resource"
     )
     name: str = pydantic.Field(description="Name of the target resource")
-    kind: pydantic.constr(regex=r"^([Dd]eployment|[Rr]ollout)$") = pydantic.Field(
-        default="Deployment", description="Kind of the target resource"
+    kind: str = pydantic.Field(
+        default="Deployment",
+        description="Kind of the target resource",
+        regex=r"^([Dd]eployment|[Ss]tateful[Ss]et)$",
     )
     container: Optional[str] = pydantic.Field(
         default=None, description="Name of the target resource container"
     )
@@ -141,22 +148,18 @@ async def check_metrics_api_permissions(self) -> None:
             for permission in KUBERNETES_PERMISSIONS:
                 for resource in permission.resources:
                     for verb in permission.verbs:
-                        attributes = (
-                            kubernetes_asyncio.client.models.V1ResourceAttributes(
-                                namespace=self.config.namespace,
-                                group=permission.group,
-                                resource=resource,
-                                verb=verb,
-                            )
+                        attributes = kubernetes_asyncio.client.V1ResourceAttributes(
+                            namespace=self.config.namespace,
+                            group=permission.group,
+                            resource=resource,
+                            verb=verb,
                         )
-                        spec = kubernetes_asyncio.client.models.V1SelfSubjectAccessReviewSpec(
+                        spec = kubernetes_asyncio.client.V1SelfSubjectAccessReviewSpec(
                             resource_attributes=attributes
                         )
-                        review = (
-                            kubernetes_asyncio.client.models.V1SelfSubjectAccessReview(
-                                spec=spec
-                            )
+                        review = kubernetes_asyncio.client.V1SelfSubjectAccessReview(
+                            spec=spec
                         )
                         access_review = await v1.create_self_subject_access_review(
                             body=review
                         )
@@ -171,7 +174,9 @@ async def check_metrics_api(self) -> None:
         async with kubernetes_asyncio.client.api_client.ApiClient() as api:
             cust_obj_api = kubernetes_asyncio.client.CustomObjectsApi(api_client=api)
             await cust_obj_api.list_namespaced_custom_object(
-                label_selector=selector_string(target_resource.match_labels),
+                label_selector=dict_to_selector(
+                    target_resource.spec.selector.match_labels
+                ),
                 namespace=self.config.namespace,
                 **METRICS_CUSTOM_OJBECT_CONST_ARGS,
             )
@@ -181,17 +186,13 @@ async def check_target_containers(self) -> None:
         target_resource = await _get_target_resource(self.config)
         if self.config.container:
             assert (
-                next(
-                    (
-                        c
-                        for c in target_resource.containers
-                        if c.name == self.config.container
-                    ),
-                    None,
-                )
+                find_container(workload=target_resource, name=self.config.container)
                 is not None
-            ), f"Configured container {self.config.container} was not found in target app containers ({', '.join((c.name for c in target_resource.containers))})"
-        elif len(target_resource.containers) > 1:
+            ), (
+                f"Configured container {self.config.container} was not found in target app containers"
+                f" ({', '.join((c.name for c in get_containers(workload=target_resource)))})"
+            )
+        elif len(get_containers(workload=target_resource)) > 1:
             raise CheckError(
                 "Container name must be configured for target application with multiple containers"
             )
@@ -261,7 +262,6 @@ async def measure(
         target_metrics = [
             m for m in self.config.metrics_to_collect if m.value in metrics
         ]
-        target_resource = await _get_target_resource(self.config)
 
         progress_duration = servo.Duration(control.warmup + control.duration)
         progress = servo.EventProgress(timeout=progress_duration)
@@ -286,7 +286,6 @@ async def measure(
 
                 try:
                     await self.periodic_measure(
-                        target_resource=target_resource,
                         target_metrics=target_metrics,
                         datapoints_dicts=datapoints_dicts,
                     )
@@ -353,19 +352,20 @@ def _get_target_container_metrics(
 
     async def periodic_measure(
         self,
-        target_resource: Union[Deployment, Rollout],
         target_metrics: list[SupportedKubeMetrics],
         datapoints_dicts: Dict[str, Dict[str, List[DataPoint]]],
     ) -> None:
         # Retrieve latest main state
-        await target_resource.refresh()
+        target_resource = await _get_target_resource(self.config)
         target_resource_container = _get_target_resource_container(
             self.config, target_resource
         )
 
         async with kubernetes_asyncio.client.api_client.ApiClient() as api:
             cust_obj_api = kubernetes_asyncio.client.CustomObjectsApi(api_client=api)
-            label_selector_str = selector_string(target_resource.match_labels)
+            label_selector_str = dict_to_selector(
+                target_resource.spec.selector.match_labels
+            )
             timestamp = datetime.now()
             if any((m in MAIN_METRICS_REQUIRE_CUST_OBJ for m in target_metrics)):
@@ -404,8 +404,8 @@ async def periodic_measure(
                             value=mem_usage,
                         )
 
-                    cpu_resources = target_resource_container.get_resource_requirements(
-                        "cpu"
+                    cpu_resources = ContainerHelper.get_resource_requirements(
+                        target_resource_container, Resource.cpu.value
                     )
                     # Set requests = limits if not specified
                     if (
@@ -441,8 +441,8 @@ async def periodic_measure(
                             value=cpu_saturation,
                         )
 
-                    mem_resources = target_resource_container.get_resource_requirements(
-                        "memory"
+                    mem_resources = ContainerHelper.get_resource_requirements(
+                        target_resource_container, Resource.memory.value
                     )
                     # Set requests = limits if not specified
                     if (
@@ -486,40 +486,45 @@ async def periodic_measure(
                         datapoints_dicts=datapoints_dicts,
                         time=timestamp,
                     )
-            for pod in await target_resource.get_pods():
+            target_pods = [
+                pod
+                for pod in await PodHelper.list_pods_with_labels(
+                    target_resource.metadata.namespace,
+                    target_resource.spec.selector.match_labels,
+                )
+                if "tuning" not in pod.metadata.name
+            ]
+            for pod in target_pods:
                 _append_data_point_for_time(
-                    pod_name=pod.name,
+                    pod_name=pod.metadata.name,
                     metric_name=SupportedKubeMetrics.MAIN_POD_RESTART_COUNT.value,
-                    value=pod.restart_count,
+                    value=PodHelper.get_restart_count(pod),
                 )
 
             # Retrieve latest tuning state
-            target_resource_tuning_pod_name = f"{target_resource.name}-tuning"
-            target_resource_tuning_pod: Pod = next(
-                (
-                    p
-                    for p in await target_resource.get_pods()
-                    if p.name == target_resource_tuning_pod_name
-                ),
-                None,
-            )
+            target_resource_tuning_pod_name = f"{target_resource.metadata.name}-tuning"
+            try:
+                target_resource_tuning_pod = await PodHelper.read(
+                    target_resource_tuning_pod_name, target_resource.metadata.namespace
+                )
+            except kubernetes_asyncio.client.exceptions.ApiException as e:
+                if e.status != 404 or e.reason != "Not Found":
+                    raise
+                target_resource_tuning_pod = None
+
             if target_resource_tuning_pod:
                 target_resource_tuning_pod_container = _get_target_resource_container(
                     self.config, target_resource_tuning_pod
                 )
-                cpu_resources = (
-                    target_resource_tuning_pod_container.get_resource_requirements(
-                        "cpu"
-                    )
+                cpu_resources = ContainerHelper.get_resource_requirements(
+                    target_resource_tuning_pod_container, Resource.cpu.value
                 )
                 # Set requests = limits if not specified
                 if (cpu_request := cpu_resources[ResourceRequirement.request]) is None:
                     cpu_request = cpu_resources[ResourceRequirement.limit]
-                mem_resources = (
-                    target_resource_tuning_pod_container.get_resource_requirements(
-                        "memory"
-                    )
+                mem_resources = ContainerHelper.get_resource_requirements(
+                    target_resource_tuning_pod_container, Resource.memory.value
                 )
                 if (mem_request := mem_resources[ResourceRequirement.request]) is None:
                     mem_request = mem_resources[ResourceRequirement.limit]
@@ -537,7 +542,9 @@ async def periodic_measure(
             restart_count = None
             if SupportedKubeMetrics.TUNING_POD_RESTART_COUNT in target_metrics:
                 if target_resource_tuning_pod is not None:
-                    restart_count = target_resource_tuning_pod.restart_count
+                    restart_count = PodHelper.get_restart_count(
+                        target_resource_tuning_pod
+                    )
                 else:
                     restart_count = 0
 
@@ -550,7 +557,7 @@ async def periodic_measure(
                 # TODO: (potential improvement) raise error if more than 1 tuning pod?
for pod_entry in tuning_metrics["items"]: pod_name = pod_entry["metadata"]["name"] - if pod_name != f"{target_resource.name}-tuning": + if pod_name != target_resource_tuning_pod_name: raise RuntimeError(f"Got unexpected tuning pod name {pod_name}") timestamp = isoparse(pod_entry["timestamp"]) _append_data_point_for_pod = functools.partial( @@ -669,12 +676,12 @@ def _append_data_point( async def _get_target_resource( config: KubeMetricsConfiguration, -) -> Union[Deployment, Rollout]: +) -> Union[V1Deployment, V1StatefulSet]: read_args = dict(name=config.name, namespace=config.namespace) if config.kind.lower() == "deployment": - return await Deployment.read(**read_args) - elif config.kind.lower() == "rollout": - return await Rollout.read(**read_args) + return await DeploymentHelper.read(**read_args) + elif config.kind.lower() == "statefulset": + return await StatefulSetHelper.read(**read_args) else: raise NotImplementedError( f"Resource type {config.kind} is not supported by the kube-metrics connector" @@ -682,27 +689,26 @@ async def _get_target_resource( def _get_target_resource_container( - config: KubeMetricsConfiguration, target_resource: Union[Deployment, Rollout, Pod] -) -> Container: + config: KubeMetricsConfiguration, + target_resource: Union[V1Deployment, V1StatefulSet, V1Pod], +) -> V1Container: if config.container: - if isinstance(target_resource, Pod): - target_resource_container: Container = target_resource.get_container( - config.container - ) - else: - target_resource_container: Container = target_resource.find_container( - config.container - ) - + target_resource_container = find_container( + workload=target_resource, name=config.container + ) if target_resource_container is None: raise RuntimeError( - f"Unable to locate container {config.container} in {target_resource.obj.kind} {target_resource.name}" + f"Unable to locate container {config.container} in {target_resource.kind} {target_resource.metadata.name}" ) - elif len(target_resource.containers) > 1: - # TODO (improvement) can support this with ID append - raise RuntimeError(f"Unable to derive metrics for multi-container resources") else: - target_resource_container: Container = target_resource.containers[0] + containers = get_containers(workload=target_resource) + # TODO (improvement) can support this with ID append + if len(containers) > 1: + raise RuntimeError( + f"Unable to derive metrics for multi-container resources" + ) + + target_resource_container = containers[0] return target_resource_container diff --git a/servo/connectors/kubernetes.py b/servo/connectors/kubernetes.py index e2e0254ba..026be009f 100644 --- a/servo/connectors/kubernetes.py +++ b/servo/connectors/kubernetes.py @@ -4,3383 +4,56 @@ import abc import asyncio -import collections import contextlib -import copy -import datetime import decimal import enum import functools import itertools -import json -import operator import os import pathlib -import re -from typing import ( - Any, - AsyncIterator, - AsyncContextManager, - Callable, - ClassVar, - Collection, - Coroutine, - Dict, - Generator, - Iterable, - List, - Mapping, - Optional, - Protocol, - Tuple, - Type, - Union, - cast, - get_type_hints, - runtime_checkable, -) - -import backoff -import kubernetes_asyncio -import kubernetes_asyncio.client -import kubernetes_asyncio.client.api_client -import kubernetes_asyncio.client.exceptions -import kubernetes_asyncio.client.models -from kubernetes_asyncio.client.models.v1_container import V1Container -from kubernetes_asyncio.client.models.v1_container_status 
import V1ContainerStatus -from kubernetes_asyncio.client.models.v1_env_var import V1EnvVar -import kubernetes_asyncio.watch -import pydantic - -import servo -from servo.telemetry import ONE_MiB -from servo.types.kubernetes import * - - -class Condition(servo.logging.Mixin): - """A Condition is a convenience wrapper around a function and its arguments - which allows the function to be called at a later time. - - The function is called in the ``check`` method, which resolves the result to - a boolean value, thus the condition function should return a boolean or - something that ultimately resolves to a Truthy or Falsey value. - - Args: - name: The name of the condition to make it easier to identify. - fn: The condition function that will be checked. - *args: Any arguments for the condition function. - **kwargs: Any keyword arguments for the condition function. - - Attributes: - name (str): The name of the Condition. - fn (callable): The condition function that will be checked. - args (tuple): Arguments for the checking function. - kwargs (dict): Keyword arguments for the checking function. - last_check (bool): Holds the state of the last condition check. - - Raises: - ValueError: The given ``fn`` is not callable. - """ - - def __init__(self, name: str, fn: Callable, *args, **kwargs) -> None: # noqa: D107 - if not callable(fn): - raise ValueError("The Condition function must be callable") - - self.name = name - self.fn = fn - self.args = args - self.kwargs = kwargs - - # last check holds the state of the last check. - self.last_check = False - - def __str__(self) -> str: - return f"" - - def __repr__(self) -> str: - return self.__str__() - - async def check(self) -> bool: - """Check that the condition was met. - - Returns: - True if the condition was met; False otherwise. - """ - if asyncio.iscoroutinefunction(self.fn): - self.last_check = bool(await self.fn(*self.args, **self.kwargs)) - else: - self.last_check = bool(self.fn(*self.args, **self.kwargs)) - return self.last_check - - -async def wait_for_condition( - condition: Condition, - interval: servo.DurationDescriptor = 0.05, - fail_on_api_error: bool = True, -) -> None: - """Wait for a condition to be met. - - Args: - condition: The Condition to wait for. - timeout: The maximum time to wait, in seconds, for the condition to be met. - If unspecified, this function will wait indefinitely. If specified and - the timeout is met or exceeded, a TimeoutError will be raised. - interval: The time, in seconds, to wait before re-checking the condition. - fail_on_api_error: Fail the condition checks if a Kubernetes API error is - incurred. An API error can be raised for a number of reasons, including - a Pod being restarted and temporarily unavailable. Disabling this will - cause those errors to be ignored, allowing the check to continue until - timeout or resolution. (default: True). - - Raises: - TimeoutError: The specified timeout was exceeded. 
- """ - servo.logger.debug(f"waiting for condition: {condition}") - - started_at = datetime.datetime.now() - duration = servo.Duration(interval) - - async def _wait_for_condition() -> None: - servo.logger.debug(f"wait for condition: {condition}") - while True: - try: - servo.logger.trace(f"checking condition {condition}") - if await condition.check(): - servo.logger.trace(f"condition passed: {condition}") - break - - # if the condition is not met, sleep for the interval - # to re-check later - servo.logger.trace(f"sleeping for {duration}") - await asyncio.sleep(duration.total_seconds()) - - except asyncio.CancelledError: - servo.logger.trace(f"wait for condition cancelled: {condition}") - raise - - except kubernetes_asyncio.client.exceptions.ApiException as e: - servo.logger.warning(f"encountered API exception while waiting: {e}") - if fail_on_api_error: - raise - - task = asyncio.create_task(_wait_for_condition()) - try: - await task - except asyncio.CancelledError: - task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await task - - raise - finally: - servo.logger.debug( - f"wait completed (total={servo.Duration.since(started_at)}) {condition}" - ) - - -class Resource(str, enum.Enum): - memory = "memory" - cpu = "cpu" - - @classmethod - def values(cls) -> List[str]: - """ - Return a list of strings that identifies all resource values. - """ - return list(map(lambda rsrc: rsrc.value, cls.__members__.values())) - - -class ResourceRequirement(enum.Enum): - """ - The ResourceRequirement enumeration determines how optimization values are submitted to the - Kubernetes scheduler to allocate core compute resources. Requests establish the lower bounds - of the CPU and memory necessary for an application to execute while Limits define the upper - bounds for resources that can be consumed by a given Pod. The Opsani engine can determine - optimal values for these settings by identifying performant, low cost configurations that meet - target SLOs and/or maximizing performance while identifying the point of diminishing returns - on further resourcing. - """ - - request = "request" - limit = "limit" - - @property - def resources_key(self) -> str: - """ - Return a string value for accessing resource requirements within a Kubernetes Container representation. - """ - if self == ResourceRequirement.request: - return "requests" - elif self == ResourceRequirement.limit: - return "limits" - else: - raise NotImplementedError( - f'missing resources_key implementation for resource requirement "{self}"' - ) - - -@runtime_checkable -class KubernetesObj(Protocol): - """ - KubernetesObj is a protocol that defines the common attributes - of objects retrieved from the Kubernetes API. - """ - - @property - def api_version(self) -> str: - ... - - @property - def kind(self) -> str: - ... - - @property - def metadata(self) -> kubernetes_asyncio.client.V1ObjectMeta: - ... - - -class KubernetesModel(abc.ABC, servo.logging.Mixin): - """ - KubernetesModel is an abstract base class for Servo connector - models that wrap Kubernetes API objects. - - This base class provides common functionality and common object - properties for all API wrappers. 
It also defines the following - abstract methods which all subclasses must implement: - - - ``create``: create the resource on the cluster - - ``patch``: partially update the resource on the cluster - - ``delete``: remove the resource from the cluster - - ``refresh``: refresh the underlying object model - - ``is_ready``: check if the object is in the ready state - - Args: - api_object: The underlying Kubernetes API object. - - Attributes: - obj: The underlying Kubernetes API object. - """ - - obj: KubernetesObj - """The underlying Kubernetes API object. Subclasses must update - the type hint to reflect the type that they are wrapping. - """ - - api_clients: ClassVar[Dict[str, Type]] - """A mapping of all the supported api clients for the API - object type. Various resources can have multiple versions, - e.g. "apps/v1", "apps/v1beta1", etc. The preferred version - for each resource type should be defined under the "preferred" - key. The preferred API client will be used when the apiVersion - is not specified for the resource. - """ - - def __init__(self, obj, **kwargs) -> None: # noqa: D107 - self.obj = obj - self._logger = servo.logger - - def __str__(self) -> str: - return str(self.obj) - - def __repr__(self) -> str: - return self.__str__() - - @classmethod - def obj_type(cls) -> Type: - """The type of the underlying Kubernetes API object.""" - return get_type_hints(cls)["obj"] - - @property - def api_version(self) -> str: - """The API version of the Kubernetes object (`obj.apiVersion``).""" - return self.obj.api_version - - @property - def name(self) -> str: - """The name of the Kubernetes object (``obj.metadata.name``).""" - return cast(str, self.obj.metadata.name) - - @name.setter - def name(self, name: str): - """Set the name of the Kubernetes object (``obj.metadata.name``).""" - self.obj.metadata.name = name - - @property - def namespace(self) -> str: - """The namespace of the Kubernetes object (``obj.metadata.namespace``).""" - return cast(str, self.obj.metadata.namespace) - - @namespace.setter - def namespace(self, namespace: str): - """Set the namespace of the Kubernetes object (``obj.metadata.namespace``).""" - self.obj.metadata.namespace = namespace - - @contextlib.asynccontextmanager - async def api_client( - self, default_headers: Dict[str, str] = {} - ) -> Generator[Any, None, None]: - """The API client for the Kubernetes object. This is determined - by the ``apiVersion`` of the object configuration. - - Raises: - ValueError: The API version is not supported. - """ - c = self.api_clients.get(self.api_version) - # If we didn't find the client in the api_clients dict, use the - # preferred version. - if c is None: - self.logger.debug( - f"unknown API version ({self.api_version}) for {self.__class__.__name__}, falling back to preferred version" - ) - c = self.api_clients.get("preferred") - if c is None: - raise ValueError( - "unknown version specified and no preferred version " - f"defined for resource ({self.api_version})" - ) - # If we did find it, initialize that client version. - async with kubernetes_asyncio.client.api_client.ApiClient() as api: - for k, v in default_headers.items(): - api.set_default_header(k, v) - yield c(api) - - @classmethod - @contextlib.asynccontextmanager - async def preferred_client(cls) -> Generator[Any, None, None]: - """The preferred API client type for the Kubernetes object. This is defined in the - ``api_clients`` class member dict for each object. - - Raises: - ValueError: No preferred client is defined for the object. 
- """ - c = cls.api_clients.get("preferred") - if c is None: - raise ValueError( - f"no preferred api client defined for object {cls.__name__}", - ) - async with kubernetes_asyncio.client.api_client.ApiClient() as api: - yield c(api) - - @abc.abstractclassmethod - async def read(cls, name: str, namespace: str) -> "KubernetesModel": - """Read the underlying Kubernetes resource from the cluster and - return a model instance. - - Args: - name: The name of the resource to read. - namespace: The namespace to read the resource from. - """ - - @abc.abstractmethod - async def create(self, namespace: str = None) -> None: - """Create the underlying Kubernetes resource in the cluster - under the given namespace. - - Args: - namespace: The namespace to create the resource under. - If no namespace is provided, it will use the instance's - namespace member, which is set when the object is created - via the kubernetes_asyncio.client - """ - - @abc.abstractmethod - async def patch(self) -> None: - """Partially update the underlying Kubernetes resource in the cluster.""" - - @abc.abstractmethod - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions - ) -> kubernetes_asyncio.client.V1Status: - """Delete the underlying Kubernetes resource from the cluster. - - This method expects the resource to have been loaded or otherwise - assigned a namespace already. If it has not, the namespace will need - to be set manually. - - Args: - options: Options for resource deletion. - """ - - @abc.abstractmethod - async def refresh(self) -> None: - """Refresh the local state (``obj``) of the underlying Kubernetes resource.""" - - @abc.abstractmethod - async def is_ready(self) -> bool: - """Check if the resource is in the ready state. - - It is up to the wrapper subclass to define what "ready" means for - that particular resource. - - Returns: - True if in the ready state; False otherwise. - """ - - async def wait_until_ready( - self, - interval: servo.DurationDescriptor = 1, - fail_on_api_error: bool = False, - ) -> None: - """Wait until the resource is in the ready state. - - Args: - timeout: The maximum time to wait, in seconds, for the resource - to reach the ready state. If unspecified, this will wait - indefinitely. If specified and the timeout is met or exceeded, - a TimeoutError will be raised. - interval: The time, in seconds, to wait before re-checking if the - object is ready. - fail_on_api_error: Fail if an API error is raised. An API error can - be raised for a number of reasons, such as 'resource not found', - which could be the case when a resource is just being started or - restarted. When waiting for readiness we generally do not want to - fail on these conditions. - - Raises: - TimeoutError: The specified timeout was exceeded. - """ - ready_condition = Condition( - "api object ready", - self.is_ready, - ) - - task = asyncio.create_task( - wait_for_condition( - condition=ready_condition, - interval=interval, - fail_on_api_error=fail_on_api_error, - ) - ) - try: - await task - except asyncio.CancelledError: - task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await task - raise - - async def wait_until_deleted(self, interval: servo.DurationDescriptor = 1) -> None: - """Wait until the resource is deleted from the cluster. - - Args: - timeout: The maximum time to wait, in seconds, for the resource to - be deleted from the cluster. If unspecified, this will wait - indefinitely. If specified and the timeout is met or exceeded, - a TimeoutError will be raised. 
- interval: The time, in seconds, to wait before re-checking if the - object has been deleted. - - Raises: - TimeoutError: The specified timeout was exceeded. - """ - - async def deleted_fn(): - try: - await self.refresh() - except kubernetes_asyncio.client.exceptions.ApiException as e: - # If we can no longer find the deployment, it is deleted. - # If we get any other exception, raise it. - if e.status == 404 and e.reason == "Not Found": - return True - else: - self.logger.error("error refreshing object state") - raise e - else: - # The object was still found, so it has not been deleted - return False - - delete_condition = Condition("api object deleted", deleted_fn) - - task = asyncio.create_task( - wait_for_condition( - condition=delete_condition, - interval=interval, - ) - ) - - try: - await task - except asyncio.CancelledError: - task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await task - raise - - async def raise_for_status(self) -> None: - """Raise an exception if in an unhealthy state.""" - self.logger.warning( - f"raise_for_status not implemented on {self.__class__.__name__}" - ) - - -class Namespace(KubernetesModel): - """Kubetest wrapper around a Kubernetes `Namespace`_ API Object. - - The actual ``kubernetes.client.V1Namespace`` instance that this - wraps can be accessed via the ``obj`` instance member. - - This wrapper provides some convenient functionality around the - API Object and provides some state management for the `Namespace`_. - - .. _Namespace: - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#namespace-v1-core - """ - - obj: kubernetes_asyncio.client.V1Namespace - api_clients: ClassVar[Dict[str, Type]] = { - "preferred": kubernetes_asyncio.client.CoreV1Api, - "v1": kubernetes_asyncio.client.CoreV1Api, - } - - @classmethod - def new(cls, name: str) -> "Namespace": - """Create a new Namespace with object backing. - - Args: - name: The name of the new Namespace. - - Returns: - A new Namespace instance. - """ - return cls( - obj=kubernetes_asyncio.client.V1Namespace( - api_version="v1", - metadata=kubernetes_asyncio.client.V1ObjectMeta(name=name), - ) - ) - - @classmethod - async def read(cls, name: str) -> "Namespace": - """Read a Namespace from the Kubernetes API. - - Args: - name: The name of the Namespace to read. - - Returns: - A hydrated Namespace instance. - """ - namespace = cls.new(name) - await namespace.refresh() - return namespace - - async def create(self, name: str = None) -> None: - """Create the Namespace under the given name. - - Args: - name: The name to create the Namespace under. If the - name is not provided, it will be assumed to already be - in the underlying object spec. If it is not, namespace - operations will fail. - """ - if name is not None: - self.name = name - - self.logger.info(f'creating namespace "{self.name}"') - - async with self.api_client() as api_client: - self.obj = await api_client.create_namespace( - body=self.obj, - ) - - async def patch(self) -> None: - """ - TODO: Add docs.... - """ - async with self.api_client() as api_client: - await api_client.patch_namespace( - name=self.name, - body=self.obj, - ) - - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions = None - ) -> kubernetes_asyncio.client.V1Status: - """Delete the Namespace. - - Args: - options: Options for Namespace deletion. - - Returns: - The status of the delete operation. 
- """ - if options is None: - options = kubernetes_asyncio.client.V1DeleteOptions() - - self.logger.info(f'deleting namespace "{self.name}"') - self.logger.debug(f"delete options: {options}") - - async with self.api_client() as api_client: - return await api_client.delete_namespace( - name=self.name, - body=options, - ) - - async def refresh(self) -> None: - """Refresh the underlying Kubernetes Namespace resource.""" - async with self.api_client() as api_client: - self.obj = await api_client.read_namespace( - name=self.name, - ) - - async def is_ready(self) -> bool: - """Check if the Namespace is in the ready state. - - Returns: - True if in the ready state; False otherwise. - """ - await self.refresh() - - status = self.obj.status - if status is None: - return False - - return status.phase.lower() == "active" - - -_DEFAULT_SENTINEL = object() - - -class Container(servo.logging.Mixin): - """Kubetest wrapper around a Kubernetes `Container`_ API Object. - - The actual ``kubernetes.client.V1Container`` instance that this - wraps can be accessed via the ``obj`` instance member. - - This wrapper provides some convenient functionality around the - API Object and provides some state management for the `Container`_. - - This wrapper does **NOT** subclass the ``objects.ApiObject`` like other - object wrappers because it is not intended to be created or - managed from manifest file. It is merely meant to wrap the - Container spec for a Pod to make Container-targeted actions - easier. - - .. _Container: - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#container-v1-core - """ - - def __init__(self, api_object, pod) -> None: # noqa: D107 - self.obj: V1Container = api_object - self.pod: Pod = pod - - @property - def name(self) -> str: - return self.obj.name - - @property - def image(self) -> str: - """ - Returns the container image name from the underlying container object. - """ - return self.obj.image - - async def get_restart_count(self) -> int: - """Get the number of times the Container has been restarted. - - Returns: - The number of times the Container has been restarted. - """ - container_name = self.obj.name - pod_status = await self.pod.get_status() - - # If there are no container status, the container hasn't started - # yet, so there cannot be any restarts. - if pod_status.container_statuses is None: - return 0 - - for status in pod_status.container_statuses: - if status.name == container_name: - return status.restart_count - - raise RuntimeError(f"Unable to determine container status for {container_name}") - - @property - def resources(self) -> kubernetes_asyncio.client.V1ResourceRequirements: - """ - Return the resource requirements for the Container. - - Returns: - The Container resource requirements. - """ - return self.obj.resources - - @resources.setter - def resources( - self, resources: kubernetes_asyncio.client.V1ResourceRequirements - ) -> None: - """ - Set the resource requirements for the Container. - - Args: - resources: The resource requirements to set. - """ - self.obj.resources = resources - - def get_resource_requirements( - self, resource_type: str - ) -> Dict[ResourceRequirement, Optional[str]]: - """Return a dictionary mapping resource requirements to values for a given resource (e.g., cpu or memory). - - This method is safe to call for containers that do not define any resource requirements (e.g., the `resources` property is None). - - Requirements that are not defined for the named resource are returned as None. 
For example, a container - that defines CPU requests but does not define limits would return a dict with a `None` value for - the `ResourceRequirement.limit` key. - - Args: - resource_type: The type of resource to get the requirements of (e.g., "cpu" or "memory"). - - Returns: - A dictionary mapping ResourceRequirement enum members to optional string values. - """ - resources: kubernetes_asyncio.client.V1ResourceRequirements = getattr( - self, "resources", kubernetes_asyncio.client.V1ResourceRequirements() - ) - requirements = {} - for requirement in ResourceRequirement: - # Get the 'requests' or 'limits' nested structure - requirement_subdict = getattr(resources, requirement.resources_key, {}) - if requirement_subdict: - requirements[requirement] = requirement_subdict.get(resource_type) - else: - requirements[requirement] = None - - return requirements - - def set_resource_requirements( - self, resource_type: str, requirements: dict[ResourceRequirement, Optional[str]] - ) -> None: - """Sets resource requirements on the container for the values in the given dictionary. - - If no resources have been defined yet, a resources model is provisioned. - If no requirements have been defined for the given resource name, a requirements dictionary is defined. - Values of None are removed from the target requirements. - ResourceRequirement keys that are not present in the dict are not modified. - - Args: - resource_type: The name of the resource to set the requirements of (e.g., "cpu" or "memory"). - requirements: A dict mapping requirements to target values (e.g., `{ResourceRequirement.request: '500m', ResourceRequirement.limit: '2000m'}) - """ - resources: kubernetes_asyncio.client.V1ResourceRequirements = copy.copy( - getattr( - self, "resources", kubernetes_asyncio.client.V1ResourceRequirements() - ) - ) - - for requirement, value in requirements.items(): - resource_to_values = getattr(resources, requirement.resources_key, {}) - if not resource_to_values: - resource_to_values = {} - - if value is not None: - # NOTE: Coerce to string as values are headed into Kubernetes resource model - resource_to_values[resource_type] = str(value) - else: - resource_to_values.pop(resource_type, None) - setattr(resources, requirement.resources_key, resource_to_values) - - self.resources = resources - - @property - def env(self) -> Optional[list[V1EnvVar]]: - return self.obj.env - - def get_environment_variable(self, variable_name: str) -> Optional[str]: - if self.obj.env: - return next( - iter( - v.value or f"valueFrom: {v.value_from}" - for v in cast(Iterable[V1EnvVar], self.obj.env) - if v.name == variable_name - ), - None, - ) - return None - - def set_environment_variable(self, variable_name: str, value: Any) -> None: - # V1EnvVar value type is str so value will be converted eventually. Might as well do it up front - val_str = str(value) - if "valueFrom" in val_str: - raise ValueError("Adjustment of valueFrom variables is not supported yet") - - new_vars: list[V1EnvVar] = self.obj.env or [] - if new_vars: - # Filter out vars with the same name as the ones we are setting - new_vars = [v for v in new_vars if v.name != variable_name] - - new_vars.append(V1EnvVar(name=variable_name, value=val_str)) - self.obj.env = new_vars - - @property - def ports(self) -> List[kubernetes_asyncio.client.V1ContainerPort]: - """ - Return the ports for the Container. - - Returns: - The Container ports. 
- """ - return self.obj.ports or [] - - def __str__(self) -> str: - return str(self.obj) - - def __repr__(self) -> str: - return self.__str__() - - -class HPA(KubernetesModel): - - obj: kubernetes_asyncio.client.V1HorizontalPodAutoscaler - - api_clients: ClassVar[Dict[str, Type]] = { - "preferred": kubernetes_asyncio.client.AutoscalingV1Api, - "autoscaling/v1": kubernetes_asyncio.client.AutoscalingV1Api, - "autoscaling/v2beta1": kubernetes_asyncio.client.AutoscalingV2beta1Api, - "autoscaling/v2beta2": kubernetes_asyncio.client.AutoscalingV2beta2Api, - } - - @classmethod - async def read(cls, name: str, namespace: str) -> "HPA": - """Read the HPA from the cluster under the given namespace. - - Args: - name: The name of the HPA to read. - namespace: The namespace to read the HPA from. - """ - servo.logger.debug(f'reading hpa "{name}" in namespace "{namespace}"') - async with cls.preferred_client() as api_client: - obj = await api_client.read_namespaced_horizontal_pod_autoscaler( - name, namespace - ) - servo.logger.trace(f"read HorizontalPodAutoscaler: {obj}") - return HPA(obj) - - async def create(self, namespace: str = None) -> None: - raise NotImplementedError - - async def patch(self) -> None: - """ - Patches an HPA, applying spec changes to the cluster. - """ - self.logger.info(f'patching HPA "{self.name}"') - async with self.api_client() as api_client: - api_client.api_client.set_default_header( - "content-type", "application/strategic-merge-patch+json" - ) - hpa_result = await api_client.patch_namespaced_horizontal_pod_autoscaler( - name=self.name, - namespace=self.namespace, - body=self.obj, - ) - self.logger.trace(f"patched HPA, spec={hpa_result}") - - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions = None - ) -> kubernetes_asyncio.client.V1Status: - raise NotImplementedError - - async def refresh(self) -> None: - """Refresh the underlying Kubernetes HPA resource.""" - async with self.api_client() as api_client: - self.obj = ( - await api_client.read_namespaced_horizontal_pod_autoscaler_status( - name=self.name, - namespace=self.namespace, - ) - ) - - async def is_ready(self) -> bool: - NotImplementedError - - @property - def target_cpu_utilization_percentage(self) -> int: - return self.obj.spec.target_cpu_utilization_percentage - - @target_cpu_utilization_percentage.setter - def target_cpu_utilization_percentage(self, target: int) -> None: - if not isinstance(target, int): - self.logger.debug(f"got target={target}, attemptint to coerce to int") - target = int(target) - self.obj.spec.target_cpu_utilization_percentage = target - - async def get_cpu_utilization_scaling_threshold(self) -> int: - await self.refresh() - return self.target_cpu_utilization_percentage - - -class Pod(KubernetesModel): - """Wrapper around a Kubernetes `Pod`_ API Object. - - The actual ``kubernetes.client.V1Pod`` instance that this - wraps can be accessed via the ``obj`` instance member. - - This wrapper provides some convenient functionality around the - API Object and provides some state management for the `Pod`_. - - .. _Pod: - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#pod-v1-core - """ - - obj: kubernetes_asyncio.client.V1Pod - - api_clients: ClassVar[Dict[str, Type]] = { - "preferred": kubernetes_asyncio.client.CoreV1Api, - "v1": kubernetes_asyncio.client.CoreV1Api, - } - - @classmethod - async def read(cls, name: str, namespace: str) -> "Pod": - """Read the Pod from the cluster under the given namespace. 
- - Args: - name: The name of the Pod to read. - namespace: The namespace to read the Pod from. - """ - servo.logger.debug(f'reading pod "{name}" in namespace "{namespace}"') - - async with cls.preferred_client() as api_client: - obj = await api_client.read_namespaced_pod_status(name, namespace) - return Pod(obj) - - async def create(self, namespace: str = None) -> None: - """Create the Pod under the given namespace. - - Args: - namespace: The namespace to create the Pod under. - If the Pod was loaded via the kubetest client, the - namespace will already be set, so it is not needed - here. Otherwise, the namespace will need to be provided. - """ - if namespace is None: - namespace = self.namespace - - self.logger.info(f'creating pod "{self.name}" in namespace "{namespace}"') - - async with self.preferred_client() as api_client: - self.obj = await api_client.create_namespaced_pod( - namespace=namespace, - body=self.obj, - ) - - async def patch(self) -> None: - """ - Patches a Pod, applying spec changes to the cluster. - """ - self.logger.info(f'patching pod "{self.name}"') - async with self.api_client() as api_client: - api_client.api_client.set_default_header( - "content-type", "application/strategic-merge-patch+json" - ) - await api_client.patch_namespaced_pod( - name=self.name, - namespace=self.namespace, - body=self.obj, - ) - - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions = None - ) -> kubernetes_asyncio.client.V1Status: - """Delete the Pod. - - This method expects the Pod to have been loaded or otherwise - assigned a namespace already. If it has not, the namespace will - need to be set manually. - - Args: - options: Options for Pod deletion. - - Return: - The status of the delete operation. - """ - if options is None: - options = kubernetes_asyncio.client.V1DeleteOptions() - - self.logger.info(f'deleting pod "{self.name}"') - self.logger.trace(f"delete options: {options}") - - async with self.api_client() as api_client: - return await api_client.delete_namespaced_pod( - name=self.name, - namespace=self.namespace, - body=options, - ) - - async def refresh(self) -> None: - """Refresh the underlying Kubernetes Pod resource.""" - async with self.api_client() as api_client: - self.obj = await api_client.read_namespaced_pod_status( - name=self.name, - namespace=self.namespace, - ) - - async def is_ready(self) -> bool: - """Check if the Pod is in the ready state. - - Returns: - True if in the ready state; False otherwise. - """ - self.logger.trace("refreshing pod status to check is_ready") - await self.refresh() - - # if there is no status, the pod is definitely not ready - status = self.obj.status - self.logger.trace(f"current pod status is {status}") - if status is None: - return False - - # check the pod phase to make sure it is running. a pod in - # the 'failed' or 'success' state will no longer be running, - # so we only care if the pod is in the 'running' state. 
- status.phase - self.logger.trace(f"current pod phase is {status}") - if not status.conditions: - return False - - self.logger.trace(f"checking status conditions {status.conditions}") - for cond in status.conditions: - if cond.reason == "Unschedulable": - return False - - # we only care about the condition type 'ready' - if cond.type.lower() != "ready": - continue - - # check that the readiness condition is True - return cond.status.lower() == "true" - - # Catchall - self.logger.trace(f"unable to find ready=true, continuing to wait...") - return False - - async def _try_get_container_log( - self, - api_client: kubernetes_asyncio.client.CoreV1Api, - container: str, - limit_bytes: int = ONE_MiB, - previous=False, - ) -> str: - """Get logs for a container while handling common error cases (eg. Not Found)""" - try: - return await api_client.read_namespaced_pod_log( - name=self.name, - namespace=self.namespace, - container=container, - limit_bytes=limit_bytes, - previous=previous, - ) - except kubernetes_asyncio.client.exceptions.ApiException as ae: - if ae.status == 400: - ae.data = ae.body - status: kubernetes_asyncio.client.models.V1Status = ( - api_client.api_client.deserialize(ae, "V1Status") - ) - if (status.message or "").endswith("not found"): - return "Logs not found" - - raise - - async def get_logs_for_container_statuses( - self, - container_statuses: list[V1ContainerStatus], - limit_bytes: int = ONE_MiB, - logs_selector: ContainerLogOptions = ContainerLogOptions.both, - ) -> list[str]: - """ - Get container logs from the current pod for the container's whose statuses are provided in the list - - Args: - container_statuses (list[V1ContainerStatus]): The name of the Container. - limit_bytes (int): Maximum bytes to provide per log (NOTE: this will be 2x per container ) - logs_selector (ContainerLogOptions): "previous", "current", or "both" - - Returns: - list[str]: List of logs per container in the same order as the list of container_statuses - """ - api_client: kubernetes_asyncio.client.CoreV1Api - async with self.api_client() as api_client: - read_logs_partial = functools.partial( - self._try_get_container_log, - api_client=api_client, - limit_bytes=limit_bytes, - ) - if logs_selector == ContainerLogOptions.both: - return [ - f"previous (crash):\n {await read_logs_partial(container=cs.name, previous=True)} \n\n--- \n\n" - f"current (latest):\n {await read_logs_partial(container=cs.name, previous=False)}" - for cs in container_statuses - ] - else: - previous = logs_selector == ContainerLogOptions.previous - return [ - await read_logs_partial(container=cs.name, previous=previous) - for cs in container_statuses - ] - - async def raise_for_status( - self, adjustments: List[servo.Adjustment], include_container_logs=False - ) -> None: - """Raise an exception if the Pod status is not not ready.""" - # NOTE: operate off of current state, assuming you have checked is_ready() - status = self.obj.status - self.logger.trace(f"current pod status is {status}") - if status is None: - raise RuntimeError(f"No such pod: {self.name}") - - # check the pod phase to make sure it is running. a pod in - # the 'failed' or 'success' state will no longer be running, - # so we only care if the pod is in the 'running' state. 
- # phase = status.phase - if not status.conditions: - raise RuntimeError(f"Pod is not running: {self.name}") - - self.logger.trace(f"checking container statuses: {status.container_statuses}") - if status.container_statuses: - for cont_stat in status.container_statuses: - if ( - cont_stat.state - and cont_stat.state.waiting - and cont_stat.state.waiting.reason - in ["ImagePullBackOff", "ErrImagePull"] - ): - raise servo.AdjustmentFailedError( - "Container image pull failure detected", - reason="image-pull-failed", - ) - - restarted_container_statuses: List[V1ContainerStatus] = [ - cont_stat - for cont_stat in status.container_statuses or [] - if cont_stat.restart_count > 0 - ] - if restarted_container_statuses: - container_logs: list[str] = [ - "DISABLED" for _ in restarted_container_statuses - ] - if include_container_logs: # TODO enable logs config on per container basis - container_logs = await self.get_logs_for_container_statuses( - restarted_container_statuses - ) - container_messages = [ - ( - f"{cont_stat.name} x{cont_stat.restart_count}" - f"{'' if not include_container_logs else f' container logs {container_logs[idx]}'}" - ) - for idx, cont_stat in enumerate(restarted_container_statuses) - ] - raise servo.AdjustmentRejectedError( - # NOTE: cant use f-string with newline (backslash) insertion - ( - f"Tuning optimization {self.name} crash restart detected on container(s): " - + ", \n".join(container_messages) - ), - reason="unstable", - ) - - self.logger.trace(f"checking status conditions {status.conditions}") - for cond in status.conditions: - if cond.reason == "Unschedulable": - # FIXME: The servo rejected error should be raised further out. This should be a generic scheduling error - unschedulable_adjustments = list( - filter(lambda a: a.setting_name in cond.message, adjustments) - ) - raise servo.AdjustmentRejectedError( - f"Requested adjustment(s) ({', '.join(map(str, unschedulable_adjustments))}) cannot be scheduled due to \"{cond.message}\"", - reason="unschedulable", - ) - - if cond.type == "Ready" and cond.status == "False": - rejection_message = cond.message - if include_container_logs and cond.reason == "ContainersNotReady": - unready_container_statuses: List[V1ContainerStatus] = [ - cont_stat - for cont_stat in status.container_statuses or [] - if not cont_stat.ready - ] - container_logs = await self.get_logs_for_container_statuses( - unready_container_statuses - ) - # NOTE: cant use f-string with newline (backslash) insertion - rejection_message = ( - f"{rejection_message} container logs " - + "\n\n--- \n\n".join(container_logs) - ) - raise servo.AdjustmentRejectedError( - f"(reason {cond.reason}) {rejection_message}", reason="start-failed" - ) - - # we only care about the condition type 'ready' - if cond.type.lower() != "ready": - continue - - # check that the readiness condition is True - if cond.status.lower() == "true": - return - - # Catchall - self.logger.trace(f"unable to find ready=true, continuing to wait...") - raise RuntimeError(f"Unknown Pod status for '{self.name}': {status}") - - async def get_status(self) -> kubernetes_asyncio.client.V1PodStatus: - """Get the status of the Pod. - - Returns: - The status of the Pod. - """ - # first, refresh the pod state to ensure latest status - await self.refresh() - - # return the status of the pod - return cast(kubernetes_asyncio.client.V1PodStatus, self.obj.status) - - @property - def containers(self) -> List[Container]: - """ - Return a list of Container objects from the underlying pod template spec. 
- """ - return list(map(lambda c: Container(c, self), self.obj.spec.containers)) - - async def get_containers(self) -> List[Container]: - """Get the Pod's containers. - - Returns: - A list of containers that belong to the Pod. - """ - self.logger.debug(f'getting containers for pod "{self.name}"') - await self.refresh() - - return self.containers - - def get_container(self, name: str) -> Union[Container, None]: - """Get a container in the Pod by name. - - Args: - name (str): The name of the Container. - - Returns: - Container: The Pod's Container with the matching name. If - no container with the given name is found, ``None`` is returned. - """ - return next(filter(lambda c: c.name == name, self.containers), None) - - async def get_restart_count(self) -> int: - """Get the total number of Container restarts for the Pod. - - Returns: - The total number of Container restarts. - """ - await self.refresh() - return self.restart_count - - @property - def restart_count(self) -> int: - if self.obj.status is None or self.obj.status.container_statuses is None: - return 0 - - total = 0 - for container_status in self.obj.status.container_statuses: - total += container_status.restart_count - - return total - - async def containers_started(self) -> bool: - """Check if the Pod's Containers have all started. - - Returns: - True if all Containers have started; False otherwise. - """ - # start the flag as true - we will check the state and set - # this to False if any container is not yet running. - containers_started = True - - status = await self.get_status() - if status.container_statuses is not None: - for container_status in status.container_statuses: - if container_status.state is not None: - if container_status.state.running is not None: - if container_status.state.running.started_at is not None: - # The container is started, so move on to check the - # next container - continue - # If we get here, then the container has not started. - containers_started = containers_started and False - break - - return containers_started - - def uid(self) -> str: - """ - Gets the UID for the Pod. - - UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids # noqa: E501 - """ - return self.obj.metadata.uid - - -class Service(KubernetesModel): - """Kubetest wrapper around a Kubernetes `Service`_ API Object. - - The actual ``kubernetes.client.V1Service`` instance that this - wraps can be accessed via the ``obj`` instance member. - - This wrapper provides some convenient functionality around the - API Object and provides some state management for the `Service`_. - - .. _Service: - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#service-v1-core - """ - - obj: kubernetes_asyncio.client.V1Service - - api_clients: ClassVar[Dict[str, Type]] = { - "preferred": kubernetes_asyncio.client.CoreV1Api, - "v1": kubernetes_asyncio.client.CoreV1Api, - } - - @classmethod - async def read(cls, name: str, namespace: str) -> "Service": - """Read the Service from the cluster under the given namespace. - - Args: - name: The name of the Service to read. - namespace: The namespace to read the Service from. 
- """ - servo.logger.trace(f'reading service "{name}" in namespace "{namespace}"') - - async with cls.preferred_client() as api_client: - obj = await api_client.read_namespaced_service(name, namespace) - servo.logger.trace("service: ", obj) - return Service(obj) - - async def create(self, namespace: str = None) -> None: - """Creates the Service under the given namespace. - - Args: - namespace: The namespace to create the Service under. - If the Service was loaded via the kubetest client, the - namespace will already be set, so it is not needed here. - Otherwise, the namespace will need to be provided. - """ - if namespace is None: - namespace = self.namespace - - self.logger.info( - f'creating service "{self.name}" in namespace "{self.namespace}"' - ) - - async with self.api_client() as api_client: - self.obj = await api_client.create_namespaced_service( - namespace=namespace, - body=self.obj, - ) - - async def patch(self) -> None: - """ - TODO: Add docs.... - """ - async with self.api_client() as api_client: - api_client.api_client.set_default_header( - "content-type", "application/strategic-merge-patch+json" - ) - await api_client.patch_namespaced_service( - name=self.name, - namespace=self.namespace, - body=self.obj, - ) - - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions = None - ) -> kubernetes_asyncio.client.V1Status: - """Deletes the Service. - - This method expects the Service to have been loaded or otherwise - assigned a namespace already. If it has not, the namespace will need - to be set manually. - - Args: - options: Options for Service deletion. - - Returns: - The status of the delete operation. - """ - if options is None: - options = kubernetes_asyncio.client.V1DeleteOptions() - - self.logger.info(f'deleting service "{self.name}"') - self.logger.debug(f"delete options: {options}") - - async with self.api_client() as api_client: - return await api_client.delete_namespaced_service( - name=self.name, - namespace=self.namespace, - body=options, - ) - - async def refresh(self) -> None: - """Refresh the underlying Kubernetes Service resource.""" - async with self.api_client() as api_client: - self.obj = await api_client.read_namespaced_service( - name=self.name, - namespace=self.namespace, - ) - - async def is_ready(self) -> bool: - """Check if the Service is in the ready state. - - The readiness state is not clearly available from the Service - status, so to see whether or not the Service is ready this - will check whether the endpoints of the Service are ready. - - This comes with the caveat that in order for a Service to - have endpoints, there needs to be some backend hooked up to it. - If there is no backend, the Service will never have endpoints, - so this will never resolve to True. - - Returns: - True if in the ready state; False otherwise. - """ - await self.refresh() - - # check the status. if there is no status, the service is - # definitely not ready. - if self.obj.status is None: - return False - - endpoints = await self.get_endpoints() - - # if the Service has no endpoints, its not ready. - if len(endpoints) == 0: - return False - - # get the service endpoints and check that they are all ready. - for endpoint in endpoints: - # if we have an endpoint, but there are no subsets, we - # consider the endpoint to be not ready. 
- if endpoint.subsets is None: - return False - - for subset in endpoint.subsets: - # if the endpoint has no addresses setup yet, its not ready - if subset.addresses is None or len(subset.addresses) == 0: - return False - - # if there are still addresses that are not ready, the - # service is not ready - not_ready = subset.not_ready_addresses - if not_ready is not None and len(not_ready) > 0: - return False - - # if we got here, then all endpoints are ready, so the service - # must also be ready - return True - - @property - def status(self) -> kubernetes_asyncio.client.V1ServiceStatus: - return self.obj.status - - async def get_status(self) -> kubernetes_asyncio.client.V1ServiceStatus: - """Get the status of the Service. - - Returns: - The status of the Service. - """ - self.logger.info(f'checking status of service "{self.name}"') - # first, refresh the service state to ensure the latest status - await self.refresh() - - # return the status from the service - return self.obj.status - - @property - def ports(self) -> List[kubernetes_asyncio.client.V1ServicePort]: - """Return the list of ports exposed by the service.""" - return self.obj.spec.ports - - def find_port( - self, selector: Union[str, int] - ) -> Optional[kubernetes_asyncio.client.V1ServicePort]: - for port in self.ports: - if isinstance(selector, str): - if port.name == selector: - return port - elif isinstance(selector, int): - if port.port == selector: - return port - else: - raise TypeError( - f"Unknown port selector type '{selector.__class__.__name__}': {selector}" - ) - - return None - - async def get_endpoints(self) -> List[kubernetes_asyncio.client.V1Endpoints]: - """Get the endpoints for the Service. - - This can be useful for checking internal IP addresses used - in containers, e.g. for container auto-discovery. - - Returns: - A list of endpoints associated with the Service. - """ - self.logger.info(f'getting endpoints for service "{self.name}"') - async with self.api_client() as api_client: - endpoints = await api_client.list_namespaced_endpoints( - namespace=self.namespace, - ) - - svc_endpoints = [] - for endpoint in endpoints.items: - # filter to include only the endpoints with the same - # name as the service. - if endpoint.metadata.name == self.name: - svc_endpoints.append(endpoint) - - self.logger.debug(f"endpoints: {svc_endpoints}") - return svc_endpoints - - async def _proxy_http_request(self, method, path, **kwargs) -> tuple: - """Template request to proxy of a Service. - - Args: - method: The http request method e.g. 'GET', 'POST' etc. - path: The URI path for the request. - kwargs: Keyword arguments for the proxy_http_get function. - - Returns: - The response data - """ - path_params = { - "name": f"{self.name}:{self.obj.spec.ports[0].port}", - "namespace": self.namespace, - "path": path, - } - return await kubernetes_asyncio.client.CoreV1Api().api_client.call_api( - "/api/v1/namespaces/{namespace}/services/{name}/proxy/{path}", - method, - path_params=path_params, - **kwargs, - ) - - async def proxy_http_get(self, path: str, **kwargs) -> tuple: - """Issue a GET request to proxy of a Service. - - Args: - path: The URI path for the request. - kwargs: Keyword arguments for the proxy_http_get function. - - Returns: - The response data - """ - return await self._proxy_http_request("GET", path, **kwargs) - - async def proxy_http_post(self, path: str, **kwargs) -> tuple: - """Issue a POST request to proxy of a Service. - - Args: - path: The URI path for the request. 
- kwargs: Keyword arguments for the proxy_http_post function. - - Returns: - The response data - """ - return await self._proxy_http_request("POST", path, **kwargs) - - @property - def selector(self) -> Dict[str, str]: - return self.obj.spec.selector - - async def get_pods(self) -> List[Pod]: - """Get the pods that the Service is routing traffic to. - - Returns: - A list of pods that the service is routing traffic to. - """ - self.logger.debug(f'getting pods for service "{self.name}"') - - async with Pod.preferred_client() as api_client: - self.obj.spec.selector.match_labels - pod_list: kubernetes_asyncio.client.V1PodList = ( - await api_client.list_namespaced_pod( - namespace=self.namespace, - label_selector=selector_string(self.selector), - ) - ) - - pods = [Pod(p) for p in pod_list.items] - return pods - - -class WatchTimeoutError(Exception): - """The kubernetes watch timeout has elapsed. The api client raises no error - on timeout expiration so this should be raised in fall-through logic. - """ - - -class Deployment(KubernetesModel): - """Kubetest wrapper around a Kubernetes `Deployment`_ API Object. - - The actual ``kubernetes.client.V1Deployment`` instance that this - wraps can be accessed via the ``obj`` instance member. - - This wrapper provides some convenient functionality around the - API Object and provides some state management for the `Deployment`_. - - .. _Deployment: - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#deployment-v1-apps - """ - - obj: kubernetes_asyncio.client.V1Deployment - api_clients: ClassVar[Dict[str, Type]] = { - "preferred": kubernetes_asyncio.client.AppsV1Api, - "apps/v1": kubernetes_asyncio.client.AppsV1Api, - "apps/v1beta1": kubernetes_asyncio.client.AppsV1beta1Api, - "apps/v1beta2": kubernetes_asyncio.client.AppsV1beta2Api, - } - status_type: Type = kubernetes_asyncio.client.V1DeploymentStatus - - @contextlib.asynccontextmanager - async def create_method( - self, - ) -> AsyncContextManager: - async with self.api_client() as api_client: - yield api_client.create_namespaced_deployment - - @contextlib.asynccontextmanager - async def patch_method( - self, - api_client_default_headers: Optional[dict[str, str]] = { - "content-type": "application/strategic-merge-patch+json" - }, - ) -> AsyncContextManager: - async with self.api_client() as api_client: - # TODO: move up to baser class helper method - for k, v in (api_client_default_headers or {}).items(): - api_client.api_client.set_default_header(k, v) - - yield api_client.patch_namespaced_deployment - - @contextlib.asynccontextmanager - async def replace_method( - self, - ) -> AsyncContextManager: - async with self.api_client() as api_client: - yield api_client.replace_namespaced_deployment - - @contextlib.asynccontextmanager - async def delete_method(self) -> AsyncContextManager: - async with self.api_client() as api_client: - yield api_client.delete_namespaced_deployment - - @classmethod - def list_method(cls, api_client) -> Coroutine: - # TODO maybe refactor to use self.api_client like other methods - # NOTE I'm resisting the urge to refactor rollout(). Lets keep the instability surface minimal - return api_client.list_namespaced_deployment - - # Moved up additional props being shadowed for clarity - @property - def status(self) -> kubernetes_asyncio.client.V1DeploymentStatus: - """Return the status of the Deployment. - - Returns: - The status of the Deployment. 
- """ - return cast(kubernetes_asyncio.client.V1DeploymentStatus, self.obj.status) - - @property - def unavailable_replicas(self) -> int: - # NOTE this field is N/A for StatefulSets unless the MaxUnavailableStatefulSet flag is enabled - return self.status.unavailable_replicas - - async def create(self, namespace: str = None) -> None: - """Create the Deployment under the given namespace. - - Args: - namespace: The namespace to create the Deployment under. - If the Deployment was loaded via the kubetest client, the - namespace will already be set, so it is not needed here. - Otherwise, the namespace will need to be provided. - """ - if namespace is None: - namespace = self.namespace - - # TODO: add debug or trace loggers to other CRUD methods - self.logger.info( - f'creating {self.__class__.__name__} "{self.name}" in namespace "{self.namespace}"' - ) - - async with self.create_method() as create_method: - self.obj = await create_method( - namespace=namespace, - body=self.obj, - ) - - @classmethod - async def read(cls, name: str, namespace: str) -> "Deployment": - """Read a Deployment by name under the given namespace. - - Args: - name: The name of the Deployment to read. - namespace: The namespace to read the Deployment from. - """ - - async with cls.preferred_client() as api_client: - obj = await api_client.read_namespaced_deployment(name, namespace) - return Deployment( - obj - ) # TODO, dont always need to construct whole class from a read method - - async def patch(self) -> None: - """Update the changed attributes of the Deployment.""" - async with self.patch_method() as patch_method: - self.obj = await patch_method( - name=self.name, namespace=self.namespace, body=self.obj - ) - - async def replace(self) -> None: - """Update the changed attributes of the Deployment.""" - async with self.replace_method() as replace_method: - self.obj = await replace_method( - name=self.name, namespace=self.namespace, body=self.obj - ) - - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions = None - ) -> kubernetes_asyncio.client.V1Status: - """Delete the Deployment. - - This method expects the Deployment to have been loaded or otherwise - assigned a namespace already. If it has not, the namespace will need - to be set manually. - - Args: - options: Options for Deployment deletion. - - Returns: - The status of the delete operation. - """ - if options is None: - options = kubernetes_asyncio.client.V1DeleteOptions() - - self.logger.info(f'deleting {self.__class__.__name__} "{self.name}"') - self.logger.debug(f"delete options: {options}") - - async with self.delete_method() as delete_method: - return await delete_method( - name=self.name, - namespace=self.namespace, - body=options, - ) - - async def scale_to_zero(self) -> None: - """This is used as a "soft" 'delete'/'destroy'. - Since the Deployment object is used as a wrapper around an existing k8s object that we did not create, - it shouldn't be destroyed. Instead, the deployments pods are destroyed by scaling it to 0 replicas. 
- """ - - await self.refresh() - self.replicas = 0 - await self.patch() - - async def refresh(self) -> None: - """Refresh the underlying Kubernetes Deployment resource.""" - async with self.api_client() as api_client: - self.obj = ( - await self.read( - name=self.name, - namespace=self.namespace, - ) - ).obj - - async def rollback(self) -> None: - """Roll back an unstable Deployment revision to a previous version.""" - async with kubernetes_asyncio.client.api_client.ApiClient() as api: - api_client = kubernetes_asyncio.client.ExtensionsV1beta1Api(api) - self.obj = await api_client.create_namespaced_deployment_rollback( - name=self.name, - namespace=self.namespace, - body=self.obj, - ) - - async def get_status( - self, - ) -> kubernetes_asyncio.client.V1DeploymentStatus: # TODO is actually self.status_type - """Get the status of the Deployment. - - Returns: - The status of the Deployment. - """ - self.logger.info(f'checking status of deployment "{self.name}"') - # first, refresh the deployment state to ensure the latest status - await self.refresh() - - # return the status from the deployment - return cast(self.status_type, self.obj.status) - - async def get_pods(self) -> List[Pod]: - """Get the pods for the Deployment. - - Returns: - A list of pods that belong to the deployment. - """ - self.logger.debug(f'getting pods for {self.__class__.__name__} "{self.name}"') - - async with Pod.preferred_client() as api_client: - label_selector = self.match_labels - pod_list: kubernetes_asyncio.client.V1PodList = ( - await api_client.list_namespaced_pod( - namespace=self.namespace, - label_selector=selector_string(label_selector), - ) - ) - - pods = [Pod(p) for p in pod_list.items] - return pods - - async def get_latest_pods(self) -> List[Pod]: - """Get only the Deployment pods that belong to the latest ResourceVersion. - - Returns: - A list of pods that belong to the latest deployment replicaset. - """ - self.logger.trace( - f'getting replicaset for {self.__class__.__name__} "{self.name}"' - ) - async with self.api_client() as api_client: - label_selector = self.obj.spec.selector.match_labels - rs_list: kubernetes_asyncio.client.V1ReplicasetList = ( - await api_client.list_namespaced_replica_set( - namespace=self.namespace, - label_selector=selector_string(label_selector), - ) - ) - - # Verify all returned RS have this deployment as an owner - rs_list = [ - rs - for rs in rs_list.items - if rs.metadata.owner_references - and any( - ownRef.kind == "Deployment" and ownRef.uid == self.obj.metadata.uid - for ownRef in rs.metadata.owner_references - ) - ] - if not rs_list: - raise servo.ConnectorError( - f'Unable to locate replicaset(s) for deployment "{self.name}"' - ) - if missing_revision_rsets := list( - filter( - lambda rs: "deployment.kubernetes.io/revision" - not in rs.metadata.annotations, - rs_list, - ) - ): - raise servo.ConnectorError( - f'Unable to determine latest replicaset for deployment "{self.name}" due to missing revision annotation in replicaset(s)' - f' "{", ".join(list(map(lambda rs: rs.metadata.name, missing_revision_rsets)))}"' - ) - latest_rs = sorted( - rs_list, - key=lambda rs: int( - rs.metadata.annotations["deployment.kubernetes.io/revision"] - ), - reverse=True, - )[0] - - return [ - pod - for pod in await self.get_pods() - if any( - ownRef.kind == "ReplicaSet" and ownRef.uid == latest_rs.metadata.uid - for ownRef in pod.obj.metadata.owner_references - ) - ] - - @property - def resource_version(self) -> str: - """ - Returns the resource version of the Deployment. 
- """ - return self.obj.metadata.resource_version - - @property - def observed_generation(self) -> str: - """ - Returns the observed generation of the Deployment status. - - The generation is observed by the deployment controller. - """ - return self.obj.status.observed_generation - - async def is_ready(self) -> bool: - """Check if the Deployment is in the ready state. - - Returns: - True if in the ready state; False otherwise. - """ - await self.refresh() - - # if there is no status, the deployment is definitely not ready - status = self.obj.status - if status is None: - return False - - # check the status for the number of total replicas and compare - # it to the number of ready replicas. if the numbers are - # equal, the deployment is ready; otherwise it is not ready. - total = status.replicas - ready = status.ready_replicas - - if total is None: - return False - - return total == ready - - @property - def containers(self) -> List[Container]: - """ - Return a list of Container objects from the underlying pod template spec. - """ - return list( - map(lambda c: Container(c, None), self.obj.spec.template.spec.containers) - ) - - def find_container(self, name: str) -> Optional[Container]: - """ - Return the container with the given name. - """ - return next(filter(lambda c: c.name == name, self.containers), None) - - # TODO 86 this function - async def get_target_container( - self, config: ContainerConfiguration - ) -> Optional[Container]: - """Return the container targeted by the supplied configuration""" - return self.find_container(config.name) - - def set_container(self, name: str, container: Container) -> None: - """Set the container with the given name to a new value.""" - # TODO make this pythonic and support append use case - index = next( - filter( - lambda i: self.containers[i].name == name, range(len(self.containers)) - ) - ) - self.containers[index] = container - self.obj.spec.template.spec.containers[index] = container.obj - - def remove_container(self, name: str) -> Optional[Container]: - """Set the container with the given name to a new value.""" - index = next( - filter( - lambda i: self.containers[i].name == name, range(len(self.containers)) - ), - None, - ) - if index is not None: - return Container(self.obj.spec.template.spec.containers.pop(index), None) - - return None - - @property - def replicas(self) -> int: - """ - Return the number of desired pods. - """ - return self.obj.spec.replicas - - @replicas.setter - def replicas(self, replicas: int) -> None: - """ - Set the number of desired pods. - """ - self.obj.spec.replicas = replicas - - @property - def field_selector(self) -> str: - """ - Return a string for matching the Deployment fields in Kubernetes API calls. - """ - return selector_string( - { - "metadata.name": self.name, - } - ) - - @property - def match_labels(self) -> Dict[str, str]: - """Return the matchLabels dict of the selector field""" - return self.obj.spec.selector.match_labels - - @property - def label_selector(self) -> Optional[str]: - """ - Return a string for matching the Deployment in Kubernetes API calls. 
- """ - if not self.obj.metadata.labels: - return None - - return selector_string(self.obj.metadata.labels) - - # TODO: I need to model these two and add label/annotation helpers - @property - def pod_template_spec(self) -> kubernetes_asyncio.client.models.V1PodTemplateSpec: - """Return the pod template spec for instances of the Deployment.""" - return self.obj.spec.template - - async def get_pod_template_spec_copy( - self, - ) -> kubernetes_asyncio.client.models.V1PodTemplateSpec: - """Return a deep copy of the pod template spec. Eg. for creation of a tuning pod""" - return copy.deepcopy(self.pod_template_spec) - - # TODO remove this boilerplate that arose from the... interesting demands that arose during code review - def update_pod( - self, pod: kubernetes_asyncio.client.models.V1Pod - ) -> kubernetes_asyncio.client.models.V1Pod: - """Update the pod with the latest state of the controller if needed""" - # NOTE: Deployment currently needs no updating - return pod - - @property - def pod_spec(self) -> kubernetes_asyncio.client.models.V1PodSpec: - """Return the pod spec for instances of the Deployment.""" - return self.pod_template_spec.spec - - # TODO figure out what triggered the need for backoff and fix it more elegantly - @backoff.on_exception( - backoff.expo, kubernetes_asyncio.client.exceptions.ApiException, max_tries=3 - ) - async def inject_sidecar( - self, - name: str, - image: str, - *, - service: Optional[str] = None, - port: Optional[int] = None, - index: Optional[int] = None, - service_port: int = 9980, - ) -> None: - """ - Injects an Envoy sidecar into a target Deployment that proxies a service - or literal TCP port, generating scrapeable metrics usable for optimization. - - The service or port argument must be provided to define how traffic is proxied - between the Envoy sidecar and the container responsible for fulfilling the request. - - Args: - name: The name of the sidecar to inject. - image: The container image for the sidecar container. - deployment: Name of the target Deployment to inject the sidecar into. - service: Name of the service to proxy. Envoy will accept ingress traffic - on the service port and reverse proxy requests back to the original - target container. - port: The name or number of a port within the Deployment to wrap the proxy around. - index: The index at which to insert the sidecar container. When `None`, the sidecar is appended. - service_port: The port to receive ingress traffic from an upstream service. 
- """ - - await self.refresh() - - if not (service or port): - raise ValueError(f"a service or port must be given") - - if isinstance(port, str) and port.isdigit(): - port = int(port) - - # check for a port conflict - container_ports = list( - itertools.chain(*map(operator.attrgetter("ports"), self.containers)) - ) - if service_port in list( - map(operator.attrgetter("container_port"), container_ports) - ): - raise ValueError( - f"Port conflict: {self.__class__.__name__} '{self.name}' already exposes port {service_port} through an existing container" - ) - - # lookup the port on the target service - if service: - try: - service_obj = await Service.read(service, self.namespace) - except kubernetes_asyncio.client.exceptions.ApiException as error: - if error.status == 404: - raise ValueError(f"Unknown Service '{service}'") from error - else: - raise error - if not port: - port_count = len(service_obj.obj.spec.ports) - if port_count == 0: - raise ValueError( - f"Target Service '{service}' does not expose any ports" - ) - elif port_count > 1: - raise ValueError( - f"Target Service '{service}' exposes multiple ports -- target port must be specified" - ) - port_obj = service_obj.obj.spec.ports[0] - else: - if isinstance(port, int): - port_obj = next( - filter(lambda p: p.port == port, service_obj.obj.spec.ports), - None, - ) - elif isinstance(port, str): - port_obj = next( - filter(lambda p: p.name == port, service_obj.obj.spec.ports), - None, - ) - else: - raise TypeError( - f"Unable to resolve port value of type {port.__class__} (port={port})" - ) - - if not port_obj: - raise ValueError( - f"Port '{port}' does not exist in the Service '{service}'" - ) - - # resolve symbolic name in the service target port to a concrete container port - if isinstance(port_obj.target_port, str): - container_port_obj = next( - filter(lambda p: p.name == port_obj.target_port, container_ports), - None, - ) - if not container_port_obj: - raise ValueError( - f"Port '{port_obj.target_port}' could not be resolved to a destination container port" - ) - - container_port = container_port_obj.container_port - else: - container_port = port_obj.target_port - - else: - # find the container port - container_port_obj = next( - filter(lambda p: p.container_port == port, container_ports), None - ) - if not container_port_obj: - raise ValueError( - f"Port '{port}' could not be resolved to a destination container port" - ) - - container_port = container_port_obj.container_port - - # build the sidecar container - container = kubernetes_asyncio.client.V1Container( - name=name, - image=image, - image_pull_policy="IfNotPresent", - resources=kubernetes_asyncio.client.V1ResourceRequirements( - requests={"cpu": "125m", "memory": "128Mi"}, - limits={"cpu": "250m", "memory": "256Mi"}, - ), - env=[ - kubernetes_asyncio.client.V1EnvVar( - name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value=str(service_port) - ), - kubernetes_asyncio.client.V1EnvVar( - name="OPSANI_ENVOY_PROXIED_CONTAINER_PORT", - value=str(container_port), - ), - kubernetes_asyncio.client.V1EnvVar( - name="OPSANI_ENVOY_PROXY_METRICS_PORT", value="9901" - ), - ], - ports=[ - kubernetes_asyncio.client.V1ContainerPort( - name="opsani-proxy", container_port=service_port - ), - kubernetes_asyncio.client.V1ContainerPort( - name="opsani-metrics", container_port=9901 - ), - ], - ) - - # add the sidecar to the Deployment - if index is None: - self.obj.spec.template.spec.containers.append(container) - else: - self.obj.spec.template.spec.containers.insert(index, container) - - # patch the 
deployment - await self.patch() - - async def eject_sidecar(self, name: str) -> bool: - """Eject an Envoy sidecar from the Deployment. - - Returns True if the sidecar was ejected. - """ - await self.refresh() - container = self.remove_container(name) - if container: - await self.replace() - return True - - return False - - @contextlib.asynccontextmanager - async def rollout( - self, *, timeout: Optional[servo.DurationDescriptor] = None - ) -> None: - """Asynchronously wait for changes to a deployment to roll out to the cluster.""" - # NOTE: The timeout_seconds argument must be an int or the request will fail - timeout_seconds = ( - int(servo.Duration(timeout).total_seconds()) if timeout else None - ) - - # Resource version lets us track any change. Observed generation only increments - # when the deployment controller sees a significant change that requires rollout - resource_version = self.resource_version - observed_generation = self.status.observed_generation - desired_replicas = self.replicas - - self.logger.info( - f"applying adjustments to {self.__class__.__name__} '{self.name}' and rolling out to cluster" - ) - - # Yield to let the changes be made - yield self - - # Return fast if nothing was changed - if self.resource_version == resource_version: - self.logger.info( - f"adjustments applied to {self.__class__.__name__} '{self.name}' made no changes, continuing" - ) - return - - # Create a Kubernetes watch against the deployment under optimization to track changes - self.logger.debug( - f"watching {self.__class__.__name__} Using label_selector={self.label_selector}, resource_version={resource_version}" - ) - - async with kubernetes_asyncio.client.api_client.ApiClient() as api: - v1 = kubernetes_asyncio.client.AppsV1Api(api) - async with kubernetes_asyncio.watch.Watch().stream( - self.list_method(v1), - namespace=self.namespace, - field_selector=self.field_selector, - label_selector=self.label_selector, - timeout_seconds=timeout_seconds, - ) as stream: - async for event in stream: - # NOTE: Event types are ADDED, DELETED, MODIFIED, ERROR - # TODO: Create an enum... - event_type, deployment = event["type"], event["object"] - status: self.status_type = deployment.status - - self.logger.debug( - f"{self.__class__.__name__} watch yielded event: {event_type} {deployment.kind} {deployment.metadata.name}" - f" in {deployment.metadata.namespace}: {status}" - ) - - if event_type == "ERROR": - stream.stop() - # FIXME: Not sure what types we expect here - raise servo.AdjustmentRejectedError( - str(deployment), reason="start-failed" - ) - - # Check that the conditions aren't reporting a failure, raises exception if failure detected - # NOTE: conditions are never set on stateful_set - if status.conditions: - self._check_conditions(status.conditions) - - # Early events in the watch may be against previous generation - if status.observed_generation == observed_generation: - self.logger.debug( - "observed generation has not changed, continuing watch" - ) - continue - - # Check the replica counts. 
Once available, updated, and ready match - # our expected count and the unavailable count is zero we are rolled out - if unavailable_count := self.unavailable_replicas: - self.logger.debug( - "found unavailable replicas, continuing watch", - unavailable_count, - ) - continue - - replica_counts: list[int] = [ - status.replicas, - status.ready_replicas, - status.updated_replicas, - ] - # NOTE: available counts is not always present on StatefulSets, assumedly due to the - # beta status of minReadySeconds https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#minimum-ready-seconds - if ( - available_replicas := getattr( - status, "available_replicas", None - ) - ) is not None: - replica_counts.append(available_replicas) - if replica_counts.count(desired_replicas) == len(replica_counts): - # We are done: all the counts match. Stop the watch and return - self.logger.success( - f"adjustments to {self.__class__.__name__} '{self.name}' rolled out successfully", - status, - ) - stream.stop() - return - - # watch doesn't raise a timeoutError when when elapsed, treat fall through as timeout - raise WatchTimeoutError() - - def _check_conditions( - self, - conditions: List[ - # TODO update type hint in refactor - # NOTE the only variation from StatefulSet is last_update_time being present exclusively on DeploymentStatus but - # said property is not used in the condition checking logic - kubernetes_asyncio.client.V1DeploymentCondition - ], - ) -> None: - for condition in conditions: - if condition.type == "Available": - if condition.status == "True": - # If we hit on this and have not raised yet we are good to go - break - elif condition.status in ("False", "Unknown"): - # Condition has not yet been met, log status and continue monitoring - self.logger.debug( - f"Condition({condition.type}).status == '{condition.status}' ({condition.reason}): {condition.message}" - ) - else: - raise servo.AdjustmentFailedError( - f"encountered unexpected Condition status '{condition.status}'" - ) - - elif condition.type == "ReplicaFailure": - # TODO: Check what this error looks like - raise servo.AdjustmentRejectedError( - f"ReplicaFailure: message='{condition.status.message}', reason='{condition.status.reason}'", - reason="start-failed", - ) - - elif condition.type == "Progressing": - if condition.status in ("True", "Unknown"): - # Still working - self.logger.debug( - f"{self.__class__.__name__} update is progressing", condition - ) - break - elif condition.status == "False": - raise servo.AdjustmentRejectedError( - f"ProgressionFailure: message='{condition.status.message}', reason='{condition.status.reason}'", - reason="start-failed", - ) - else: - raise servo.AdjustmentFailedError( - f"unknown {self.__class__.__name__} status condition: {condition.status}" - ) - - async def raise_for_status( - self, adjustments: List[servo.Adjustment], include_container_logs=False - ) -> None: - # NOTE: operate off of current state, assuming you have checked is_ready() - status = self.status - self.logger.trace(f"current {self.__class__.__name__} status is {status}") - if status is None: - raise RuntimeError(f"No such {self.__class__.__name__}: {self.name}") - - if not status.conditions: - raise RuntimeError(f"{self.__class__.__name__} is not running: {self.name}") - - # Check for failure conditions - self._check_conditions(status.conditions) - await self.raise_for_failed_pod_adjustments( - adjustments=adjustments, include_container_logs=include_container_logs - ) - - # Catchall - self.logger.trace( - f"unable to map 
{self.__class__.__name__} status to exception. Deployment: {self.obj}" - ) - raise RuntimeError( - f"Unknown {self.__class__.__name__} status for '{self.name}': {status}" - ) - - async def raise_for_failed_pod_adjustments( - self, adjustments: List[servo.Adjustment], include_container_logs=False - ): - pods = await self.get_latest_pods() - self.logger.trace( - f"latest pod(s) status {list(map(lambda p: p.obj.status, pods))}" - ) - unschedulable_pods = [ - pod - for pod in pods - if pod.obj.status.conditions - and any( - cond.reason == "Unschedulable" for cond in pod.obj.status.conditions - ) - ] - if unschedulable_pods: - pod_messages = [] - for pod in unschedulable_pods: - cond_msgs = [] - for unschedulable_condition in filter( - lambda cond: cond.reason == "Unschedulable", - pod.obj.status.conditions, - ): - unschedulable_adjustments = list( - filter( - lambda a: a.setting_name in unschedulable_condition.message, - adjustments, - ) - ) - cond_msgs.append( - f"Requested adjustment(s) ({', '.join(map(str, unschedulable_adjustments))}) cannot be scheduled due to \"{unschedulable_condition.message}\"" - ) - pod_messages.append(f"{pod.obj.metadata.name} - {'; '.join(cond_msgs)}") - - raise servo.AdjustmentRejectedError( - f"{len(unschedulable_pods)} pod(s) could not be scheduled for {self.__class__.__name__} {self.name}: {', '.join(pod_messages)}", - reason="unschedulable", - ) - - image_pull_failed_pods = [ - pod - for pod in pods - if pod.obj.status.container_statuses - and any( - cont_stat.state - and cont_stat.state.waiting - and cont_stat.state.waiting.reason - in ["ImagePullBackOff", "ErrImagePull"] - for cont_stat in pod.obj.status.container_statuses - ) - ] - if image_pull_failed_pods: - raise servo.AdjustmentFailedError( - f"Container image pull failure detected on {len(image_pull_failed_pods)} pods: {', '.join(map(lambda pod: pod.obj.metadata.name, pods))}", - reason="image-pull-failed", - ) - - restarted_pods_container_statuses: list[tuple[Pod, V1ContainerStatus]] = [ - (pod, cont_stat) - for pod in pods - for cont_stat in (pod.obj.status.container_statuses or []) - if cont_stat.restart_count > 0 - ] - if restarted_pods_container_statuses: - container_logs: list[str] = [ - "DISABLED" for _ in range(len(restarted_pods_container_statuses)) - ] - if include_container_logs: # TODO enable logs config on per container basis - # Reduce api requests to 1 per pod then fan back out into per container status list - curpod = restarted_pods_container_statuses[0][0] - curstats = [] - for pod, container_status in restarted_pods_container_statuses: - if pod == curpod: - curstats.append(container_status) - else: - # Set up for next pod in list - container_logs.extend( - await curpod.get_logs_for_container_statuses(curstats) - ) - curpod = pod - curstats = [container_status] - # Get statuses for the last (or only) pod in the list - container_logs.extend( - await curpod.get_logs_for_container_statuses(curstats) - ) - - pod_to_counts = collections.defaultdict(list) - for idx, (pod, cont_stat) in enumerate(restarted_pods_container_statuses): - pod_to_counts[pod.obj.metadata.name].append( - f"{cont_stat.name} x{cont_stat.restart_count} " - f"{'' if not include_container_logs else f' container logs {container_logs[idx]}'}" - ) - - pod_message = ", ".join( - map( - lambda kv_tup: f"{kv_tup[0]} - {'; '.join(kv_tup[1])}", - list(pod_to_counts.items()), - ) - ) - raise servo.AdjustmentRejectedError( - f"{self.__class__.__name__} {self.name} pod(s) crash restart detected: {pod_message}", - reason="unstable", 
- ) - - # Unready pod catchall - unready_pod_conds = [ - (pod, cond) - for pod in pods - for cond in (pod.obj.status.conditions or []) - if cond.type == "Ready" and cond.status == "False" - ] - if unready_pod_conds: - pod_messages = [] - for pod, cond in unready_pod_conds: - pod_message = ( - f"{pod.obj.metadata.name} - (reason {cond.reason}) {cond.message}" - ) - - # TODO expand criteria for safely getting container logs and/or implement graceful fallback - if include_container_logs and cond.reason == "ContainersNotReady": - unready_container_statuses: List[V1ContainerStatus] = [ - cont_stat - for cont_stat in pod.obj.status.container_statuses or [] - if not cont_stat.ready - ] - container_logs = await pod.get_logs_for_container_statuses( - unready_container_statuses - ) - # NOTE: cant use f-string with newline (backslash) insertion - pod_message = ( - f"{pod_message} container logs " - + "\n\n--- \n\n".join(container_logs) - ) - - pod_messages.append(pod_message) - - raise servo.AdjustmentRejectedError( - f"Found {len(unready_pod_conds)} unready pod(s) for deployment {self.name}: {', '.join(pod_messages)}", - reason="start-failed", - ) - - async def get_restart_count(self) -> int: - count = 0 - for pod in await self.get_latest_pods(): - try: - count += await pod.get_restart_count() - except kubernetes_asyncio.client.exceptions.ApiException as error: - if error.status == 404: - # Pod no longer exists, move on - pass - else: - raise error - - return count - - -class StatefulSet(Deployment): - - obj: kubernetes_asyncio.client.V1StatefulSet - api_clients: ClassVar[Dict[str, Type]] = { - "preferred": kubernetes_asyncio.client.AppsV1Api, - "apps/v1": kubernetes_asyncio.client.AppsV1Api, - } - status_type: Type = kubernetes_asyncio.client.V1DeploymentStatus - - @contextlib.asynccontextmanager - async def create_method( - self, - ) -> Callable: # TODO google the boilerplatey af proper typing for this return type - async with self.api_client() as api_client: - yield api_client.create_namespaced_stateful_set - - # TODO placeholder thingy - @contextlib.asynccontextmanager - async def patch_method( - self, - api_client_default_headers: Optional[dict[str, str]] = { - "content-type": "application/strategic-merge-patch+json" - }, - ) -> AsyncContextManager: - async with self.api_client() as api_client: - # TODO: move up to baser class helper method - for k, v in (api_client_default_headers or {}).items(): - api_client.api_client.set_default_header(k, v) - - yield api_client.patch_namespaced_stateful_set - - @contextlib.asynccontextmanager - async def replace_method( - self, - ) -> AsyncContextManager: - async with self.api_client() as api_client: - yield api_client.replace_namespaced_stateful_set - - @contextlib.asynccontextmanager - async def delete_method(self) -> AsyncContextManager: - async with self.api_client() as api_client: - yield api_client.delete_namespaced_stateful_set - - @classmethod - def list_method(cls, api_client) -> Coroutine: - # TODO maybe refactor to use self.api_client like other methods - # NOTE I'm resisting the urge to refactor rollout(). Lets keep the instability surface minimal - return api_client.list_namespaced_stateful_set - - # Moved up additional props being shadowed for clarity - @property - def status(self) -> kubernetes_asyncio.client.V1DeploymentStatus: - """Return the status of the Deployment. - - Returns: - The status of the Deployment. 
- """ - return cast(kubernetes_asyncio.client.V1StatefulSetStatus, self.obj.status) - - @property - def unavailable_replicas(self) -> int: - # NOTE this field is N/A for StatefulSets unless the MaxUnavailableStatefulSet flag is enabled - # TODO long term config support for the above caveat - return 0 - - @classmethod - async def read(cls, name: str, namespace: str) -> "StatefulSet": - async with cls.preferred_client() as api_client: - obj = await api_client.read_namespaced_stateful_set(name, namespace) - return StatefulSet(obj) - - async def get_latest_pods(self) -> List[Pod]: - # TODO proper docstring - # TODO podManagementPolicy: Parallel might leverage replicasets like Deployments do - return await self.get_pods() - - # Need custom raise_for_status because statefulsets do not set conditions - # https://github.com/kubernetes/kubernetes/issues/79606#issuecomment-594490746 - async def raise_for_status( - self, adjustments: List[servo.Adjustment], include_container_logs=False - ) -> None: - # NOTE: operate off of current state, assuming you have checked is_ready() - status = self.status - self.logger.trace(f"current {self.__class__.__name__} status is {status}") - if status is None: - raise RuntimeError(f"No such {self.__class__.__name__}: {self.name}") - - await self.raise_for_failed_pod_adjustments( - adjustments=adjustments, include_container_logs=include_container_logs - ) - - # Catchall - self.logger.trace( - f"unable to map {self.__class__.__name__} status to exception. StatefulSet: {self.obj}" - ) - raise RuntimeError( - f"Unknown {self.__class__.__name__} status for '{self.name}' (likely due to no-op known error): {status}" - ) - - -# Workarounds to allow use of api_client.deserialize() public method instead of private api_client._ApiClient__deserialize -# TODO: is this workaround worth it just to avoid using the private method? -# fix for https://github.com/kubernetes-client/python/issues/977#issuecomment-594045477 -def default_kubernetes_json_serializer(o: Any) -> Any: - if isinstance(o, (datetime.datetime, datetime.date)): - return o.isoformat() - raise TypeError( - f"Object of type {o.__class__.__name__} " f"is not JSON serializable" - ) - - -# https://github.com/kubernetes-client/python/issues/977#issuecomment-592030030 -class FakeKubeResponse: - """Mocks the RESTResponse object as a workaround for kubernetes python api_client deserialization""" - - def __init__(self, obj): - self.data = json.dumps(obj, default=default_kubernetes_json_serializer) - - -# Use alias generator so that dromedary case can be parsed to snake case properties to match k8s python client behaviour -def to_dromedary_case(string: str) -> str: - split = string.split("_") - return split[0] + "".join(word.capitalize() for word in split[1:]) - - -class RolloutBaseModel(pydantic.BaseModel): - class Config: - # arbitrary_types_allowed = True - alias_generator = to_dromedary_case - allow_population_by_field_name = True - - -# Pydantic type models for argo rollout spec: https://argoproj.github.io/argo-rollouts/features/specification/ -# https://github.com/argoproj/argo-rollouts/blob/master/manifests/crds/rollout-crd.yaml -# NOTE/TODO: fields typed with Any should maintain the same form when dumped as when they are parsed. 
Should the need -# arise to interact with such fields, they will need to have an explicit type defined so the alias_generator is applied -class RolloutV1LabelSelector( - RolloutBaseModel -): # must type out k8s models as well to allow parse_obj to work - match_expressions: Any - match_labels: Optional[Dict[str, str]] - - -class RolloutV1ObjectMeta(RolloutBaseModel): - annotations: Optional[Dict[str, str]] - cluster_name: Optional[str] - creation_timestamp: Optional[datetime.datetime] - deletion_grace_period_seconds: Optional[int] - deletion_timestamp: Optional[datetime.datetime] - finalizers: Optional[List[str]] - generate_name: Optional[str] - generation: Optional[int] - labels: Optional[Dict[str, str]] - managed_fields: Any - name: Optional[str] - namespace: Optional[str] - owner_references: Any - resource_version: Optional[str] - self_link: Optional[str] - uid: Optional[str] - - -class RolloutV1EnvVar(RolloutBaseModel): - name: str - value: Optional[str] - value_from: Any - - -class RolloutV1ContainerPort(RolloutBaseModel): - container_port: int - host_ip: Optional[str] - host_port: Optional[int] - name: Optional[str] - protocol: Optional[str] - - -class RolloutV1ResourceRequirements(RolloutBaseModel): - limits: Optional[Dict[str, str]] - requests: Optional[Dict[str, str]] - - -class RolloutV1Container(RolloutBaseModel): - args: Optional[List[str]] - command: Optional[List[str]] - env: Optional[List[RolloutV1EnvVar]] - env_from: Any - image: str - image_pull_policy: Optional[str] - lifecycle: Any - liveness_probe: Any - name: str - ports: Optional[List[RolloutV1ContainerPort]] - readiness_probe: Any - resources: Optional[RolloutV1ResourceRequirements] - security_context: Any - startup_probe: Any - stdin: Optional[bool] - stdin_once: Optional[bool] - termination_message_path: Optional[str] - termination_message_policy: Optional[str] - tty: Optional[bool] - volume_devices: Any - volume_mounts: Any - working_dir: Optional[str] - - -class RolloutV1PodSpec(RolloutBaseModel): - active_deadline_seconds: Optional[int] - affinity: Any - automount_service_account_token: Optional[bool] - containers: List[RolloutV1Container] - dns_config: Any - dns_policy: Optional[str] - enable_service_links: Optional[bool] - ephemeral_containers: Any - host_aliases: Any - host_ipc: Optional[bool] - host_network: Optional[bool] - host_pid: Optional[bool] - hostname: Optional[str] - image_pull_secrets: Any - init_containers: Optional[List[RolloutV1Container]] - node_name: Optional[str] - node_selector: Optional[Dict[str, str]] - overhead: Optional[Dict[str, str]] - preemption_policy: Optional[str] - priority: Optional[int] - priority_class_name: Optional[str] - readiness_gates: Any - restart_policy: Optional[str] - runtime_class_name: Optional[str] - scheduler_name: Optional[str] - security_context: Any - service_account: Optional[str] - service_account_name: Optional[str] - share_process_namespace: Optional[bool] - subdomain: Optional[str] - termination_grace_period_seconds: Optional[int] - tolerations: Any - topology_spread_constraints: Any - volumes: Any - - -class RolloutV1PodTemplateSpec(RolloutBaseModel): - metadata: RolloutV1ObjectMeta - spec: RolloutV1PodSpec - - -class RolloutV1WorkloadRef(RolloutBaseModel): - api_version: str - kind: str - name: str - - -class RolloutSpec(RolloutBaseModel): - replicas: int - selector: Optional[RolloutV1LabelSelector] - template: Optional[RolloutV1PodTemplateSpec] - workload_ref: Optional[RolloutV1WorkloadRef] - min_ready_seconds: Optional[int] - revision_history_limit: 
Optional[int] - paused: Optional[bool] - progress_deadline_seconds: Optional[int] - restart_at: Optional[datetime.datetime] - strategy: Any - - -class RolloutBlueGreenStatus(RolloutBaseModel): - active_selector: Optional[str] - post_promotion_analysis_run: Optional[str] - post_promotion_analysis_run_status: Any - pre_promotion_analysis_run: Optional[str] - pre_promotion_analysis_run_status: Any - preview_selector: Optional[str] - previous_active_selector: Optional[str] - scale_down_delay_start_time: Optional[datetime.datetime] - scale_up_preview_check_point: Optional[bool] - - -class RolloutStatusCondition(RolloutBaseModel): - last_transition_time: datetime.datetime - last_update_time: datetime.datetime - message: str - reason: str - status: str - type: str - - -class RolloutStatus(RolloutBaseModel): - hpa_replicas: Optional[int] = pydantic.Field(..., alias="HPAReplicas") - abort: Optional[bool] - aborted_at: Optional[datetime.datetime] - available_replicas: Optional[int] - blue_green: RolloutBlueGreenStatus - canary: Any # TODO type this out if connector needs to interact with it - collision_count: Optional[int] - conditions: List[RolloutStatusCondition] - controller_pause: Optional[bool] - current_pod_hash: str - current_step_hash: Optional[str] - current_step_index: Optional[int] - observed_generation: str - pause_conditions: Any - ready_replicas: Optional[int] - replicas: Optional[int] - restarted_at: Optional[datetime.datetime] - selector: str - stable_RS: Optional[str] - updated_replicas: Optional[int] - - -class RolloutObj(RolloutBaseModel): # TODO is this the right base to inherit from? - api_version: str - kind: str - metadata: RolloutV1ObjectMeta - spec: RolloutSpec - status: Optional[RolloutStatus] - - -# TODO expose to config if needed -ROLLOUT_GROUP = "argoproj.io" -ROLLOUT_VERSION = "v1alpha1" -ROLLOUT_PURAL = "rollouts" - - -class Rollout(KubernetesModel): - """Wrapper around an ArgoCD Kubernetes `Rollout` Object. - The actual instance that this - wraps can be accessed via the ``obj`` instance member. - This wrapper provides some convenient functionality around the - API Object and provides some state management for the `Rollout`. - .. Rollout: - https://argoproj.github.io/argo-rollouts/features/specification/ - """ - - obj: RolloutObj - workload_ref_controller: Optional[Deployment] = None - - _rollout_const_args: Dict[str, str] = dict( - group=ROLLOUT_GROUP, - version=ROLLOUT_VERSION, - plural=ROLLOUT_PURAL, - ) - - api_clients: ClassVar[Dict[str, Type]] = { - "preferred": kubernetes_asyncio.client.CustomObjectsApi, - f"{ROLLOUT_GROUP}/{ROLLOUT_VERSION}": kubernetes_asyncio.client.CustomObjectsApi, - } - - async def create(self, namespace: str = None) -> None: - """Create the Rollout under the given namespace. - Args: - namespace: The namespace to create the Rollout under. - """ - if namespace is None: - namespace = self.namespace - - self.logger.info(f'creating rollout "{self.name}" in namespace "{namespace}"') - self.logger.debug(f"rollout: {self.obj}") - - async with self.api_client() as api_client: - self.obj = RolloutObj.parse_obj( - await api_client.create_namespaced_custom_object( - namespace=namespace, - body=self.obj.dict(by_alias=True, exclude_none=True), - **self._rollout_const_args, - ) - ) - - @classmethod - async def read(cls, name: str, namespace: str) -> "Rollout": - """Read a Rollout by name under the given namespace. - Args: - name: The name of the Rollout to read. - namespace: The namespace to read the Rollout from. 
- """ - - async with cls.preferred_client() as api_client: - obj = await api_client.get_namespaced_custom_object( - namespace=namespace, - name=name, - **cls._rollout_const_args, - ) - rollout = Rollout(RolloutObj.parse_obj(obj)) - if rollout.obj.spec.workload_ref: - await rollout.read_workfload_ref(namespace=namespace) - return rollout - - async def read_workfload_ref(self, namespace: str) -> None: - if self.obj.spec.workload_ref.kind != "Deployment": - raise RuntimeError( - f"Rollout integration does not currently support workloadRef kind of {self.obj.spec.workload_ref.kind}" - ) - - self.workload_ref_controller = await Deployment.read( - name=self.obj.spec.workload_ref.name, namespace=namespace - ) - if not self.workload_ref_controller: - raise ValueError( - f'cannot read Rollout: workloadRef Deployment "{self.obj.spec.workload_ref.name}"' - f' does not exist in Namespace "{namespace}"' - ) - - async def patch(self) -> None: - """Update the changed attributes of the Rollout.""" - async with self.api_client( - {"content-type": "application/merge-patch+json"} - ) as api_client: - self.obj = RolloutObj.parse_obj( - await api_client.patch_namespaced_custom_object( - namespace=self.namespace, - name=self.name, - body=self.obj.dict(by_alias=True, exclude_none=True), - **self._rollout_const_args, - ) - ) - - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions = None - ) -> kubernetes_asyncio.client.V1Status: - """Delete the Rollout. - This method expects the Rollout to have been loaded or otherwise - assigned a namespace already. If it has not, the namespace will need - to be set manually. - Args: - options: Unsupported, options for Rollout deletion. - Returns: - The status of the delete operation. - """ - if options is not None: - raise RuntimeError("Rollout deletion does not support V1DeleteOptions") - - self.logger.info(f'deleting rollout "{self.name}"') - self.logger.trace(f"rollout: {self.obj}") - - async with self.api_client() as api_client: - return await api_client.delete_namespaced_custom_object( - namespace=self.namespace, - name=self.name, - **self._rollout_const_args, - ) - - async def refresh(self) -> None: - """Refresh the underlying Kubernetes Rollout resource.""" - async with self.api_client() as api_client: - self.obj = RolloutObj.parse_obj( - await api_client.get_namespaced_custom_object_status( - namespace=self.namespace, name=self.name, **self._rollout_const_args - ) - ) - - if self.workload_ref_controller: - await self.workload_ref_controller.refresh() - - async def rollback(self) -> None: - # TODO rollbacks are automated in Argo Rollouts, not sure if making this No Op will cause issues - # but I was unable to locate a means of triggering a rollout rollback manually - raise TypeError( - ( - "rollback is not supported under the optimization of rollouts because rollbacks are applied to " - "Kubernetes Deployment objects whereas this is automated by argocd" - ) - ) - - async def get_status(self) -> RolloutStatus: - """Get the status of the Rollout. - Returns: - The status of the Rollout. - """ - self.logger.info(f'checking status of rollout "{self.name}"') - # first, refresh the rollout state to ensure the latest status - await self.refresh() - - # return the status from the rollout - return self.obj.status - - async def get_pods(self) -> List[Pod]: - """Get the pods for the Rollout. - - Returns: - A list of pods that belong to the rollout. 
- """ - self.logger.debug(f'getting pods for rollout "{self.name}"') - - async with Pod.preferred_client() as api_client: - label_selector = self.match_labels - pod_list: kubernetes_asyncio.client.V1PodList = ( - await api_client.list_namespaced_pod( - namespace=self.namespace, - label_selector=selector_string(label_selector), - ) - ) - - pods = [Pod(p) for p in pod_list.items] - return pods - - @property - def status(self) -> RolloutStatus: - """Return the status of the Rollout. - Returns: - The status of the Rollout. - """ - return self.obj.status - - @property - def observed_generation(self) -> str: - """ - Returns the observed generation of the Deployment status. - - The generation is observed by the deployment controller. - """ - if self.workload_ref_controller: - return self.workload_ref_controller.observed_generation - - return self.obj.status.observed_generation - - async def is_ready(self) -> bool: - """Check if the Rollout is in the ready state. - - Returns: - True if in the ready state; False otherwise. - """ - await self.refresh() - - # if there is no status, the deployment is definitely not ready - status = self.obj.status - if status is None: - return False - - # check for the rollout completed status condition - completed_condition = next( - filter(lambda con: con.type == "Completed", status.conditions), None - ) - if completed_condition.status != "True": - return False - - # check the status for the number of total replicas and compare - # it to the number of ready replicas. if the numbers are - # equal, the deployment is ready; otherwise it is not ready. - total = status.replicas - ready = status.ready_replicas - - if total is None: - return False - - return total == ready - - @property - def containers(self) -> List[Container]: - """ - Return a list of Container objects from the underlying pod template spec. - """ - if self.workload_ref_controller: - return self.workload_ref_controller.containers - - return list( - map(lambda c: Container(c, None), self.obj.spec.template.spec.containers) - ) - - def find_container(self, name: str) -> Optional[Container]: - """ - Return the container with the given name. - """ - return next(filter(lambda c: c.name == name, self.containers), None) - - async def get_target_container( - self, config: ContainerConfiguration - ) -> Optional[Container]: - """Return the container targeted by the supplied configuration""" - target_container = self.find_container(config.name) - if target_container is not None and isinstance( - target_container.obj, RolloutV1Container - ): - async with kubernetes_asyncio.client.ApiClient() as api_client: - target_container.obj = api_client.deserialize( - response=FakeKubeResponse( - target_container.obj.dict(by_alias=True, exclude_none=True) - ), - response_type=kubernetes_asyncio.client.models.V1Container, - ) - return target_container - - @property - def replicas(self) -> int: - """ - Return the number of desired pods. - """ - return self.obj.spec.replicas - - @replicas.setter - def replicas(self, replicas: int) -> None: - """ - Set the number of desired pods. 
- """ - self.obj.spec.replicas = replicas - - @property - def match_labels(self) -> Dict[str, str]: - """Return the matchLabels dict of the selector field (from the workloadRef if applicable""" - if self.workload_ref_controller: - return self.workload_ref_controller.match_labels - return self.obj.spec.selector.match_labels - - @property - def pod_template_spec(self) -> RolloutV1PodTemplateSpec: - """Return the pod template spec for instances of the Rollout.""" - if self.workload_ref_controller: - return self.workload_ref_controller.pod_template_spec - - return self.obj.spec.template - - async def get_pod_template_spec_copy( - self, - ) -> kubernetes_asyncio.client.models.V1PodTemplateSpec: - """Return a deep copy of the pod template spec. Eg. for creation of a tuning pod""" - if self.workload_ref_controller: - return await self.workload_ref_controller.get_pod_template_spec_copy() - - async with kubernetes_asyncio.client.ApiClient() as api_client: - return api_client.deserialize( - response=FakeKubeResponse( - self.pod_template_spec.dict(by_alias=True, exclude_none=True) - ), - response_type=kubernetes_asyncio.client.models.V1PodTemplateSpec, - ) - - def update_pod( - self, pod: kubernetes_asyncio.client.models.V1Pod - ) -> kubernetes_asyncio.client.models.V1Pod: - """Update the pod with the latest state of the controller if needed. In the case of argo rollouts, the - pod labels are updated with the latest template hash so that it will be routed to by the appropriate service""" - # Apply the latest template hash so the active service register the tuning pod as an endpoint - pod.metadata.labels[ - "rollouts-pod-template-hash" - ] = self.obj.status.current_pod_hash - return pod - - @backoff.on_exception( - backoff.expo, kubernetes_asyncio.client.exceptions.ApiException, max_tries=3 - ) - async def inject_sidecar( - self, - name: str, - image: str, - *args, - service: Optional[str] = None, - port: Optional[int] = None, - index: Optional[int] = None, - service_port: int = 9980, - ) -> None: - """ - Injects an Envoy sidecar into a target Deployment that proxies a service - or literal TCP port, generating scrapeable metrics usable for optimization. - - The service or port argument must be provided to define how traffic is proxied - between the Envoy sidecar and the container responsible for fulfilling the request. - - Args: - name: The name of the sidecar to inject. - image: The container image for the sidecar container. - service: Name of the service to proxy. Envoy will accept ingress traffic - on the service port and reverse proxy requests back to the original - target container. - port: The name or number of a port within the Deployment to wrap the proxy around. - index: The index at which to insert the sidecar container. When `None`, the sidecar is appended. - service_port: The port to receive ingress traffic from an upstream service. 
- """ - - if self.workload_ref_controller: - await self.workload_ref_controller.inject_sidecar( - name=name, - image=image, - *args, - service=service, - port=port, - index=index, - service_port=service_port, - ) - return - - await self.refresh() - - if not (service or port): - raise ValueError(f"a service or port must be given") - - if isinstance(port, str) and port.isdigit(): - port = int(port) - - # check for a port conflict - container_ports = list( - itertools.chain(*map(operator.attrgetter("ports"), self.containers)) - ) - if service_port in list( - map(operator.attrgetter("container_port"), container_ports) - ): - raise ValueError( - f"Port conflict: Rollout '{self.name}' already exposes port {service_port} through an existing container" - ) - - # lookup the port on the target service - if service: - try: - service_obj = await Service.read(service, self.namespace) - except kubernetes_asyncio.client.exceptions.ApiException as error: - if error.status == 404: - raise ValueError(f"Unknown Service '{service}'") from error - else: - raise error - if not port: - port_count = len(service_obj.obj.spec.ports) - if port_count == 0: - raise ValueError( - f"Target Service '{service}' does not expose any ports" - ) - elif port_count > 1: - raise ValueError( - f"Target Service '{service}' exposes multiple ports -- target port must be specified" - ) - port_obj = service_obj.obj.spec.ports[0] - else: - if isinstance(port, int): - port_obj = next( - filter(lambda p: p.port == port, service_obj.obj.spec.ports), - None, - ) - elif isinstance(port, str): - port_obj = next( - filter(lambda p: p.name == port, service_obj.obj.spec.ports), - None, - ) - else: - raise TypeError( - f"Unable to resolve port value of type {port.__class__} (port={port})" - ) - - if not port_obj: - raise ValueError( - f"Port '{port}' does not exist in the Service '{service}'" - ) - - # resolve symbolic name in the service target port to a concrete container port - if isinstance(port_obj.target_port, str): - container_port_obj = next( - filter(lambda p: p.name == port_obj.target_port, container_ports), - None, - ) - if not container_port_obj: - raise ValueError( - f"Port '{port_obj.target_port}' could not be resolved to a destination container port" - ) - - container_port = container_port_obj.container_port - else: - container_port = port_obj.target_port - - else: - # find the container port - container_port_obj = next( - filter(lambda p: p.container_port == port, container_ports), None - ) - if not container_port_obj: - raise ValueError( - f"Port '{port}' could not be resolved to a destination container port" - ) - - container_port = container_port_obj.container_port - - # build the sidecar container - container = RolloutV1Container( - name=name, - image=image, - image_pull_policy="IfNotPresent", - resources=RolloutV1ResourceRequirements( - requests={"cpu": "125m", "memory": "128Mi"}, - limits={"cpu": "250m", "memory": "256Mi"}, - ), - env=[ - RolloutV1EnvVar( - name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value=str(service_port) - ), - RolloutV1EnvVar( - name="OPSANI_ENVOY_PROXIED_CONTAINER_PORT", - value=str(container_port), - ), - RolloutV1EnvVar(name="OPSANI_ENVOY_PROXY_METRICS_PORT", value="9901"), - ], - ports=[ - RolloutV1ContainerPort( - name="opsani-proxy", container_port=service_port, protocol="TCP" - ), - RolloutV1ContainerPort( - name="opsani-metrics", container_port=9901, protocol="TCP" - ), - ], - ) - - # add the sidecar to the Rollout - if index is None: - self.obj.spec.template.spec.containers.append(container) - 
else: - self.obj.spec.template.spec.containers.insert(index, container) - - # patch the Rollout - await self.patch() - - # TODO: convert to rollout logic - async def eject_sidecar(self, name: str) -> bool: - """Eject an Envoy sidecar from the Deployment. +import pydantic +import re +from typing import ( + AsyncIterator, + Collection, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, + Union, + cast, +) - Returns True if the sidecar was ejected. - """ - await self.refresh() - container = self.remove_container(name) - if container: - await self.replace() - return True +import kubernetes_asyncio +import kubernetes_asyncio.client +import kubernetes_asyncio.client.api_client +import kubernetes_asyncio.client.exceptions +import kubernetes_asyncio.client.models +from kubernetes_asyncio.client import ( + V1Container, + V1Deployment, + V1EnvVar, + V1OwnerReference, + V1Pod, + V1PodTemplateSpec, + V1StatefulSet, +) - return False +import servo +from servo.telemetry import ONE_MiB +from servo.types.kubernetes import * - # TODO: rebase this and _check_conditions for saturation mode - @contextlib.asynccontextmanager - async def rollout(self, *, timeout: Optional[servo.Duration] = None) -> None: - raise NotImplementedError("To be implemented in future update") +from .kubernetes_helpers import ( + ContainerHelper, + DeploymentHelper, + PodHelper, + NamespaceHelper, + ReplicasetHelper, + StatefulSetHelper, + find_container, +) class Core(decimal.Decimal): @@ -3495,14 +168,20 @@ class CPU(servo.CPU): # Kubernetes resource requirements request: Optional[Core] limit: Optional[Core] - get: pydantic.conlist(ResourceRequirement, min_items=1) = [ - ResourceRequirement.request, - ResourceRequirement.limit, - ] - set: pydantic.conlist(ResourceRequirement, min_items=1) = [ - ResourceRequirement.request, - ResourceRequirement.limit, - ] + get: list[ResourceRequirement] = pydantic.Field( + default=[ + ResourceRequirement.request, + ResourceRequirement.limit, + ], + min_items=1, + ) + set: list[ResourceRequirement] = pydantic.Field( + default=[ + ResourceRequirement.request, + ResourceRequirement.limit, + ], + min_items=1, + ) def __opsani_repr__(self) -> dict: o_dict = super().__opsani_repr__() @@ -3574,14 +253,20 @@ class Memory(servo.Memory): # Kubernetes resource requirements request: Optional[ShortByteSize] limit: Optional[ShortByteSize] - get: pydantic.conlist(ResourceRequirement, min_items=1) = [ - ResourceRequirement.request, - ResourceRequirement.limit, - ] - set: pydantic.conlist(ResourceRequirement, min_items=1) = [ - ResourceRequirement.request, - ResourceRequirement.limit, - ] + get: list[ResourceRequirement] = pydantic.Field( + default=[ + ResourceRequirement.request, + ResourceRequirement.limit, + ], + min_items=1, + ) + set: list[ResourceRequirement] = pydantic.Field( + default=[ + ResourceRequirement.request, + ResourceRequirement.limit, + ], + min_items=1, + ) def __opsani_repr__(self) -> dict: o_dict = super().__opsani_repr__() @@ -3707,9 +392,6 @@ async def handle_error(self, error: Exception) -> bool: self.logger.opt(exception=error).warning(f"ignoring exception") return True - elif self.on_failure == FailureMode.rollback: - await self.rollback(error) - elif self.on_failure == FailureMode.shutdown: await self.shutdown(error) @@ -3724,17 +406,6 @@ async def handle_error(self, error: Exception) -> bool: except Exception as handler_error: raise handler_error from error # reraising an error from itself is safe - @abc.abstractmethod - async def rollback(self, error: Optional[Exception] = None) 
-> None: - """ - Asynchronously roll back the Optimization to a previous known - good state. - - Args: - error: An optional exception that contextualizes the cause of the rollback. - """ - ... - @abc.abstractmethod async def shutdown(self, error: Optional[Exception] = None) -> None: """ @@ -3773,49 +444,53 @@ class Config: arbitrary_types_allowed = True -# TODO: Update class name, saturation mode optimization is not specific to Deployment workloads -class DeploymentOptimization(BaseOptimization): +class SaturationOptimization(BaseOptimization): """ - The DeploymentOptimization class implements an optimization strategy based on directly reconfiguring a Kubernetes - Deployment and its associated containers. + The SaturationOptimization class implements an optimization strategy based on directly reconfiguring a Kubernetes + workload and its associated containers. """ - deployment_config: Optional["DeploymentConfiguration"] - stateful_set_config: Optional["StatefulSetConfiguration"] - - # TODO currently shoehorning the statefulset support into the deployment property - # which should likely be renamed to workload upon refactor - deployment: Optional[Union[Deployment, StatefulSet]] - # stateful_set: Optional[StatefulSet] + workload_helper: Optional[Union[Type[DeploymentHelper], Type[StatefulSetHelper]]] + workload_config: Optional[ + Union["DeploymentConfiguration", "StatefulSetConfiguration"] + ] + workload: Optional[Union[V1Deployment, V1StatefulSet]] container_config: "ContainerConfiguration" - container: Container + container: V1Container @classmethod async def create( cls, config: Union["DeploymentConfiguration", "StatefulSetConfiguration"], **kwargs, - ) -> "DeploymentOptimization": + ) -> "SaturationOptimization": # TODO switch for type of config if isinstance(config, StatefulSetConfiguration): - workload = await StatefulSet.read(config.name, config.namespace) + workload_helper = StatefulSetHelper elif isinstance(config, DeploymentConfiguration): - workload = await Deployment.read(config.name, config.namespace) + workload_helper = DeploymentHelper else: raise ValueError( f"Unrecognized workload for configuration type of {config.__class__.__name__}" ) + workload = await workload_helper.read(config.name, config.namespace) replicas = config.replicas.copy() - replicas.value = workload.replicas + # NOTE: Assign to the config to trigger validations + replicas.value = workload.spec.replicas # FIXME: Currently only supporting one container for container_config in config.containers: - container = workload.find_container(container_config.name) + container = find_container(workload=workload, name=container_config.name) if not container: names = servo.utilities.strings.join_to_series( - list(map(lambda c: c.name, workload.containers)) + list( + map( + lambda c: c.metadata.name, + workload.spec.template.spec.containers, + ) + ) ) raise ValueError( f'no container named "{container_config.name}" exists in the Pod (found {names})' @@ -3827,12 +502,15 @@ async def create( ) name = container_config.alias or ( - f"{workload.name}/{container.name}" if container else workload.name + f"{workload.metadata.name}/{container.name}" + if container + else workload.metadata.name ) return cls( name=name, - deployment_config=config, - deployment=workload, + workload_config=config, + workload=workload, + workload_helper=workload_helper, container_config=container_config, container=container, **kwargs, @@ -3846,7 +524,9 @@ def cpu(self) -> CPU: cpu = self.container_config.cpu.copy() # Determine the value in priority 
order from the config - resource_requirements = self.container.get_resource_requirements("cpu") + resource_requirements = ContainerHelper.get_resource_requirements( + self.container, "cpu" + ) cpu.request = resource_requirements.get(ResourceRequirement.request) cpu.limit = resource_requirements.get(ResourceRequirement.limit) value = resource_requirements.get( @@ -3870,7 +550,9 @@ def memory(self) -> Memory: memory = self.container_config.memory.copy() # Determine the value in priority order from the config - resource_requirements = self.container.get_resource_requirements("memory") + resource_requirements = ContainerHelper.get_resource_requirements( + self.container, "memory" + ) memory.request = resource_requirements.get(ResourceRequirement.request) memory.limit = resource_requirements.get(ResourceRequirement.limit) value = resource_requirements.get( @@ -3891,7 +573,9 @@ def env(self) -> Optional[list[servo.EnvironmentSetting]]: env: list[servo.EnvironmentSetting] = [] env_setting: Union[servo.EnvironmentRangeSetting, servo.EnvironmentEnumSetting] for env_setting in self.container_config.env or []: - if env_val := self.container.get_environment_variable(env_setting.name): + if env_val := ContainerHelper.get_environment_variable( + self.container, env_setting.name + ): env_setting = env_setting.safe_set_value_copy(env_val) env.append(env_setting) @@ -3902,8 +586,8 @@ def replicas(self) -> servo.Replicas: """ Return the current Replicas setting for the optimization. """ - replicas = self.deployment_config.replicas.copy() - replicas.value = self.deployment.replicas + replicas = self.workload_config.replicas.copy() + replicas.value = self.workload.spec.replicas return replicas @property @@ -3912,20 +596,7 @@ def on_failure(self) -> FailureMode: Return the configured failure behavior. If not set explicitly, this will be cascaded from the base kubernetes configuration (or its default) """ - return self.deployment_config.on_failure - - async def rollback(self, error: Optional[Exception] = None) -> None: - """ - Initiates an asynchronous rollback to a previous version of the Deployment. - - Args: - error: An optional error that triggered the rollback. - """ - self.logger.info(f"adjustment failed: rolling back deployment... ({error})") - await asyncio.wait_for( - self.deployment.rollback(), - timeout=self.timeout.total_seconds(), - ) + return self.workload_config.on_failure async def shutdown(self, error: Optional[Exception] = None) -> None: """ @@ -3935,8 +606,12 @@ async def shutdown(self, error: Optional[Exception] = None) -> None: error: An optional error that triggered the destruction. 
""" self.logger.info(f"adjustment failed: shutting down deployment's pods...") - await asyncio.wait_for( - self.deployment.scale_to_zero(), + self.workload = await self.workload_helper.read( + self.workload_config.name, self.workload_config.namespace + ) + self.workload.spec.replicas = 0 + self.workload = await asyncio.wait_for( + self.workload_helper.patch(self.workload), timeout=self.timeout.total_seconds(), ) @@ -3974,17 +649,19 @@ def adjust( for requirement in setting.set: requirements[requirement] = value - self.container.set_resource_requirements(setting_name, requirements) + ContainerHelper.set_resource_requirements( + self.container, setting_name, requirements + ) elif setting_name == "replicas": # NOTE: Assign to the config to trigger validations - self.deployment_config.replicas.value = value - self.deployment.replicas = value + self.workload_config.replicas.value = value + self.workload.spec.replicas = value elif env_setting := servo.find_setting(self.container_config.env, setting_name): env_setting = env_setting.safe_set_value_copy(value) - self.container.set_environment_variable( - env_setting.variable_name, env_setting.value + ContainerHelper.set_environment_variable( + self.container, env_setting.variable_name, env_setting.value ) else: @@ -4029,167 +706,119 @@ async def apply(self) -> None: # The resource_version attribute lets us efficiently watch for changes # reference: https://kubernetes.io/docs/reference/using-api/api-concepts/#efficient-detection-of-changes """ + # Patch the Deployment via the Kubernetes API + self.workload = await self.workload_helper.patch(self.workload) try: - async with self.deployment.rollout(timeout=self.timeout) as deployment: - # Patch the Deployment via the Kubernetes API - await deployment.patch() - except WatchTimeoutError: + await asyncio.wait_for( + self.workload_helper.wait_until_ready(self.workload), + timeout=self.timeout.total_seconds(), + ) + except asyncio.exceptions.TimeoutError: servo.logger.error( - f"Timed out waiting for {self.deployment.__class__.__name__} to become ready..." + f"Timed out waiting for {self.workload.__class__.__name__} to become ready..." 
) await self.raise_for_status() + servo.logger.success( + f"adjustments to {self.workload.kind} '{self.workload.metadata.name}' rolled out successfully" + ) async def is_ready(self) -> bool: - is_ready, restart_count = await asyncio.gather( - self.deployment.is_ready(), self.deployment.get_restart_count() + self.workload = await self.workload_helper.read( + self.workload.metadata.name, self.workload.metadata.namespace + ) + return ( + self.workload_helper.is_ready(self.workload) + and await self.workload_helper.get_restart_count(self.workload) == 0 ) - return is_ready and restart_count == 0 async def raise_for_status(self) -> None: """Raise an exception if in an unhealthy state.""" - await self.deployment.raise_for_status( + self.workload = await self.workload_helper.read( + self.workload.metadata.name, self.workload.metadata.namespace + ) + await self.workload_helper.raise_for_status( + workload=self.workload, adjustments=self.adjustments, - include_container_logs=self.deployment_config.container_logs_in_error_status, + include_container_logs=self.workload_config.container_logs_in_error_status, ) -# TODO: Break down into CanaryDeploymentOptimization and CanaryContainerOptimization class CanaryOptimization(BaseOptimization): """CanaryOptimization objects manage the optimization of Containers within a Deployment using a tuning Pod that is adjusted independently and compared against the performance and cost profile of its siblings. """ + # The helper static classes define the abstractions/interfaces for interacting with the various workload types + # NOTE CanaryOptimization currently only supports Deployment + workload_helper: Type[DeploymentHelper] + # The deployment and container stanzas from the configuration - deployment_config: Optional["DeploymentConfiguration"] - rollout_config: Optional["RolloutConfiguration"] + workload_config: "DeploymentConfiguration" container_config: "ContainerConfiguration" # State for mainline resources. 
Read from the cluster - deployment: Optional[Deployment] - rollout: Optional[Rollout] - main_container: Container + workload: V1Deployment + main_container: V1Container # State for tuning resources - tuning_pod: Optional[Pod] - tuning_container: Optional[Container] - - _tuning_pod_template_spec: Optional[ - kubernetes_asyncio.client.models.V1PodTemplateSpec - ] = pydantic.PrivateAttr() - - @pydantic.root_validator - def check_deployment_and_rollout(cls, values): - if ( - values.get("deployment_config") is not None - and values.get("rollout_config") is not None - ): - raise ValueError( - "Cannot create a CanaryOptimization with both rollout and deployment configurations" - ) - if values.get("deployment") is not None and values.get("rollout") is not None: - raise ValueError( - "Cannot create a CanaryOptimization with both rollout and deployment" - ) - - if ( - values.get("deployment_config") is None - and values.get("rollout_config") is None - ): - raise ValueError( - "CanaryOptimization must be initialized with either a rollout or deployment configuration" - ) - if values.get("deployment") is None and values.get("rollout") is None: - raise ValueError( - "CanaryOptimization must be initialized with either a rollout or deployment" - ) - - return values - - @property - def target_controller_config( - self, - ) -> Union["DeploymentConfiguration", "RolloutConfiguration"]: - return self.deployment_config or self.rollout_config - - @property - def target_controller(self) -> Union[Deployment, Rollout]: - return self.deployment or self.rollout + tuning_pod: Optional[V1Pod] + tuning_container: Optional[V1Container] - @property - def target_controller_type(self) -> str: - return type(self.target_controller).__name__ + _tuning_pod_template_spec: Optional[V1PodTemplateSpec] = pydantic.PrivateAttr() @classmethod async def create( cls, - deployment_or_rollout_config: Union[ - "DeploymentConfiguration", "RolloutConfiguration" - ], + workload_config: "DeploymentConfiguration", **kwargs, ) -> "CanaryOptimization": - read_args = ( - deployment_or_rollout_config.name, - cast(str, deployment_or_rollout_config.namespace), - ) - if isinstance(deployment_or_rollout_config, DeploymentConfiguration): - controller_type = "Deployment" - deployment_or_rollout = await Deployment.read(*read_args) - init_args = dict( - deployment_config=deployment_or_rollout_config, - deployment=deployment_or_rollout, - ) - elif isinstance(deployment_or_rollout_config, RolloutConfiguration): - controller_type = "Rollout" - deployment_or_rollout = await Rollout.read(*read_args) - init_args = dict( - rollout_config=deployment_or_rollout_config, - rollout=deployment_or_rollout, - ) + # NOTE may eventually support other workload types + workload_helper: Type[DeploymentHelper] = None + if isinstance(workload_config, DeploymentConfiguration): + workload_helper = DeploymentHelper else: raise NotImplementedError( - f"Unknown configuration type '{type(deployment_or_rollout_config).__name__}'" - ) - if not deployment_or_rollout: - raise ValueError( - f'cannot create CanaryOptimization: target {controller_type} "{deployment_or_rollout_config.name}"' - f' does not exist in Namespace "{deployment_or_rollout_config.namespace}"' + f"Unknown/incompatible configuration type '{workload_config.__class__.__name__}'" ) + workload = await workload_helper.read( + name=workload_config.name, namespace=workload_config.namespace + ) + # NOTE: Currently only supporting one container assert ( - len(deployment_or_rollout_config.containers) == 1 + 
len(workload_config.containers) == 1 ), "CanaryOptimization currently only supports a single container" - container_config = deployment_or_rollout_config.containers[0] - main_container = await deployment_or_rollout.get_target_container( - container_config + container_config = workload_config.containers[0] + main_container: V1Container = find_container( + workload=workload, name=container_config.name ) + + alias = getattr(workload_config.strategy, "alias", None) name = ( - deployment_or_rollout_config.strategy.alias - if isinstance( - deployment_or_rollout_config.strategy, - CanaryOptimizationStrategyConfiguration, - ) - and deployment_or_rollout_config.strategy.alias - else f"{deployment_or_rollout.name}/{main_container.name}-tuning" + alias if alias else f"{workload_config.name}/{main_container.name}-tuning" ) optimization = cls( name=name, - **init_args, + workload_helper=workload_helper, + workload_config=workload_config, + workload=workload, container_config=container_config, main_container=main_container, **kwargs, ) await optimization._load_tuning_state() + await optimization._configure_tuning_pod_template_spec() return optimization async def _load_tuning_state(self) -> None: # Find an existing tuning Pod/Container if available try: - tuning_pod = await Pod.read(self.tuning_pod_name, cast(str, self.namespace)) - tuning_container = tuning_pod.get_container(self.container_config.name) + tuning_pod = await PodHelper.read(self.tuning_pod_name, self.namespace) + tuning_container = find_container(tuning_pod, self.container_config.name) except kubernetes_asyncio.client.exceptions.ApiException as e: if e.status != 404 or e.reason != "Not Found": @@ -4199,20 +828,18 @@ async def _load_tuning_state(self) -> None: tuning_pod = None tuning_container = None - # TODO: Factor into a new class? 
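# Illustrative sketch (not part of this patch): a minimal find_container of the
# kind used above when loading the tuning state. The real helper ships in
# servo/connectors/kubernetes_helpers and may differ in naming and behavior.
from typing import Optional, Union

from kubernetes_asyncio.client import (
    V1Container,
    V1Deployment,
    V1Pod,
    V1PodTemplateSpec,
    V1StatefulSet,
)


def find_container_sketch(
    workload: Union[V1Deployment, V1StatefulSet, V1Pod, V1PodTemplateSpec],
    name: str,
) -> Optional[V1Container]:
    # Deployments/StatefulSets carry containers on their pod template;
    # Pods and PodTemplateSpecs carry them directly on their spec.
    spec = workload.spec
    pod_spec = spec.template.spec if getattr(spec, "template", None) else spec
    return next((c for c in pod_spec.containers if c.name == name), None)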
self.tuning_pod = tuning_pod self.tuning_container = tuning_container - await self._configure_tuning_pod_template_spec() @property - def pod_template_spec_container(self) -> Container: - container_obj = next( - filter( - lambda c: c.name == self.container_config.name, - self._tuning_pod_template_spec.spec.containers, + def pod_template_spec_container(self) -> V1Container: + if not self._tuning_pod_template_spec: + raise servo.EventError( + "Cannot retrieve tuning container: tuning pod template spec not loaded" ) + return find_container( + workload=self._tuning_pod_template_spec, name=self.container_config.name ) - return Container(container_obj, None) def adjust( self, adjustment: servo.Adjustment, control: servo.Control = servo.Control() @@ -4230,9 +857,11 @@ def adjust( if setting_name in ("cpu", "memory"): # NOTE: use copy + update to apply values that may be outside of the range servo.logger.debug(f"Adjusting {setting_name}={value}") - setting = getattr(self.container_config, setting_name).copy( - update={"value": value} - ) + # NOTE copy is called from pydantic.BaseModel due to CPU/Memory setting chain of inheritance + # https://github.com/pydantic/pydantic/blob/abd687700afe28745a3af5bca6f0f0ba48c86d1e/pydantic/main.py#L627 + setting: Union[CPU, Memory] = getattr( + self.container_config, setting_name, pydantic.BaseModel + ).copy(update={"value": value}) # Set only the requirements defined in the config requirements: Dict[ResourceRequirement, Optional[str]] = {} @@ -4243,8 +872,8 @@ def adjust( servo.logger.debug( f"Setting resource requirements for {setting_name} to {requirements} on PodTemplateSpec" ) - self.pod_template_spec_container.set_resource_requirements( - setting_name, requirements + ContainerHelper.set_resource_requirements( + self.pod_template_spec_container, setting_name, requirements ) elif setting_name == "replicas": @@ -4253,8 +882,10 @@ def adjust( elif env_setting := servo.find_setting(self.container_config.env, setting_name): env_setting = env_setting.safe_set_value_copy(value) - self.pod_template_spec_container.set_environment_variable( - env_setting.variable_name, env_setting.value + ContainerHelper.set_environment_variable( + self.pod_template_spec_container, + env_setting.variable_name, + env_setting.value, ) else: @@ -4267,49 +898,43 @@ async def apply(self) -> None: assert self.tuning_pod, "Tuning Pod not loaded" assert self.tuning_container, "Tuning Container not loaded" - servo.logger.info("Applying adjustments to Tuning Pod") - task = asyncio.create_task(self.create_or_recreate_tuning_pod()) - try: - await task - except asyncio.CancelledError: - task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await task + servo.logger.info("Deleting existing tuning pod (if any)") + await self.delete_tuning_pod(raise_if_not_found=False) - raise + servo.logger.info("Applying adjustments to Tuning Pod") + await self.create_tuning_pod() - # TODO: logging the wrong values -- should be coming from the podtemplatespec? servo.logger.success( f"Built new tuning pod with container resources: {self.tuning_container.resources}, env: {self.tuning_container.env}" ) @property def namespace(self) -> str: - return self.target_controller_config.namespace + return self.workload_config.namespace @property def tuning_pod_name(self) -> str: """ Return the name of tuning Pod for this optimization. 
""" - return f"{self.target_controller_config.name}-tuning" + return f"{self.workload_config.name}-tuning" async def delete_tuning_pod( self, *, raise_if_not_found: bool = True - ) -> Optional[Pod]: + ) -> Optional[V1Pod]: """ Delete the tuning Pod. """ try: # TODO: Provide context manager or standard read option that handle not found? Lots of duplication on not found/conflict handling... - tuning_pod = await Pod.read(self.tuning_pod_name, self.namespace) + tuning_pod = await PodHelper.read(self.tuning_pod_name, self.namespace) self.logger.info( - f"Deleting tuning Pod '{tuning_pod.name}' from namespace '{tuning_pod.namespace}'..." + f"Deleting tuning Pod '{tuning_pod.metadata.name}' from namespace '{tuning_pod.metadata.namespace}'..." ) - await tuning_pod.delete() - await tuning_pod.wait_until_deleted() + await PodHelper.delete(tuning_pod) + await PodHelper.wait_until_deleted(tuning_pod) self.logger.info( - f"Deleted tuning Pod '{tuning_pod.name}' from namespace '{tuning_pod.namespace}'." + f"Deleted tuning Pod '{tuning_pod.metadata.name}' from namespace '{tuning_pod.metadata.namespace}'." ) self.tuning_pod = None @@ -4317,29 +942,24 @@ async def delete_tuning_pod( return tuning_pod except kubernetes_asyncio.client.exceptions.ApiException as e: - if e.status != 404 or e.reason != "Not Found" and raise_if_not_found: + if e.status != 404 or e.reason != "Not Found" or raise_if_not_found: raise + self.logger.info( + f"Ignoring delete tuning Pod '{self.tuning_pod_name}' from namespace '{self.namespace}' (pod not found)." + ) self.tuning_pod = None self.tuning_container = None return None - @property - def target_controller_name(self) -> str: - return self.target_controller_config.name - - @property - def container_name(self) -> str: - return self.container_config.name - - # TODO: Factor into another class? 
async def _configure_tuning_pod_template_spec(self) -> None: # Configure a PodSpecTemplate for the tuning Pod state - pod_template_spec: kubernetes_asyncio.client.models.V1PodTemplateSpec = ( - await self.target_controller.get_pod_template_spec_copy() + pod_template_spec = self.workload_helper.get_pod_template_spec_copy( + self.workload ) pod_template_spec.metadata.name = self.tuning_pod_name + pod_template_spec.metadata.namespace = self.namespace if pod_template_spec.metadata.annotations is None: pod_template_spec.metadata.annotations = {} @@ -4351,39 +971,31 @@ async def _configure_tuning_pod_template_spec(self) -> None: pod_template_spec.metadata.labels["opsani_role"] = "tuning" # Build a container from the raw podspec - container_obj = next( - filter( - lambda c: c.name == self.container_config.name, - pod_template_spec.spec.containers, - ) - ) - container = Container(container_obj, None) + container = find_container(pod_template_spec, self.container_config.name) servo.logger.debug( f"Initialized new tuning container from Pod spec template: {container.name}" ) if self.container_config.static_environment_variables: - if container.obj.env is None: - container.obj.env = [] + if container.env is None: + container.env = [] # Filter out vars with the same name as the ones we are setting - container.obj.env = list( - filter( - lambda e: e.name - not in self.container_config.static_environment_variables, - container.obj.env, - ) - ) - + container.env = [ + e + for e in cast(list[V1EnvVar], container.env) + if e.name not in self.container_config.static_environment_variables + ] env_list = [ - kubernetes_asyncio.client.V1EnvVar(name=k, value=v) + V1EnvVar(name=k, value=v) for k, v in self.container_config.static_environment_variables.items() ] - container.obj.env.extend(env_list) + container.env.extend(env_list) if self.tuning_container: servo.logger.debug( - f"Copying resource requirements from existing tuning pod container '{self.tuning_pod.name}/{self.tuning_container.name}'" + "Copying resource requirements from existing tuning pod container" + f" '{self.tuning_pod.metadata.name}/{self.tuning_container.name}'" ) resource_requirements = self.tuning_container.resources container.resources = resource_requirements @@ -4400,39 +1012,43 @@ async def _configure_tuning_pod_template_spec(self) -> None: servo_pod_namespace = os.environ.get("POD_NAMESPACE") if servo_pod_name is not None and servo_pod_namespace is not None: self.logger.debug( - f"running within Kubernetes, registering as Pod controller... (pod={servo_pod_name}, namespace={servo_pod_namespace})" + "running within Kubernetes, registering as Pod controller..." + f" (pod={servo_pod_name}, namespace={servo_pod_namespace})" ) - servo_pod = await Pod.read(servo_pod_name, servo_pod_namespace) + + # ephemeral, get its controller + servo_pod = await PodHelper.read(servo_pod_name, servo_pod_namespace) + pod_controller = next( iter( ow - for ow in servo_pod.obj.metadata.owner_references + for ow in cast( + list[V1OwnerReference], servo_pod.metadata.owner_references + ) if ow.controller ) ) + # still ephemeral + servo_rs = await ReplicasetHelper.read( + name=pod_controller.name, namespace=servo_pod_namespace + ) - # TODO: Create a ReplicaSet class... 
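# Illustrative sketch (not part of this patch): resolving the controlling owner
# of an object, the step performed inline above for Pod -> ReplicaSet -> Deployment.
# controller_of is a hypothetical name used only for this example.
from typing import Optional

from kubernetes_asyncio.client import V1OwnerReference


def controller_of(obj) -> Optional[V1OwnerReference]:
    # At most one owner reference is marked controller=True
    return next(
        (ow for ow in (obj.metadata.owner_references or []) if ow.controller),
        None,
    )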
- async with kubernetes_asyncio.client.api_client.ApiClient() as api: - api_client = kubernetes_asyncio.client.AppsV1Api(api) - - servo_rs: kubernetes_asyncio.client.V1ReplicaSet = ( - await api_client.read_namespaced_replica_set( - name=pod_controller.name, namespace=servo_pod_namespace - ) - ) # still ephemeral - rs_controller = next( - iter( - ow for ow in servo_rs.metadata.owner_references if ow.controller - ) - ) - servo_dep: kubernetes_asyncio.client.V1Deployment = ( - await api_client.read_namespaced_deployment( - name=rs_controller.name, namespace=servo_pod_namespace + rs_controller = next( + iter( + ow + for ow in cast( + list[V1OwnerReference], servo_rs.metadata.owner_references ) + if ow.controller ) + ) + # not ephemeral + servo_dep = await DeploymentHelper.read( + name=rs_controller.name, namespace=servo_pod_namespace + ) pod_template_spec.metadata.owner_references = [ - kubernetes_asyncio.client.V1OwnerReference( + V1OwnerReference( api_version=servo_dep.api_version, block_owner_deletion=True, controller=True, # Ensures the pod will not be adopted by another controller @@ -4444,15 +1060,7 @@ async def _configure_tuning_pod_template_spec(self) -> None: self._tuning_pod_template_spec = pod_template_spec - async def create_or_recreate_tuning_pod(self) -> Pod: - """ - Creates a new Tuning Pod or deletes and recreates one from the current optimization state. - """ - servo.logger.info("Deleting existing tuning pod (if any)") - await self.delete_tuning_pod(raise_if_not_found=False) - return await self.create_tuning_pod() - - async def create_tuning_pod(self) -> Pod: + async def create_tuning_pod(self) -> V1Pod: """ Creates a new Tuning Pod from the current optimization state. """ @@ -4460,26 +1068,25 @@ async def create_tuning_pod(self) -> Pod: assert self.tuning_pod is None, "Tuning Pod already exists" assert self.tuning_container is None, "Tuning Pod Container already exists" self.logger.debug( - f"creating tuning pod '{self.tuning_pod_name}' based on {self.target_controller_type} '{self.target_controller_name}' in namespace '{self.namespace}'" + f"creating tuning pod '{self.tuning_pod_name}' based on {self.workload.kind}" + f" '{self.workload.metadata.name}' in namespace '{self.namespace}'" ) # Setup the tuning Pod -- our settings are updated on the underlying PodSpec template self.logger.trace(f"building new tuning pod") - pod_obj = kubernetes_asyncio.client.V1Pod( + pod_obj = V1Pod( metadata=self._tuning_pod_template_spec.metadata, spec=self._tuning_pod_template_spec.spec, ) - # Update pod with latest controller state - pod_obj = self.target_controller.update_pod(pod_obj) - - tuning_pod = Pod(obj=pod_obj) + # TODO when supporting Argo rollout, must add rollout.status.current_pod_hash to pod labels + # under key "rollouts-pod-template-hash" # Create the Pod and wait for it to get ready self.logger.info( f"Creating tuning Pod '{self.tuning_pod_name}' in namespace '{self.namespace}'" ) - await tuning_pod.create(self.namespace) + tuning_pod = await PodHelper.create(pod_obj) servo.logger.success( f"Created Tuning Pod '{self.tuning_pod_name}' in namespace '{self.namespace}'" ) @@ -4495,7 +1102,7 @@ async def create_tuning_pod(self) -> Pod: ) progress.start() - task = asyncio.create_task(tuning_pod.wait_until_ready()) + task = asyncio.create_task(PodHelper.wait_until_ready(tuning_pod)) task.add_done_callback(lambda _: progress.complete()) gather_task = asyncio.gather( task, @@ -4514,21 +1121,17 @@ async def create_tuning_pod(self) -> Pod: await t servo.logger.debug(f"Cancelled Task: 
{t}, progress: {progress}") - await self.raise_for_status(tuning_pod=tuning_pod) - - # Load the in memory model for various convenience accessors - await tuning_pod.refresh() - await tuning_pod.get_containers() + # get latest status of tuning pod for raise_for_status + await self.raise_for_status() # Hydrate local state - self.tuning_pod = tuning_pod - self.tuning_container = tuning_pod.get_container(self.container_config.name) + await self._load_tuning_state() servo.logger.info(f"Tuning Pod successfully created") return tuning_pod @contextlib.asynccontextmanager - async def temporary_tuning_pod(self) -> AsyncIterator[Pod]: + async def temporary_tuning_pod(self) -> AsyncIterator[V1Pod]: """Mostly used for testing where automatic teardown is not available""" try: tuning_pod = await self.create_tuning_pod() @@ -4547,7 +1150,9 @@ def tuning_cpu(self) -> Optional[CPU]: cpu = self.container_config.cpu.copy() # Determine the value in priority order from the config - resource_requirements = self.tuning_container.get_resource_requirements("cpu") + resource_requirements = ContainerHelper.get_resource_requirements( + self.tuning_container, Resource.cpu.value + ) cpu.request = resource_requirements.get(ResourceRequirement.request) cpu.limit = resource_requirements.get(ResourceRequirement.limit) value = resource_requirements.get( @@ -4574,8 +1179,8 @@ def tuning_memory(self) -> Optional[Memory]: memory = self.container_config.memory.copy() # Determine the value in priority order from the config - resource_requirements = self.tuning_container.get_resource_requirements( - "memory" + resource_requirements = ContainerHelper.get_resource_requirements( + self.tuning_container, Resource.memory.value ) memory.request = resource_requirements.get(ResourceRequirement.request) memory.limit = resource_requirements.get(ResourceRequirement.limit) @@ -4601,8 +1206,8 @@ def tuning_env(self) -> Optional[list[servo.EnvironmentSetting]]: env: list[servo.EnvironmentSetting] = [] env_setting: Union[servo.EnvironmentRangeSetting, servo.EnvironmentEnumSetting] for env_setting in self.container_config.env or []: - if env_val := self.tuning_container.get_environment_variable( - env_setting.name + if env_val := ContainerHelper.get_environment_variable( + self.tuning_container, env_setting.name ): env_setting = env_setting.safe_set_value_copy(env_val) env.append(env_setting) @@ -4628,7 +1233,7 @@ def on_failure(self) -> FailureMode: Return the configured failure behavior. If not set explicitly, this will be cascaded from the base kubernetes configuration (or its default) """ - return self.target_controller_config.on_failure + return self.workload_config.on_failure @property def main_cpu(self) -> CPU: @@ -4636,7 +1241,9 @@ def main_cpu(self) -> CPU: Return the current CPU setting for the main containers. """ # Determine the value in priority order from the config - resource_requirements = self.main_container.get_resource_requirements("cpu") + resource_requirements = ContainerHelper.get_resource_requirements( + self.main_container, Resource.cpu.value + ) value = resource_requirements.get( next( filter( @@ -4661,7 +1268,9 @@ def main_memory(self) -> Memory: Return the current Memory setting for the main containers. 
""" # Determine the value in priority order from the config - resource_requirements = self.main_container.get_resource_requirements("memory") + resource_requirements = ContainerHelper.get_resource_requirements( + self.main_container, Resource.memory.value + ) value = resource_requirements.get( next( filter( @@ -4674,7 +1283,7 @@ def main_memory(self) -> Memory: short_byte_size = ShortByteSize.validate(value) # NOTE: use safe_set to accept values from mainline outside of our range - memory = self.container_config.memory.safe_set_value_copy(value) + memory: Memory = self.container_config.memory.safe_set_value_copy(value) memory.pinned = True memory.request = resource_requirements.get(ResourceRequirement.request) memory.limit = resource_requirements.get(ResourceRequirement.limit) @@ -4685,8 +1294,8 @@ def main_env(self) -> list[servo.EnvironmentSetting]: env: list[servo.EnvironmentSetting] = [] env_setting: Union[servo.EnvironmentRangeSetting, servo.EnvironmentEnumSetting] for env_setting in self.container_config.env or []: - if env_val := self.main_container.get_environment_variable( - env_setting.name + if env_val := ContainerHelper.get_environment_variable( + self.main_container, env_setting.name ): env_setting = env_setting.safe_set_value_copy(env_val) env_setting.pinned = True @@ -4705,7 +1314,7 @@ def main_replicas(self) -> servo.Replicas: return servo.Replicas( min=0, max=99999, - value=self.target_controller.replicas, + value=self.workload.spec.replicas, pinned=True, ) @@ -4718,7 +1327,7 @@ def main_name(self) -> str: """ return ( self.container_config.alias - or f"{self.target_controller_config.name}/{self.container_config.name}" + or f"{self.workload_config.name}/{self.container_config.name}" ) def to_components(self) -> List[servo.Component]: @@ -4747,20 +1356,6 @@ def to_components(self) -> List[servo.Component]: servo.Component(name=self.name, settings=tuning_settings), ] - async def rollback(self, error: Optional[Exception] = None) -> None: - """ - Not supported. Raises a TypeError when called. - - Rollbacks are not supported by the canary optimization strategy - because they are dependent on Kubernetes Deployments. - """ - raise TypeError( - ( - "rollback is not supported under the canary optimization strategy because rollbacks are applied to " - "Kubernetes Deployment objects and canary optimization is performed against a standalone Pod." 
- ) - ) - async def destroy(self, error: Optional[Exception] = None) -> None: if await self.delete_tuning_pod(raise_if_not_found=False) is None: self.logger.debug(f"no tuning pod exists, ignoring destroy") @@ -4772,17 +1367,9 @@ async def shutdown(self, error: Optional[Exception] = None) -> None: await self.destroy(error) async def handle_error(self, error: Exception) -> bool: - if ( - self.on_failure == FailureMode.rollback - or self.on_failure == FailureMode.shutdown - ): + if self.on_failure == FailureMode.shutdown: # Ensure that we chain any underlying exceptions that may occur try: - if self.on_failure == FailureMode.rollback: - self.logger.warning( - f"cannot rollback a tuning Pod: falling back to shutdown: {error}" - ) - try: await asyncio.wait_for( self.shutdown(), timeout=self.timeout.total_seconds() @@ -4798,7 +1385,7 @@ async def handle_error(self, error: Exception) -> bool: "creating new tuning pod against baseline following failed adjust" ) await self._configure_tuning_pod_template_spec() # reset to baseline from the target controller - self.tuning_pod = await self.create_or_recreate_tuning_pod() + self.tuning_pod = await self.create_tuning_pod() raise error # Always communicate errors to backend unless ignored @@ -4809,18 +1396,22 @@ async def handle_error(self, error: Exception) -> bool: return await super().handle_error(error) async def is_ready(self) -> bool: - is_ready, restart_count = await asyncio.gather( - self.tuning_pod.is_ready(), self.tuning_pod.get_restart_count() + # Refresh pod state + self.tuning_pod = await PodHelper.read( + self.tuning_pod.metadata.name, self.tuning_pod.metadata.namespace + ) + return ( + PodHelper.is_ready(self.tuning_pod) + and PodHelper.get_restart_count(self.tuning_pod) == 0 ) - return is_ready and restart_count == 0 - async def raise_for_status(self, tuning_pod=None) -> None: + async def raise_for_status(self) -> None: """Raise an exception if in an unhealthy state.""" - if tuning_pod is None: - tuning_pod = self.tuning_pod - await tuning_pod.raise_for_status( + self.tuning_pod = await PodHelper.read(self.tuning_pod_name, self.namespace) + await PodHelper.raise_for_status( + self.tuning_pod, adjustments=self.adjustments, - include_container_logs=self.target_controller_config.container_logs_in_error_status, + include_container_logs=self.workload_config.container_logs_in_error_status, ) class Config: @@ -4834,7 +1425,6 @@ class KubernetesOptimizations(pydantic.BaseModel, servo.logging.Mixin): """ config: "KubernetesConfiguration" - namespace: Namespace optimizations: List[BaseOptimization] runtime_id: str spec_id: str @@ -4847,35 +1437,26 @@ async def create( """ Read the state of all components under optimization from the cluster and return an object representation. 
""" - namespace = await Namespace.read(config.namespace) optimizations: List[BaseOptimization] = [] images = {} runtime_ids = {} pod_tmpl_specs = {} # TODO rename varname to workload_configs - for deployment_or_rollout_config in config.workloads: - if deployment_or_rollout_config.strategy == OptimizationStrategy.default: - if isinstance(deployment_or_rollout_config, RolloutConfiguration): - raise NotImplementedError( - "Saturation mode not currently supported on Argo Rollouts" - ) - optimization = await DeploymentOptimization.create( - deployment_or_rollout_config, - timeout=deployment_or_rollout_config.timeout, + for workload_config in config.workloads: + if workload_config.strategy == OptimizationStrategy.default: + optimization = await SaturationOptimization.create( + workload_config, + timeout=workload_config.timeout, ) - deployment_or_rollout = optimization.deployment + workload = optimization.workload container = optimization.container - elif deployment_or_rollout_config.strategy == OptimizationStrategy.canary: - if isinstance(deployment_or_rollout_config, StatefulSetConfiguration): - raise NotImplementedError( - "Canary mode not currently supported on StatefulSets" - ) + elif workload_config.strategy == OptimizationStrategy.canary: optimization = await CanaryOptimization.create( - deployment_or_rollout_config, - timeout=deployment_or_rollout_config.timeout, + workload_config, + timeout=workload_config.timeout, ) - deployment_or_rollout = optimization.target_controller + workload = optimization.workload container = optimization.main_container # Ensure the canary is available @@ -4885,17 +1466,17 @@ async def create( await optimization.create_tuning_pod() else: raise ValueError( - f"unknown optimization strategy: {deployment_or_rollout_config.strategy}" + f"unknown optimization strategy: {workload_config.strategy}" ) optimizations.append(optimization) # compile artifacts for checksum calculation - pods = await deployment_or_rollout.get_pods() - runtime_ids[optimization.name] = [pod.uid for pod in pods] - pod_tmpl_specs[ - deployment_or_rollout.name - ] = deployment_or_rollout.pod_template_spec.spec + pods = await PodHelper.list_pods_with_labels( + workload.metadata.namespace, workload.spec.selector.match_labels + ) + runtime_ids[optimization.name] = [pod.metadata.uid for pod in pods] + pod_tmpl_specs[workload.metadata.name] = workload.spec.template.spec images[container.name] = container.image # Compute checksums for change detection @@ -4909,7 +1490,6 @@ async def create( return KubernetesOptimizations( config=config, - namespace=namespace, optimizations=optimizations, spec_id=spec_id, runtime_id=runtime_id, @@ -5170,7 +1750,6 @@ class FailureMode(str, enum.Enum): The FailureMode enumeration defines how to handle a failed adjustment of a Kubernetes resource. """ - rollback = "rollback" shutdown = "shutdown" ignore = "ignore" exception = "exception" @@ -5211,10 +1790,10 @@ class PermissionSet(pydantic.BaseModel): ), ] -ROLLOUT_PERMISSIONS = [ +STATEFULSET_PERMISSIONS = [ PermissionSet( - group="argoproj.io", - resources=["rollouts", "rollouts/status"], + group="apps", + resources=["statefulsets"], verbs=["get", "list", "watch", "update", "patch"], ), ] @@ -5289,18 +1868,13 @@ class DeploymentConfiguration(BaseKubernetesConfiguration): class StatefulSetConfiguration(DeploymentConfiguration): - pass - - -class RolloutConfiguration(BaseKubernetesConfiguration): - """ - The RolloutConfiguration class models the configuration of an optimizable Argo Rollout. 
- """ - - name: DNSSubdomainName - containers: List[ContainerConfiguration] - strategy: StrategyTypes = OptimizationStrategy.canary - replicas: servo.Replicas + @pydantic.validator("strategy") + def validate_strategy(cls, v): + if v == OptimizationStrategy.canary: + raise NotImplementedError( + "Canary mode is not currently supported on StatefulSets" + ) + return v class KubernetesConfiguration(BaseKubernetesConfiguration): @@ -5321,21 +1895,11 @@ class KubernetesConfiguration(BaseKubernetesConfiguration): description="Deployments to be optimized.", ) - rollouts: Optional[List[RolloutConfiguration]] = pydantic.Field( - description="Argo rollouts to be optimized.", - ) - @property def workloads( self, - ) -> list[ - Union[StatefulSetConfiguration, DeploymentConfiguration, RolloutConfiguration] - ]: - return ( - (self.deployments or []) - + (self.rollouts or []) - + (self.stateful_sets or []) - ) + ) -> list[Union[StatefulSetConfiguration, DeploymentConfiguration]]: + return (self.deployments or []) + (self.stateful_sets or []) @pydantic.root_validator def check_workload(cls, values): @@ -5443,7 +2007,7 @@ async def load_kubeconfig(self) -> None: KubernetesOptimizations.update_forward_refs() -DeploymentOptimization.update_forward_refs() +SaturationOptimization.update_forward_refs() CanaryOptimization.update_forward_refs() @@ -5472,8 +2036,9 @@ async def check_kubernetes_permissions(self) -> None: async with kubernetes_asyncio.client.api_client.ApiClient() as api: v1 = kubernetes_asyncio.client.AuthorizationV1Api(api) required_permissions = self.config.permissions - if self.config.rollouts: - required_permissions.extend(ROLLOUT_PERMISSIONS) + if self.config.stateful_sets: + required_permissions.extend(STATEFULSET_PERMISSIONS) + # TODO stateful_set permissions for permission in required_permissions: for resource in permission.resources: for verb in permission.verbs: @@ -5503,44 +2068,50 @@ async def check_kubernetes_permissions(self) -> None: @servo.require('Namespace "{self.config.namespace}" is readable') async def check_kubernetes_namespace(self) -> None: - await Namespace.read(self.config.namespace) + await NamespaceHelper.read(self.config.namespace) @servo.multicheck('Deployment "{item.name}" is readable') async def check_kubernetes_deployments(self) -> Tuple[Iterable, servo.CheckHandler]: async def check_dep(dep_config: DeploymentConfiguration) -> None: - await Deployment.read(dep_config.name, dep_config.namespace) + await DeploymentHelper.read(dep_config.name, dep_config.namespace) return (self.config.deployments or []), check_dep - @servo.multicheck('Rollout "{item.name}" is readable') - async def check_kubernetes_rollouts(self) -> Tuple[Iterable, servo.CheckHandler]: - async def check_rol(rol_config: RolloutConfiguration) -> None: - await Rollout.read(rol_config.name, rol_config.namespace) + @servo.multicheck('StatefulSet "{item.name}" is readable') + async def check_kubernetes_statefulsets( + self, + ) -> Tuple[Iterable, servo.CheckHandler]: + async def check_ss(ss_config: StatefulSetConfiguration) -> None: + await StatefulSetHelper.read(ss_config.name, ss_config.namespace) - return (self.config.rollouts or []), check_rol + return (self.config.stateful_sets or []), check_ss async def _check_container_resource_requirements( self, - target_controller: Union[Deployment, Rollout], - target_config: Union[DeploymentConfiguration, RolloutConfiguration], + target_controller: Union[V1Deployment, V1StatefulSet], + target_config: Union[DeploymentConfiguration, StatefulSetConfiguration], ) -> 
None: for cont_config in target_config.containers: - container = target_controller.find_container(cont_config.name) + container = find_container(target_controller, cont_config.name) assert ( container ), f"{type(target_controller).__name__} {target_config.name} has no container {cont_config.name}" for resource in Resource.values(): current_state = None - container_requirements = container.get_resource_requirements(resource) - get_requirements = getattr(cont_config, resource).get + container_requirements = ContainerHelper.get_resource_requirements( + container, resource + ) + get_requirements = cast( + Union[CPU, Memory], getattr(cont_config, resource) + ).get for requirement in get_requirements: current_state = container_requirements.get(requirement) if current_state: break assert current_state, ( - f"{type(target_controller).__name__} {target_config.name} target container {cont_config.name} spec does not define the resource {resource}. " + f"{target_controller.kind} {target_config.name} target container {cont_config.name} spec does not define the resource {resource}. " f"At least one of the following must be specified: {', '.join(map(lambda req: req.resources_key, get_requirements))}" ) @@ -5553,46 +2124,58 @@ async def check_kubernetes_resource_requirements( async def check_dep_resource_requirements( dep_config: DeploymentConfiguration, ) -> None: - deployment = await Deployment.read(dep_config.name, dep_config.namespace) + deployment = await DeploymentHelper.read( + dep_config.name, dep_config.namespace + ) await self._check_container_resource_requirements(deployment, dep_config) return (self.config.deployments or []), check_dep_resource_requirements @servo.multicheck( - 'Containers in the "{item.name}" Rollout have resource requirements' + 'Containers in the "{item.name}" StatefulSet have resource requirements' ) - async def check_kubernetes_rollout_resource_requirements( + async def check_kubernetes_stateful_set_resource_requirements( self, ) -> Tuple[Iterable, servo.CheckHandler]: - async def check_rol_resource_requirements( - rol_config: RolloutConfiguration, + async def check_ss_resource_requirements( + ss_config: StatefulSetConfiguration, ) -> None: - rollout = await Rollout.read(rol_config.name, rol_config.namespace) - await self._check_container_resource_requirements(rollout, rol_config) + stateful_set = await StatefulSetHelper.read( + ss_config.name, ss_config.namespace + ) + await self._check_container_resource_requirements(stateful_set, ss_config) - return (self.config.rollouts or []), check_rol_resource_requirements + return (self.config.stateful_sets or []), check_ss_resource_requirements @servo.multicheck('Deployment "{item.name}" is ready') async def check_kubernetes_deployments_are_ready( self, ) -> Tuple[Iterable, servo.CheckHandler]: async def check_deployment(dep_config: DeploymentConfiguration) -> None: - deployment = await Deployment.read(dep_config.name, dep_config.namespace) - if not await deployment.is_ready(): - raise RuntimeError(f'Deployment "{deployment.name}" is not ready') + deployment = await DeploymentHelper.read( + dep_config.name, dep_config.namespace + ) + if not DeploymentHelper.is_ready(deployment): + raise RuntimeError( + f'Deployment "{deployment.metadata.name}" is not ready' + ) return (self.config.deployments or []), check_deployment - @servo.multicheck('Rollout "{item.name}" is ready') - async def check_kubernetes_rollouts_are_ready( + @servo.multicheck('StatefulSet "{item.name}" is ready') + async def check_kubernetes_stateful_sets_are_ready( 
self, ) -> Tuple[Iterable, servo.CheckHandler]: - async def check_rollout(rol_config: RolloutConfiguration) -> None: - rollout = await Rollout.read(rol_config.name, rol_config.namespace) - if not await rollout.is_ready(): - raise RuntimeError(f'Rollout "{rollout.name}" is not ready') + async def check_stateful_set(ss_config: StatefulSetConfiguration) -> None: + stateful_set = await StatefulSetHelper.read( + ss_config.name, ss_config.namespace + ) + if not StatefulSetHelper.is_ready(stateful_set): + raise RuntimeError( + f'Rollout "{stateful_set.metadata.name}" is not ready' + ) - return (self.config.rollouts or []), check_rollout + return (self.config.stateful_sets or []), check_stateful_set @servo.metadata( @@ -5762,166 +2345,6 @@ async def _create_optimizations(self) -> KubernetesOptimizations: return future.result() -def selector_string(selectors: Mapping[str, str]) -> str: - """Create a selector string from the given dictionary of selectors. - - Args: - selectors: The selectors to stringify. - - Returns: - The selector string for the given dictionary. - """ - return ",".join([f"{k}={v}" for k, v in selectors.items()]) - - -def selector_kwargs( - fields: Mapping[str, str] = None, - labels: Mapping[str, str] = None, -) -> Dict[str, str]: - """Create a dictionary of kwargs for Kubernetes object selectors. - - Args: - fields: A mapping of fields used to restrict the returned collection of - Objects to only those which match these field selectors. By default, - no restricting is done. - labels: A mapping of labels used to restrict the returned collection of - Objects to only those which match these label selectors. By default, - no restricting is done. - - Returns: - A dictionary that can be used as kwargs for many Kubernetes API calls for - label and field selectors. - """ - kwargs = {} - if fields is not None: - kwargs["field_selector"] = selector_string(fields) - if labels is not None: - kwargs["label_selector"] = selector_string(labels) - - return kwargs - - -class ConfigMap(KubernetesModel): - """Kubetest wrapper around a Kubernetes `ConfigMap`_ API Object. - - The actual ``kubernetes.client.V1ConfigMap`` instance that this - wraps can be accessed via the ``obj`` instance member. - - This wrapper provides some convenient functionality around the - API Object and provides some state management for the `ConfigMap`_. - - .. _ConfigMap: - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#configmap-v1-core - """ - - obj_type = kubernetes_asyncio.client.V1ConfigMap - - api_clients = { - "preferred": kubernetes_asyncio.client.CoreV1Api, - "v1": kubernetes_asyncio.client.CoreV1Api, - } - - @classmethod - async def read(cls, name: str, namespace: str) -> "ConfigMap": - """Read a ConfigMap by name under the given namespace. - - Args: - name: The name of the Deployment to read. - namespace: The namespace to read the Deployment from. - """ - - async with cls.preferred_client() as api_client: - obj = await api_client.read_namespaced_config_map(name, namespace) - return ConfigMap(obj) - - async def create(self, namespace: str = None) -> None: - """Create the ConfigMap under the given namespace. - - Args: - namespace: The namespace to create the ConfigMap under. - If the ConfigMap was loaded via the kubetest client, the - namespace will already be set, so it is not needed here. - Otherwise, the namespace will need to be provided. 
- """ - if namespace is None: - namespace = self.namespace - - servo.logger.info( - f'creating configmap "{self.name}" in namespace "{self.namespace}"' - ) - servo.logger.debug(f"configmap: {self.obj}") - - self.obj = await self.api_client.create_namespaced_config_map( - namespace=namespace, - body=self.obj, - ) - - async def patch(self) -> None: - """ - Patches a ConfigMap. - """ - self.logger.info(f'patching ConfigMap "{self.name}"') - self.logger.trace(f"ConfigMap: {self.obj}") - async with self.api_client() as api_client: - await api_client.patch_namespaced_config_map( - name=self.name, - namespace=self.namespace, - body=self.obj, - ) - - async def delete( - self, options: kubernetes_asyncio.client.V1DeleteOptions = None - ) -> kubernetes_asyncio.client.V1Status: - """Delete the ConfigMap. - - This method expects the ConfigMap to have been loaded or otherwise - assigned a namespace already. If it has not, the namespace will need - to be set manually. - - Args: - options: Options for ConfigMap deletion. - - Returns: - The status of the delete operation. - """ - if options is None: - options = kubernetes_asyncio.client.V1DeleteOptions() - - servo.logger.info(f'deleting configmap "{self.name}"') - servo.logger.debug(f"delete options: {options}") - servo.logger.debug(f"configmap: {self.obj}") - - return await self.api_client.delete_namespaced_config_map( - name=self.name, - namespace=self.namespace, - body=options, - ) - - async def refresh(self) -> None: - """Refresh the underlying Kubernetes ConfigMap resource.""" - self.obj = await self.api_client.read_namespaced_config_map( - name=self.name, - namespace=self.namespace, - ) - - async def is_ready(self) -> bool: - """Check if the ConfigMap is in the ready state. - - ConfigMaps do not have a "status" field to check, so we will - measure their readiness status by whether or not they exist - on the cluster. - - Returns: - True if in the ready state; False otherwise. 
- """ - try: - await self.refresh() - except: # noqa - return False - - return True - - def dns_subdomainify(name: str) -> str: """ Valid DNS Subdomain Names conform to [RFC 1123](https://tools.ietf.org/html/rfc1123) and must: @@ -5995,12 +2418,12 @@ def dns_labelize(name: str) -> str: def set_container_resource_defaults_from_config( - container: Container, config: ContainerConfiguration + container: V1Container, config: ContainerConfiguration ) -> None: for resource in Resource.values(): # NOTE: cpu/memory stanza in container config resource_config = getattr(config, resource) - requirements = container.get_resource_requirements(resource) + requirements = ContainerHelper.get_resource_requirements(container, resource) servo.logger.debug( f"Loaded resource requirements for '{resource}': {requirements}" ) @@ -6021,4 +2444,6 @@ def set_container_resource_defaults_from_config( servo.logger.debug( f"Setting resource requirements for '{resource}' to: {requirements}" ) - container.set_resource_requirements(resource, requirements) + requirements = ContainerHelper.set_resource_requirements( + container, resource, requirements + ) diff --git a/servo/connectors/kubernetes_helpers/__init__.py b/servo/connectors/kubernetes_helpers/__init__.py new file mode 100644 index 000000000..a428ddbec --- /dev/null +++ b/servo/connectors/kubernetes_helpers/__init__.py @@ -0,0 +1,8 @@ +from .container import * +from .deployment import * +from .namespace import * +from .pod import * +from .replicaset import * +from .service import * +from .statefulset import * +from .util import * diff --git a/servo/connectors/kubernetes_helpers/base.py b/servo/connectors/kubernetes_helpers/base.py new file mode 100644 index 000000000..fccc492c6 --- /dev/null +++ b/servo/connectors/kubernetes_helpers/base.py @@ -0,0 +1,55 @@ +import abc +import devtools +from typing import Any, AsyncIterator, Optional + +import kubernetes_asyncio.watch + +from servo.logging import logger + + +class BaseKubernetesHelper(abc.ABC): + @classmethod + @abc.abstractmethod + async def watch_args(cls, api_object: object) -> AsyncIterator[dict[str, Any]]: + ... + + @classmethod + @abc.abstractmethod + def is_ready(cls, api_object: object, event_type: Optional[str] = None) -> bool: + ... 
+ + @classmethod + async def wait_until_deleted(cls, api_object: object) -> None: + async with cls.watch_args(api_object) as watch_args: + async with kubernetes_asyncio.watch.Watch().stream(**watch_args) as stream: + async for event in stream: + cls.log_watch_event(event) + + if event["type"] == "DELETED": + stream.stop() + return + + @classmethod + async def wait_until_ready(cls, api_object: object) -> None: + async with cls.watch_args(api_object) as watch_args: + async with kubernetes_asyncio.watch.Watch().stream(**watch_args) as stream: + async for event in stream: + cls.log_watch_event(event) + + if cls.is_ready(event["object"], event["type"]): + stream.stop() + return + + @classmethod + def log_watch_event(cls, event: dict[str, Any]) -> None: + event_type: str = event["type"] + obj: dict = event["object"].to_dict() + kind: str = obj.get("kind", "UNKNOWN") + metadata = obj.get("metadata", {}) + name: str = metadata.get("name", "UNKNOWN") + namespace: str = metadata.get("namespace", "UNKNOWN") + logger.debug( + f"watch yielded event: {event_type} on kind {kind} {name}" + f" in namespace {namespace}" + ) + logger.trace(devtools.pformat(obj)) diff --git a/servo/connectors/kubernetes_helpers/base_workload.py b/servo/connectors/kubernetes_helpers/base_workload.py new file mode 100644 index 000000000..4e9774335 --- /dev/null +++ b/servo/connectors/kubernetes_helpers/base_workload.py @@ -0,0 +1,255 @@ +import abc +from collections import defaultdict +from typing import Optional, Union + +from kubernetes_asyncio.client import ( + V1Deployment, + V1StatefulSet, + V1Pod, + V1ContainerStatus, + V1PodCondition, +) +from .base import BaseKubernetesHelper +from .pod import PodHelper + +from servo.errors import AdjustmentFailedError, AdjustmentRejectedError +from servo.logging import logger +from servo.types.api import Adjustment + + +class BaseKubernetesWorkloadHelper(BaseKubernetesHelper): + @classmethod + @abc.abstractmethod + def check_conditions(cls, workload: Union[V1Deployment, V1StatefulSet]) -> None: + ... + + @classmethod + @abc.abstractmethod + async def get_latest_pods( + cls, workload: Union[V1Deployment, V1StatefulSet] + ) -> list[V1Pod]: + ... 
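+ + # Descriptive note (editorial): concrete helpers resolve the pods behind the workload + # (via the latest ReplicaSet for Deployments, via controller-revision-hash for StatefulSets) + # so restart counts and pod-level failures can be inspected by the methods below.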
+ + @classmethod + def is_ready( + cls, + workload: Union[V1Deployment, V1StatefulSet], + event_type: Optional[str] = None, + ) -> bool: + if event_type == "ERROR": + # NOTE: Never seen this in action but apparently it's part of k8s https://github.com/kubernetes/kubernetes/blob/6e0de20fbb4c127d2e45c7a22347c08545fc7a86/staging/src/k8s.io/apimachinery/pkg/watch/watch.go#L48 + raise AdjustmentRejectedError(str(workload), reason="start-failed") + + # TODO other rejection checks + + if workload.metadata.generation != workload.status.observed_generation: + logger.debug( + f"status observed generation ({workload.status.observed_generation}) does not match" + f" metadata generation ({workload.metadata.generation}), returning is_ready=false" + ) + return False + + # Fast fail on undesirable conditions + # NOTE this check only applies to Deployments (see https://github.com/kubernetes/kubernetes/issues/79606) + # TODO/FIXME we should really be checking the (latest) pods in order to get the most accurate info on deployment failures + # test conditions include FailedScheduling, FailedCreate (error looking up service account), + if workload.status.conditions: + cls.check_conditions(workload) + + # NOTE this field is N/A for StatefulSets unless the MaxUnavailableStatefulSet flag is enabled + if unavailable_count := getattr(workload.status, "unavailable_replicas", 0): + logger.debug( + f"found {unavailable_count} unavailable replicas, returning is_ready=false" + ) + return False + + desired_replicas = workload.spec.replicas + logger.debug( + f"Comparing desired replicas ({desired_replicas}) against current status replica counts: {workload.status}" + ) + + # Verify all scale ups and scale downs have completed + replica_counts: list[int] = [ + workload.status.replicas, # NOTE this includes replicas from previous versions allowing us to wait for scaledowns without returning too early + workload.status.ready_replicas, + workload.status.updated_replicas, + ] + # NOTE: available counts is not always present on StatefulSets, assumedly due to the + # beta status of minReadySeconds https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#minimum-ready-seconds + if ( + available_replicas := getattr(workload.status, "available_replicas", None) + ) is not None: + replica_counts.append(available_replicas) + if replica_counts.count(desired_replicas) == len(replica_counts): + # We are done: all the counts match. 
Stop the watch and return + logger.debug(f"{workload.kind} '{workload.metadata.name}' is ready") + return True + + logger.debug("Replica counts out of alignment, returning is_ready=false") + return False + + @classmethod + async def get_restart_count( + cls, workload: Union[V1Deployment, V1StatefulSet] + ) -> int: + count = 0 + for pod in await cls.get_latest_pods(workload): + count += PodHelper.get_restart_count(pod) + + return count + + @classmethod + async def raise_for_status( + cls, + workload: Union[V1Deployment, V1StatefulSet], + adjustments: list[Adjustment], + include_container_logs=False, + ) -> None: + # NOTE: operate off of current state, assuming you have checked is_ready() + status = workload.status + logger.trace(f"current {workload.kind} status is {status}") + if status is None: + raise RuntimeError(f"No such {workload.kind}: {workload.metadata.name}") + + if not status.conditions: + raise RuntimeError( + f"{workload.kind} is not running: {workload.metadata.name}" + ) + + # Check for failure conditions + # NOTE this check only applies to Deployments (see https://github.com/kubernetes/kubernetes/issues/79606) + if status.conditions: + cls.check_conditions(workload) + + await cls.raise_for_failed_pod_adjustments( + workload=workload, + adjustments=adjustments, + include_container_logs=include_container_logs, + ) + + # Catchall + logger.trace( + f"unable to map {workload.kind} status to exception. workload: {workload}" + ) + raise RuntimeError( + f"Unknown {workload.kind} status for '{workload.metadata.name}': {status}" + ) + + @classmethod + async def raise_for_failed_pod_adjustments( + cls, + workload: Union[V1Deployment, V1StatefulSet], + adjustments: list[Adjustment], + include_container_logs=False, + ): + pods = await cls.get_latest_pods(workload=workload) + logger.trace(f"latest pod(s) status {list(map(lambda p: p.status, pods))}") + unschedulable_pods = [ + pod + for pod in pods + if pod.status.conditions + and any(cond.reason == "Unschedulable" for cond in pod.status.conditions) + ] + if unschedulable_pods: + pod_messages = [] + for pod in unschedulable_pods: + cond_msgs = [] + for unschedulable_condition in filter( + lambda cond: cond.reason == "Unschedulable", + pod.status.conditions, + ): + unschedulable_adjustments = list( + filter( + lambda a: a.setting_name in unschedulable_condition.message, + adjustments, + ) + ) + cond_msgs.append( + f"Requested adjustment(s) ({', '.join(map(str, unschedulable_adjustments))}) cannot be scheduled due to \"{unschedulable_condition.message}\"" + ) + pod_messages.append(f"{pod.metadata.name} - {'; '.join(cond_msgs)}") + + raise AdjustmentRejectedError( + f"{len(unschedulable_pods)} pod(s) could not be scheduled for {workload.kind} {workload.metadata.name}: {', '.join(pod_messages)}", + reason="unschedulable", + ) + + image_pull_failed_pods = [ + pod + for pod in pods + if pod.status.container_statuses + and any( + cont_stat.state + and cont_stat.state.waiting + and cont_stat.state.waiting.reason + in ["ImagePullBackOff", "ErrImagePull"] + for cont_stat in pod.status.container_statuses + ) + ] + if image_pull_failed_pods: + raise AdjustmentFailedError( + f"Container image pull failure detected on {len(image_pull_failed_pods)} pods: {', '.join(map(lambda pod: pod.metadata.name, pods))}", + reason="image-pull-failed", + ) + + restarted_pods_container_statuses: list[tuple[V1Pod, V1ContainerStatus]] = [ + (pod, cont_stat) + for pod in pods + for cont_stat in (pod.status.container_statuses or []) + if cont_stat.restart_count > 0 + ] + 
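# NOTE: any container restart observed during the adjustment is treated as instability + # and rejected below, attaching container logs when include_container_logs is enabled. +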
if restarted_pods_container_statuses: + pod_to_counts: dict[str, list] = defaultdict(list) + for (pod, cont_stat) in restarted_pods_container_statuses: + # TODO config to enable logs on per container basis + log_portion = "" + if include_container_logs: + log_portion = f" container logs {await PodHelper.get_logs_for_container(pod, cont_stat.name)}" + pod_to_counts[pod.metadata.name].append( + f"{cont_stat.name} x{cont_stat.restart_count}{log_portion}" + ) + + pod_message = ", ".join( + [f"{key} - {'; '.join(val)}" for key, val in pod_to_counts.items()] + ) + raise AdjustmentRejectedError( + f"{workload.kind} {workload.metadata.name} pod(s) crash restart detected: {pod_message}", + reason="unstable", + ) + + # Unready pod catchall + unready_pod_conds: list[tuple[V1Pod, V1PodCondition]] = [ + (pod, cond) + for pod in pods + for cond in (pod.status.conditions or []) + if cond.type == "Ready" and cond.status == "False" + ] + if unready_pod_conds: + pod_messages = [] + for pod, cond in unready_pod_conds: + pod_message = ( + f"{pod.metadata.name} - (reason {cond.reason}) {cond.message}" + ) + + if include_container_logs and cond.reason == "ContainersNotReady": + unready_container_statuses: list[V1ContainerStatus] = [ + cont_stat + for cont_stat in pod.status.container_statuses or [] + if not cont_stat.ready + ] + container_logs = [ + f"Container {cont_stat.name}:\n{await PodHelper.get_logs_for_container(pod, cont_stat.name)}" + for cont_stat in unready_container_statuses + ] + # NOTE: cant insert newline (backslash) into f-string brackets + pod_message = ( + f"{pod_message} container logs " + + "\n\n--- \n\n".join(container_logs) + ) + + pod_messages.append(pod_message) + + raise AdjustmentRejectedError( + f"Found {len(unready_pod_conds)} unready pod(s) for deployment {pod.metadata.name}: {', '.join(pod_messages)}", + reason="start-failed", + ) diff --git a/servo/connectors/kubernetes_helpers/container.py b/servo/connectors/kubernetes_helpers/container.py new file mode 100644 index 000000000..5c0e4ff5c --- /dev/null +++ b/servo/connectors/kubernetes_helpers/container.py @@ -0,0 +1,109 @@ +import copy + +from typing import Any, cast, Iterable, Optional + +from kubernetes_asyncio.client import V1Container, V1EnvVar, V1ResourceRequirements + +from servo.types.kubernetes import ResourceRequirement + + +class ContainerHelper: + @classmethod + def get_resource_requirements( + cls, container: V1Container, resource_type: str + ) -> dict[ResourceRequirement, Optional[str]]: + """Return a dictionary mapping resource requirements to values for a given resource (e.g., cpu or memory). + + This method is safe to call for containers that do not define any resource requirements (e.g., the `resources` property is None). + + Requirements that are not defined for the named resource are returned as None. For example, a container + that defines CPU requests but does not define limits would return a dict with a `None` value for + the `ResourceRequirement.limit` key. + + Args: + resource_type: The type of resource to get the requirements of (e.g., "cpu" or "memory"). + + Returns: + A dictionary mapping ResourceRequirement enum members to optional string values. 
+ """ + resources: V1ResourceRequirements = getattr( + container, "resources", V1ResourceRequirements() + ) + requirements = {} + for requirement in ResourceRequirement: + # Get the 'requests' or 'limits' nested structure + requirement_subdict = getattr(resources, requirement.resources_key, {}) + if requirement_subdict: + requirements[requirement] = requirement_subdict.get(resource_type) + else: + requirements[requirement] = None + + return requirements + + @classmethod + def set_resource_requirements( + cls, + container: V1Container, + resource_type: str, + requirements: dict[ResourceRequirement, Optional[str]], + ) -> None: + """Sets resource requirements on the container for the values in the given dictionary. + + If no resources have been defined yet, a resources model is provisioned. + If no requirements have been defined for the given resource name, a requirements dictionary is defined. + Values of None are removed from the target requirements. + ResourceRequirement keys that are not present in the dict are not modified. + + Args: + resource_type: The name of the resource to set the requirements of (e.g., "cpu" or "memory"). + requirements: A dict mapping requirements to target values (e.g., `{ResourceRequirement.request: '500m', ResourceRequirement.limit: '2000m'}) + """ + resources: V1ResourceRequirements = copy.copy( + getattr(container, "resources", V1ResourceRequirements()) + ) + + for requirement, value in requirements.items(): + resource_to_values = getattr(resources, requirement.resources_key, {}) + if not resource_to_values: + resource_to_values = {} + + if value is not None: + # NOTE: Coerce to string as values are headed into Kubernetes resource model + resource_to_values[resource_type] = str(value) + else: + resource_to_values.pop(resource_type, None) + setattr(resources, requirement.resources_key, resource_to_values) + + container.resources = resources + + @classmethod + def get_environment_variable( + cls, container: V1Container, variable_name: str + ) -> Optional[str]: + if container.env: + return next( + iter( + v.value or f"valueFrom: {v.value_from}" + for v in cast(Iterable[V1EnvVar], container.env) + if v.name == variable_name + ), + None, + ) + return None + + @classmethod + def set_environment_variable( + cls, container: V1Container, variable_name: str, value: Any + ) -> None: + # V1EnvVar value type is str so value will be converted eventually. 
Might as well do it up front + val_str = str(value) + if "valueFrom" in val_str: + raise ValueError("Adjustment of valueFrom variables is not supported yet") + + new_vars: list[V1EnvVar] = container.env or [] + if new_vars: + # Filter out vars with the same name as the ones we are setting + new_vars = [v for v in new_vars if v.name != variable_name] + + new_vars.append(V1EnvVar(name=variable_name, value=val_str)) + container.env = new_vars diff --git a/servo/connectors/kubernetes_helpers/deployment.py b/servo/connectors/kubernetes_helpers/deployment.py new file mode 100644 index 000000000..e337ed780 --- /dev/null +++ b/servo/connectors/kubernetes_helpers/deployment.py @@ -0,0 +1,352 @@ +from contextlib import asynccontextmanager +import copy +import itertools +import operator +from typing import Any, AsyncIterator, Optional + +from kubernetes_asyncio.client import ( + ApiClient, + ApiException, + AppsV1Api, + V1Container, + V1ContainerPort, + V1Deployment, + V1DeploymentCondition, + V1EnvVar, + V1ObjectMeta, + V1Pod, + V1PodTemplateSpec, + V1ReplicaSet, + V1ResourceRequirements, + V1ServicePort, +) +from servo.connectors.kubernetes_helpers.pod import PodHelper + +from servo.errors import ConnectorError, AdjustmentFailedError, AdjustmentRejectedError +from servo.logging import logger +from .base_workload import BaseKubernetesWorkloadHelper +from .replicaset import ReplicasetHelper +from .service import ServiceHelper +from .util import dict_to_selector, get_containers + + +class DeploymentHelper(BaseKubernetesWorkloadHelper): + @classmethod + @asynccontextmanager + async def api_client(cls) -> AsyncIterator[AppsV1Api]: + async with ApiClient() as api: + yield AppsV1Api(api) + + @classmethod + async def read(cls, name: str, namespace: str) -> V1Deployment: + logger.debug(f'reading deployment "{name}" in namespace "{namespace}"') + async with cls.api_client() as api: + return await api.read_namespaced_deployment(name=name, namespace=namespace) + + @classmethod + async def patch( + cls, + workload: V1Deployment, + api_client_default_headers: dict[str, str] = { + "content-type": "application/strategic-merge-patch+json" + }, + ) -> V1Deployment: + name = workload.metadata.name + namespace = workload.metadata.namespace + logger.debug(f'patching deployment "{name}" in namespace "{namespace}"') + async with cls.api_client() as api_client: + # TODO: move up to baser class helper method + for k, v in (api_client_default_headers or {}).items(): + api_client.api_client.set_default_header(k, v) + + return await api_client.patch_namespaced_deployment( + name=name, + namespace=namespace, + body=workload, + ) + + @classmethod + @asynccontextmanager + async def watch_args(cls, workload: V1Deployment) -> AsyncIterator[dict[str, Any]]: + async with cls.api_client() as api: + metadata: V1ObjectMeta = workload.metadata + watch_args = {"func": api.list_namespaced_deployment} + watch_args["namespace"] = metadata.namespace + watch_args["label_selector"] = dict_to_selector(metadata.labels) + watch_args["field_selector"] = dict_to_selector( + {"metadata.name": metadata.name} + ) + yield watch_args + + @classmethod + def check_conditions(cls, workload: V1Deployment) -> None: + conditions: list[V1DeploymentCondition] = workload.status.conditions + for condition in conditions: + if condition.type == "Available": + if condition.status == "True": + # If we hit on this and have not raised yet we are good to go + break + elif condition.status in ("False", "Unknown"): + # Condition has not yet been met, log status and 
continue monitoring + logger.debug( + f"Condition({condition.type}).status == '{condition.status}' ({condition.reason}): {condition.message}" + ) + else: + raise AdjustmentFailedError( + f"encountered unexpected Condition status '{condition.status}'" + ) + + elif condition.type == "ReplicaFailure": + # TODO/FIXME Can't do RCA without getting the ReplicaSet + raise AdjustmentRejectedError( + f"ReplicaFailure: message='{condition.status.message}', reason='{condition.status.reason}'", + reason="start-failed", + ) + + elif condition.type == "Progressing": + if condition.status in ("True", "Unknown"): + # Still working + logger.debug( + f"{workload.kind} update is progressing: {condition}", + ) + break + elif condition.status == "False": + raise AdjustmentRejectedError( + f"ProgressionFailure: message='{condition.status.message}', reason='{condition.status.reason}'", + reason="start-failed", + ) + else: + raise AdjustmentFailedError( + f"unknown {workload.kind} status condition: {condition}" + ) + + @classmethod + async def get_latest_pods(cls, workload: V1Deployment) -> list[V1Pod]: + latest_replicaset: V1ReplicaSet = await cls.get_latest_replicaset(workload) + # NOTE Can skip checking owner references due to Deployment setting + # pod-template-hash on its ReplicaSets + return await PodHelper.list_pods_with_labels( + workload.metadata.namespace, latest_replicaset.spec.selector.match_labels + ) + + @classmethod + async def get_latest_replicaset(cls, workload: V1Deployment) -> V1ReplicaSet: + rs_list = await ReplicasetHelper.list_replicasets_with_labels( + workload.metadata.namespace, workload.spec.selector.match_labels + ) + # Verify all returned RS have this deployment as an owner + rs_list = [ + rs + for rs in rs_list + if rs.metadata.owner_references + and any( + ownRef.kind == "Deployment" and ownRef.uid == workload.metadata.uid + for ownRef in rs.metadata.owner_references + ) + ] + if not rs_list: + raise ConnectorError( + f'Unable to locate replicaset(s) for deployment "{workload.metadata.name}"' + ) + if missing_revision_rsets := list( + filter( + lambda rs: "deployment.kubernetes.io/revision" + not in rs.metadata.annotations, + rs_list, + ) + ): + raise ConnectorError( + f'Unable to determine latest replicaset for deployment "{workload.metadata.name}" due to missing revision' + f' annotation in replicaset(s) "{", ".join(list(map(lambda rs: rs.metadata.name, missing_revision_rsets)))}"' + ) + return sorted( + rs_list, + key=lambda rs: int( + rs.metadata.annotations["deployment.kubernetes.io/revision"] + ), + reverse=True, + )[0] + + # NOTE this method may need to become async if other workload types need their spec.template deserialized + # by the kubernetes_asyncio.client + @classmethod + def get_pod_template_spec_copy(cls, workload: V1Deployment) -> V1PodTemplateSpec: + """Return a deep copy of the pod template spec. Eg. for creation of a tuning pod""" + return copy.deepcopy(workload.spec.template) + + @classmethod + async def inject_sidecar( + cls, + workload: V1Deployment, + name: str, + image: str, + *, + service: Optional[str] = None, + port: Optional[int] = None, + index: Optional[int] = None, + service_port: int = 9980, + ) -> None: + """ + Injects an Envoy sidecar into a target Deployment that proxies a service + or literal TCP port, generating scrapeable metrics usable for optimization. + + The service or port argument must be provided to define how traffic is proxied + between the Envoy sidecar and the container responsible for fulfilling the request. 
+ + Args: + name: The name of the sidecar to inject. + image: The container image for the sidecar container. + deployment: Name of the target Deployment to inject the sidecar into. + service: Name of the service to proxy. Envoy will accept ingress traffic + on the service port and reverse proxy requests back to the original + target container. + port: The name or number of a port within the Deployment to wrap the proxy around. + index: The index at which to insert the sidecar container. When `None`, the sidecar is appended. + service_port: The port to receive ingress traffic from an upstream service. + """ + if not (service or port): + raise ValueError(f"a service or port must be given") + + if isinstance(port, str) and port.isdigit(): + port = int(port) + + # check for a port conflict + container_ports: list[V1ContainerPort] = list( + itertools.chain(*[c.ports or [] for c in get_containers(workload=workload)]) + ) + if service_port in list( + map(operator.attrgetter("container_port"), container_ports) + ): + raise ValueError( + f"Port conflict: {workload.kind} '{workload.metadata.name}' already exposes" + f" port {service_port} through an existing container" + ) + + # lookup the port on the target service + if service: + try: + service_obj = await ServiceHelper.read( + service, workload.metadata.namespace + ) + except ApiException as error: + if error.status == 404: + raise ValueError(f"Unknown Service '{service}'") from error + else: + raise error + serv_port_list: list[V1ServicePort] = service_obj.spec.ports + + if not port: + port_count = len(serv_port_list) + if port_count == 0: + raise ValueError( + f"Target Service '{service}' does not expose any ports" + ) + elif port_count > 1: + raise ValueError( + f"Target Service '{service}' exposes multiple ports -- target port must be specified" + ) + port_obj = serv_port_list[0] + else: + if isinstance(port, int): + port_obj = next( + filter(lambda p: p.port == port, serv_port_list), + None, + ) + elif isinstance(port, str): + port_obj = next( + filter(lambda p: p.name == port, serv_port_list), + None, + ) + else: + raise TypeError( + f"Unable to resolve port value of type {port.__class__.__name__} (port={port})" + ) + + if not port_obj: + raise ValueError( + f"Port '{port}' does not exist in the Service '{service}'" + ) + + # resolve symbolic name in the service target port to a concrete container port + if isinstance(port_obj.target_port, str): + container_port_obj: V1ContainerPort = next( + filter(lambda p: p.name == port_obj.target_port, container_ports), + None, + ) + if not container_port_obj: + raise ValueError( + f"Port '{port_obj.target_port}' could not be resolved to a destination container port" + ) + + container_port = container_port_obj.container_port + else: + container_port = port_obj.target_port + + else: + # find the container port + container_port_obj = next( + filter(lambda p: p.container_port == port, container_ports), None + ) + if not container_port_obj: + raise ValueError( + f"Port '{port}' could not be resolved to a destination container port" + ) + + container_port = container_port_obj.container_port + + # build the sidecar container + container = V1Container( + name=name, + image=image, + image_pull_policy="IfNotPresent", + resources=V1ResourceRequirements( + requests={"cpu": "125m", "memory": "128Mi"}, + limits={"cpu": "250m", "memory": "256Mi"}, + ), + env=[ + V1EnvVar( + name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value=str(service_port) + ), + V1EnvVar( + name="OPSANI_ENVOY_PROXIED_CONTAINER_PORT", + 
value=str(container_port), + ), + V1EnvVar(name="OPSANI_ENVOY_PROXY_METRICS_PORT", value="9901"), + ], + ports=[ + V1ContainerPort(name="opsani-proxy", container_port=service_port), + V1ContainerPort(name="opsani-metrics", container_port=9901), + ], + ) + + # add the sidecar to the Deployment + retries = 3 + while retries > 0: + if index is None: + workload.spec.template.spec.containers.append(container) + else: + workload.spec.template.spec.containers.insert(index, container) + + # patch the deployment + try: + await cls.patch(workload=workload) + except ApiException as ae: + retries -= 1 + if retries == 0: + logger.error("Failed to inject sidecar after 3 retries") + raise + + if ae.status == 409 and ae.reason == "Conflict": + # If we have a conflict, just load the existing object and try again + workload = await cls.read( + workload.metadata.name, workload.metadata.namespace + ) + else: + raise + else: + # No need to retry if no exception raised + break + + +# Run a dummy instantiation to detect missing ABC implementations +DeploymentHelper() diff --git a/servo/connectors/kubernetes_helpers/namespace.py b/servo/connectors/kubernetes_helpers/namespace.py new file mode 100644 index 000000000..7ceb37df7 --- /dev/null +++ b/servo/connectors/kubernetes_helpers/namespace.py @@ -0,0 +1,20 @@ +import contextlib +from typing import AsyncIterator + +from kubernetes_asyncio.client import CoreV1Api, V1Namespace, ApiClient + +from servo.logging import logger + + +class NamespaceHelper: + @classmethod + @contextlib.asynccontextmanager + async def api_client(cls) -> AsyncIterator[CoreV1Api]: + async with ApiClient() as api: + yield CoreV1Api(api) + + @classmethod + async def read(cls, name: str) -> V1Namespace: + logger.debug(f'reading namespace "{name}"') + async with cls.api_client() as api: + return await api.read_namespace(name=name) diff --git a/servo/connectors/kubernetes_helpers/pod.py b/servo/connectors/kubernetes_helpers/pod.py new file mode 100644 index 000000000..2e52d76ad --- /dev/null +++ b/servo/connectors/kubernetes_helpers/pod.py @@ -0,0 +1,292 @@ +import asyncio +import contextlib +from functools import partial +from typing import Any, AsyncIterator, cast, Optional, Union + +from kubernetes_asyncio.client import ( + ApiClient, + ApiException, + CoreV1Api, + V1ContainerStatus, + V1ObjectMeta, + V1Pod, + V1PodList, + V1PodCondition, + V1PodStatus, + V1Status, +) + +from servo.errors import AdjustmentFailedError, AdjustmentRejectedError, EventError +from servo.logging import logger +from servo.types.api import Adjustment +from servo.types.kubernetes import ContainerLogOptions +from .base import BaseKubernetesHelper +from .util import dict_to_selector + +# FIXME should be coming from servo.types.telemetry which does not exist (yet) +ONE_MiB = 1048576 + + +class PodHelper(BaseKubernetesHelper): + @classmethod + @contextlib.asynccontextmanager + async def api_client(cls) -> AsyncIterator[CoreV1Api]: + async with ApiClient() as api: + yield CoreV1Api(api) + + @classmethod + async def create(cls, workload: V1Pod): + metadata: V1ObjectMeta = workload.metadata + logger.info( + f'creating pod "{metadata.name}" in namespace "{metadata.namespace}"' + ) + async with cls.api_client() as api: + return await api.create_namespaced_pod( + namespace=metadata.namespace, body=workload + ) + + @classmethod + async def read(cls, name: str, namespace: str) -> V1Pod: + logger.debug(f'reading pod "{name}" in namespace "{namespace}"') + async with cls.api_client() as api: + return await 
api.read_namespaced_pod(name=name, namespace=namespace) + + @classmethod + async def delete(cls, pod: V1Pod) -> None: + metadata: V1ObjectMeta = pod.metadata + logger.debug( + f'deleting pod "{metadata.name}" in namespace "{metadata.namespace}"' + ) + async with cls.api_client() as api: + return await api.delete_namespaced_pod( + name=metadata.name, namespace=metadata.namespace + ) + + @classmethod + @contextlib.asynccontextmanager + async def watch_args(cls, pod: V1Pod) -> AsyncIterator[dict[str, Any]]: + async with cls.api_client() as api: + metadata: V1ObjectMeta = pod.metadata + watch_args = {"func": api.list_namespaced_pod} + watch_args["namespace"] = metadata.namespace + watch_args["label_selector"] = dict_to_selector(metadata.labels) + watch_args["field_selector"] = dict_to_selector( + {"metadata.name": metadata.name} + ) + yield watch_args + + @classmethod + def is_ready(cls, pod: V1Pod, event_type: Optional[str] = None) -> bool: + # implementation derived from official go client + # https://github.com/kubernetes/kubernetes/blob/096dafe757f897a9d1d9f6160451813062eec063/test/utils/conditions.go#L33 + status: V1PodStatus = pod.status + logger.trace(f"current pod status is {status}") + if status is None: + return False + + phase = status.phase + logger.debug(f"current pod phase is {phase}") + if phase != "Running": + return False + + conditions: list[V1PodCondition] = status.conditions or [] + logger.debug(f"checking status conditions {conditions}") + ready_condition = next(iter((c for c in conditions if c.type == "Ready")), None) + if ready_condition and ready_condition.status == "True": + return True + + logger.debug(f"unable to find ready=true, continuing to wait...") + return False + + @classmethod + async def list_pods_with_labels( + cls, namespace: str, match_labels: dict[str, str] + ) -> list[V1Pod]: + async with cls.api_client() as api: + pod_list: V1PodList = await api.list_namespaced_pod( + namespace=namespace, label_selector=dict_to_selector(match_labels) + ) + return pod_list.items or [] + + @classmethod + def get_restart_count(cls, pod: V1Pod, container_name: Optional[str] = None) -> int: + """Return restart count for all containers by default or a specific container if the optional container_name + is specified + """ + if pod.status is None or pod.status.container_statuses is None: + return 0 + + total = 0 + for container_status in pod.status.container_statuses: + if container_status.name == container_name: + return container_status.restart_count + + total += container_status.restart_count + + if container_name: + raise RuntimeError( + f"Unable to determine container status for {container_name} from pod {pod}" + ) + + return total + + @classmethod + async def raise_for_status( + cls, + workload: V1Pod, + adjustments: list[Adjustment], + include_container_logs=False, + ) -> None: + """Raise an exception if the Pod status is not not ready.""" + # NOTE: operate off of current state, assuming you have checked is_ready() + status: V1PodStatus = workload.status + logger.trace(f"current pod status is {status}") + + if not status.conditions: + raise EventError(f"Pod is not running: {workload.metadata.name}") + + logger.debug(f"checking container statuses: {status.container_statuses}") + if status.container_statuses: + for cont_stat in cast(list[V1ContainerStatus], status.container_statuses): + if ( + cont_stat.state + and cont_stat.state.waiting + and cont_stat.state.waiting.reason + in ["ImagePullBackOff", "ErrImagePull"] + ): + raise AdjustmentFailedError( + f"Container image 
pull failure detected in container {cont_stat.name}", + reason="image-pull-failed", + ) + + restarted_container_statuses: list[V1ContainerStatus] = [ + cont_stat + for cont_stat in cast(list[V1ContainerStatus], status.container_statuses) + or [] + if cont_stat.restart_count > 0 + ] + if restarted_container_statuses: + container_messages = [ + ( + f"{cont_stat.name} x{cont_stat.restart_count}" + # TODO enable logs config on per container basis + f"container logs {'DISABLED' if not include_container_logs else await cls.get_logs_for_container(workload, cont_stat.name)}" + ) + for cont_stat in restarted_container_statuses + ] + raise AdjustmentRejectedError( + # NOTE: cant use f-string with newline (backslash) insertion + ( + f"Tuning optimization {workload.metadata.name} crash restart detected on container(s): " + + ", \n".join(container_messages) + ), + reason="unstable", + ) + + logger.debug(f"checking status conditions {status.conditions}") + for cond in cast(list[V1PodCondition], status.conditions): + if cond.reason == "Unschedulable": + # FIXME: The servo rejected error should be raised further out. This should be a generic scheduling error + unschedulable_adjustments = [ + a for a in adjustments if a.setting_name in cond.message + ] + raise AdjustmentRejectedError( + f"Requested adjustment(s) ({', '.join(map(str, unschedulable_adjustments))}) cannot be scheduled due to \"{cond.message}\"", + reason="unschedulable", + ) + + if cond.type == "Ready" and cond.status == "False": + rejection_message = cond.message + if include_container_logs and cond.reason == "ContainersNotReady": + unready_container_statuses = [ + cont_stat + for cont_stat in cast( + list[V1ContainerStatus], status.container_statuses + ) + or [] + if not cont_stat.ready + ] + container_logs = [ + await cls.get_logs_for_container(workload, cs.name) + for cs in unready_container_statuses + ] + # NOTE: cant insert newline (backslash) into f-string brackets + rejection_message = ( + f"{rejection_message} container logs " + + "\n\n--- \n\n".join(container_logs) + ) + raise AdjustmentRejectedError( + f"(reason {cond.reason}) {rejection_message}", reason="start-failed" + ) + + # Catchall + logger.error( + f"unable to determine type of error to raise for pod {workload.metadata.name} status: {status}" + ) + raise EventError(f"Unknown Pod status for '{workload.metadata.name}': {status}") + + @classmethod + async def get_logs_for_container( + cls, + pod: V1Pod, + container_name: str, + limit_bytes: int = ONE_MiB, + logs_selector: ContainerLogOptions = ContainerLogOptions.both, + ) -> list[str]: + """ + Get container logs from the current pod for the container's whose statuses are provided in the list + + Args: + container_statuses (list[V1ContainerStatus]): The name of the Container. 
+ limit_bytes (int): Maximum bytes to provide per log (NOTE: this will be 2x per container ) + logs_selector (ContainerLogOptions): "previous", "current", or "both" + + Returns: + list[str]: List of logs per container in the same order as the list of container_statuses + """ + read_logs_partial = partial( + cls.try_get_container_single_log, + pod=pod, + container_name=container_name, + limit_bytes=limit_bytes, + ) + if logs_selector == ContainerLogOptions.both: + return ( + f"previous (crash):\n {await read_logs_partial(previous=True)} \n\n--- \n\n" + f"current (latest):\n {await read_logs_partial(previous=False)}" + ) + else: + return await read_logs_partial( + previous=(logs_selector == ContainerLogOptions.previous) + ) + + @classmethod + async def try_get_container_single_log( + cls, + pod: V1Pod, + container_name: str, + limit_bytes: int = ONE_MiB, + previous=False, + ) -> str: + """Get log for a container run while handling common error cases (eg. Not Found)""" + async with cls.api_client() as api: + try: + return await api.read_namespaced_pod_log( + name=pod.metadata.name, + namespace=pod.metadata.namespace, + container=container_name, + limit_bytes=limit_bytes, + previous=previous, + ) + except ApiException as ae: + if ae.status == 400: + ae.data = ae.body + status: V1Status = api.api_client.deserialize(ae, "V1Status") + if (status.message or "").endswith("not found"): + return "Logs not found" + + raise + + +# Run a dummy instantiation to detect missing ABC implementations +PodHelper() diff --git a/servo/connectors/kubernetes_helpers/replicaset.py b/servo/connectors/kubernetes_helpers/replicaset.py new file mode 100644 index 000000000..adb5fc28d --- /dev/null +++ b/servo/connectors/kubernetes_helpers/replicaset.py @@ -0,0 +1,39 @@ +import contextlib +from typing import AsyncIterator + +from kubernetes_asyncio.client import ( + ApiClient, + AppsV1Api, + V1ReplicaSet, + V1ReplicaSetList, +) + +from servo.logging import logger +from .util import dict_to_selector + + +class ReplicasetHelper: + @classmethod + @contextlib.asynccontextmanager + async def api_client(cls) -> AsyncIterator[AppsV1Api]: + async with ApiClient() as api: + yield AppsV1Api(api) + + @classmethod + async def read(cls, name: str, namespace: str) -> V1ReplicaSet: + logger.debug(f'reading replicaset "{name}" in namespace "{namespace}"') + async with cls.api_client() as api: + return await api.read_namespaced_replica_set(name=name, namespace=namespace) + + @classmethod + async def list_replicasets_with_labels( + cls, namespace: str, match_labels: dict[str, str] + ) -> list[V1ReplicaSet]: + async with cls.api_client() as api: + rs_list: V1ReplicaSetList = await api.list_namespaced_replica_set( + namespace=namespace, label_selector=dict_to_selector(match_labels) + ) + return rs_list.items or [] + + +ReplicasetHelper() diff --git a/servo/connectors/kubernetes_helpers/service.py b/servo/connectors/kubernetes_helpers/service.py new file mode 100644 index 000000000..01cc8ab23 --- /dev/null +++ b/servo/connectors/kubernetes_helpers/service.py @@ -0,0 +1,60 @@ +import contextlib +from typing import cast, AsyncIterator, Optional, Union + +from kubernetes_asyncio.client import CoreV1Api, V1Service, V1ServicePort, ApiClient + +from servo.logging import logger + + +class ServiceHelper: + @classmethod + @contextlib.asynccontextmanager + async def api_client(cls) -> AsyncIterator[CoreV1Api]: + async with ApiClient() as api: + yield CoreV1Api(api) + + @classmethod + async def read(cls, name: str, namespace: str) -> V1Service: + 
logger.debug(f'reading service "{name}" in namespace "{namespace}"') + async with cls.api_client() as api: + return await api.read_namespaced_service(name=name, namespace=namespace) + + @classmethod + async def patch( + cls, + workload: V1Service, + api_client_default_headers: dict[str, str] = { + "content-type": "application/strategic-merge-patch+json" + }, + ) -> V1Service: + name = workload.metadata.name + namespace = workload.metadata.namespace + logger.debug(f'patching service "{name}" in namespace "{namespace}"') + async with cls.api_client() as api_client: + # TODO: move up to base class helper method + for k, v in (api_client_default_headers or {}).items(): + api_client.api_client.set_default_header(k, v) + + return await api_client.patch_namespaced_service( + name=name, + namespace=namespace, + body=workload, + ) + + @classmethod + def find_port( + cls, service: V1Service, selector: Union[str, int] + ) -> Optional[V1ServicePort]: + for port in cast(list[V1ServicePort], service.spec.ports): + if isinstance(selector, str): + if port.name == selector: + return port + elif isinstance(selector, int): + if port.port == selector: + return port + else: + raise TypeError( + f"Unknown port selector type '{selector.__class__.__name__}': {selector}" + ) + + return None diff --git a/servo/connectors/kubernetes_helpers/statefulset.py b/servo/connectors/kubernetes_helpers/statefulset.py new file mode 100644 index 000000000..ae15194ce --- /dev/null +++ b/servo/connectors/kubernetes_helpers/statefulset.py @@ -0,0 +1,85 @@ +from contextlib import asynccontextmanager +from typing import Any, AsyncIterator + +from kubernetes_asyncio.client import ( + ApiClient, + AppsV1Api, + V1StatefulSet, + V1ObjectMeta, + V1Pod, +) + +from servo.logging import logger +from .base_workload import BaseKubernetesWorkloadHelper +from .pod import PodHelper +from .util import dict_to_selector + + +class StatefulSetHelper(BaseKubernetesWorkloadHelper): + @classmethod + @asynccontextmanager + async def api_client(cls) -> AsyncIterator[AppsV1Api]: + async with ApiClient() as api: + yield AppsV1Api(api) + + @classmethod + async def read(cls, name: str, namespace: str) -> V1StatefulSet: + logger.debug(f'reading statefulset "{name}" in namespace "{namespace}"') + async with cls.api_client() as api: + return await api.read_namespaced_stateful_set( + name=name, namespace=namespace + ) + + @classmethod + async def patch( + cls, + workload: V1StatefulSet, + api_client_default_headers: dict[str, str] = { + "content-type": "application/strategic-merge-patch+json" + }, + ) -> V1StatefulSet: + name = workload.metadata.name + namespace = workload.metadata.namespace + logger.debug(f'patching statefulset "{name}" in namespace "{namespace}"') + async with cls.api_client() as api_client: + # TODO: move up to base class helper method + for k, v in (api_client_default_headers or {}).items(): + api_client.api_client.set_default_header(k, v) + + return await api_client.patch_namespaced_stateful_set( + name=name, + namespace=namespace, + body=workload, + ) + + @classmethod + @asynccontextmanager + async def watch_args(cls, workload: V1StatefulSet) -> AsyncIterator[dict[str, Any]]: + async with cls.api_client() as api: + metadata: V1ObjectMeta = workload.metadata + watch_args = {"func": api.list_namespaced_stateful_set} + watch_args["namespace"] = metadata.namespace + watch_args["label_selector"] = dict_to_selector(metadata.labels) + watch_args["field_selector"] = dict_to_selector( + {"metadata.name": metadata.name} + ) + yield watch_args + + 
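# Illustrative usage sketch (hypothetical names, not part of the surrounding logic): + #   sset = await StatefulSetHelper.read("my-statefulset", "default") + #   await StatefulSetHelper.wait_until_ready(sset) +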
@classmethod + def check_conditions(cls, workload: V1StatefulSet) -> None: + # https://github.com/kubernetes/kubernetes/issues/79606 + raise NotImplementedError("StatefulSets do not define conditions") + + @classmethod + async def get_latest_pods(cls, workload: V1StatefulSet) -> list[V1Pod]: + # make copy of selector dict for safe updates + pod_labels = dict(workload.spec.selector.match_labels) + pod_labels["controller-revision-hash"] = workload.status.update_revision + return await PodHelper.list_pods_with_labels( + workload.metadata.namespace, pod_labels + ) + # TODO? validate pod owner references? + + +# Run a dummy instantiation to detect missing ABC implementations +StatefulSetHelper() diff --git a/servo/connectors/kubernetes_helpers/util.py b/servo/connectors/kubernetes_helpers/util.py new file mode 100644 index 000000000..9160a342d --- /dev/null +++ b/servo/connectors/kubernetes_helpers/util.py @@ -0,0 +1,33 @@ +from typing import Mapping, Optional, Union + +from kubernetes_asyncio.client import ( + V1Container, + V1Pod, + V1StatefulSet, + V1Deployment, + V1PodTemplateSpec, +) + + +def dict_to_selector(mapping: Mapping[str, str]) -> str: + # https://stackoverflow.com/a/17888002 + return ",".join(["=".join((k, v)) for k, v in mapping.items()]) + + +def get_containers( + workload: Union[V1Pod, V1PodTemplateSpec, V1StatefulSet, V1Deployment] +) -> list[V1Container]: + if isinstance(workload, (V1Pod, V1PodTemplateSpec)): + return workload.spec.containers + else: + return workload.spec.template.spec.containers + + +# NOTE this method may need to become async if other workload types need their container object deserialized +# by the kubernetes_asyncio.client +def find_container( + workload: Union[V1Pod, V1PodTemplateSpec, V1StatefulSet, V1Deployment], name: str +) -> Optional[V1Container]: + return next( + iter((c for c in get_containers(workload=workload) if c.name == name)), None + ) diff --git a/servo/connectors/opsani_dev.py b/servo/connectors/opsani_dev.py index 6efc92bc6..330eeffd3 100644 --- a/servo/connectors/opsani_dev.py +++ b/servo/connectors/opsani_dev.py @@ -2,13 +2,24 @@ import json import operator import os -from typing import Dict, List, Optional, Union, Type +from typing import cast, Dict, List, Optional, Union, Type import kubernetes_asyncio +import kubernetes_asyncio.client import pydantic import servo +import servo.types +from servo.types.kubernetes import Resource import servo.connectors.kubernetes +from servo.connectors.kubernetes_helpers import ( + find_container, + ContainerHelper, + DeploymentHelper, + NamespaceHelper, + PodHelper, + ServiceHelper, +) import servo.connectors.kube_metrics import servo.connectors.prometheus @@ -58,8 +69,19 @@ class Memory(servo.connectors.kubernetes.Memory): class OpsaniDevConfiguration(servo.BaseConfiguration): namespace: str - deployment: Optional[str] - rollout: Optional[str] + workload_name: str = pydantic.Field( + alias="deployment", + env=["deployment", "workload"], + title="Workload Name", + description=( + "Name of the targeted workload (NOTE: the workload_name key should be used for this config going" + " forward. 
The deployment key is supported for backwards compatibility)" + ), + ) # alias to maintain backward compatibility + workload_kind: str = pydantic.Field( + default="Deployment", + regex=r"^([Dd]eployment)$", + ) container: str service: str port: Optional[Union[pydantic.StrictInt, str]] = None @@ -81,21 +103,14 @@ class OpsaniDevConfiguration(servo.BaseConfiguration): description="Disable to prevent a canary strategy", ) - @pydantic.root_validator - def check_deployment_and_rollout(cls, values): - if values.get("deployment") is not None and values.get("rollout") is not None: - raise ValueError("Configuration cannot specify both rollout and deployment") - - if values.get("deployment") is None and values.get("rollout") is None: - raise ValueError("Configuration must specify either rollout or deployment") - - return values + class Config(servo.AbstractBaseConfiguration.Config): + allow_population_by_field_name = True @classmethod def generate(cls, **kwargs) -> "OpsaniDevConfiguration": return cls( namespace="default", - deployment="app-deployment", + workload_name="app-deployment", container="main", service="app", cpu=CPU(min="250m", max="4000m"), @@ -133,8 +148,8 @@ def generate_kubernetes_config( replicas = servo.Replicas(min=0, max=99999, pinned=True) - main_config = servo.connectors.kubernetes.DeploymentConfiguration( - name=(self.deployment or self.rollout), + workload_kwargs = dict( + name=self.workload_name, strategy=strategy, replicas=replicas, containers=[ @@ -148,25 +163,27 @@ def generate_kubernetes_config( ) ], ) - if self.deployment: - main_arg = {"deployments": [main_config]} - elif self.rollout: - main_arg = { - "rollouts": [ - servo.connectors.kubernetes.RolloutConfiguration.parse_obj( - main_config.dict(exclude_none=True) - ) - ] - } - - return servo.connectors.kubernetes.KubernetesConfiguration( + main_config_kwargs = dict( namespace=self.namespace, description="Update the namespace, deployment, etc. to match your Kubernetes cluster", timeout=self.timeout, settlement=self.settlement, container_logs_in_error_status=self.container_logs_in_error_status, create_tuning_pod=self.create_tuning_pod, - **main_arg, + ) + if self.workload_kind.lower() == "deployment": + workload_config = servo.connectors.kubernetes.DeploymentConfiguration( + **workload_kwargs + ) + main_config_kwargs["deployments"] = [workload_config] + + else: + raise servo.EventError( + f"Incompatible workload_kind configured: {self.workload_kind}" + ) + + return servo.connectors.kubernetes.KubernetesConfiguration( + **main_config_kwargs, **kwargs, ) @@ -278,86 +295,62 @@ def generate_kube_metrics_config( return servo.connectors.kube_metrics.KubeMetricsConfiguration( namespace=self.namespace, - name=self.deployment or self.rollout, - kind="Deployment" if self.deployment else "Rollout", + name=self.workload_name, + kind=self.workload_kind, container=self.container, metrics_to_collect=metrics, **kwargs, ) -class BaseOpsaniDevChecks(servo.BaseChecks, abc.ABC): +class OpsaniDevChecks(servo.BaseChecks): config: OpsaniDevConfiguration + # FIXME make this a property of worklod helper? @property - @abc.abstractmethod - def controller_type_name(self) -> str: - ... - - @property - @abc.abstractmethod - def config_controller_name(self) -> str: - ... - - @property - @abc.abstractmethod - def controller_class( - self, - ) -> Type[ - Union[ - servo.connectors.kubernetes.Deployment, servo.connectors.kubernetes.Rollout - ] - ]: - ... 
- - @property - @abc.abstractmethod def required_permissions(self) -> List[servo.connectors.kubernetes.PermissionSet]: - ... - - @abc.abstractmethod - async def _get_port_forward_target(self) -> str: - ... + if self.config.workload_kind.lower() == "deployment": + return KUBERNETES_PERMISSIONS + else: + raise servo.EventError( + f"Incompatible workload_kind configured: {self.workload_kind}" + ) - @abc.abstractmethod def _get_generated_controller_config( self, config: servo.connectors.kubernetes.KubernetesConfiguration - ) -> Union[ - servo.connectors.kubernetes.DeploymentConfiguration, - servo.connectors.kubernetes.RolloutConfiguration, - ]: - ... - - @abc.abstractmethod - def _get_controller_service_selector( - self, - controller: Union[ - servo.connectors.kubernetes.Deployment, servo.connectors.kubernetes.Rollout - ], - ) -> Dict[str, str]: - ... + ) -> servo.connectors.kubernetes.DeploymentConfiguration: + if self.config.workload_kind.lower() == "deployment": + return config.deployments[0] + else: + raise servo.EventError( + f"Incompatible workload_kind configured: {self.workload_kind}" + ) - @abc.abstractmethod - def _get_controller_patch_target( - self, - controller: Union[ - servo.connectors.kubernetes.Deployment, servo.connectors.kubernetes.Rollout - ], - ) -> str: - ... + # NOTE for rollout support, will need to get current replicaset of rollout as target + async def _get_port_forward_target(self) -> str: + return f"{self.config.workload_kind}/{self.config.workload_name}" + + @property + def workload_helper(self) -> type[DeploymentHelper]: + if self.config.workload_kind.lower() == "deployment": + return DeploymentHelper + else: + raise servo.EventError( + f"Incompatible workload_kind configured: {self.workload_kind}" + ) ## # Kubernetes essentials @servo.checks.require("Connectivity to Kubernetes") async def check_connectivity(self) -> None: - async with kubernetes_asyncio.client.api_client.ApiClient() as api: + async with kubernetes_asyncio.client.ApiClient() as api: v1 = kubernetes_asyncio.client.VersionApi(api) await v1.get_code() @servo.checks.warn("Kubernetes version") async def check_version(self) -> None: - async with kubernetes_asyncio.client.api_client.ApiClient() as api: + async with kubernetes_asyncio.client.ApiClient() as api: v1 = kubernetes_asyncio.client.VersionApi(api) version = await v1.get_code() assert int(version.major) >= 1 @@ -366,27 +359,23 @@ async def check_version(self) -> None: @servo.checks.require("Kubernetes permissions") async def check_permissions(self) -> None: - async with kubernetes_asyncio.client.api_client.ApiClient() as api: + async with kubernetes_asyncio.client.ApiClient() as api: v1 = kubernetes_asyncio.client.AuthorizationV1Api(api) for permission in self.required_permissions: for resource in permission.resources: for verb in permission.verbs: - attributes = ( - kubernetes_asyncio.client.models.V1ResourceAttributes( - namespace=self.config.namespace, - group=permission.group, - resource=resource, - verb=verb, - ) + attributes = kubernetes_asyncio.client.V1ResourceAttributes( + namespace=self.config.namespace, + group=permission.group, + resource=resource, + verb=verb, ) - spec = kubernetes_asyncio.client.models.V1SelfSubjectAccessReviewSpec( + spec = kubernetes_asyncio.client.V1SelfSubjectAccessReviewSpec( resource_attributes=attributes ) - review = ( - kubernetes_asyncio.client.models.V1SelfSubjectAccessReview( - spec=spec - ) + review = kubernetes_asyncio.client.V1SelfSubjectAccessReview( + spec=spec ) access_review = await 
v1.create_self_subject_access_review( body=review @@ -397,32 +386,32 @@ async def check_permissions(self) -> None: @servo.checks.require('Namespace "{self.config.namespace}" is readable') async def check_opsani_dev_kubernetes_namespace(self) -> None: - await servo.connectors.kubernetes.Namespace.read(self.config.namespace) + await NamespaceHelper.read(self.config.namespace) @servo.checks.require( - '{self.controller_type_name} "{self.config_controller_name}" is readable' + '{self.config.workload_kind} "{self.config.workload_name}" is readable' ) async def check_opsani_dev_kubernetes_controller(self) -> None: - await self.controller_class.read( - self.config_controller_name, self.config.namespace + await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) @servo.checks.require('Container "{self.config.container}" is readable') async def check_opsani_dev_kubernetes_container(self) -> None: - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) - container = controller.find_container(self.config.container) + container = find_container(controller, self.config.container) assert ( container - ), f"failed reading Container '{self.config.container}' in {self.controller_type_name} '{self.config_controller_name}'" + ), f"failed reading Container '{self.config.container}' in {self.config.workload_kind} '{self.config.workload_name}'" @servo.require('Container "{self.config.container}" has resource requirements') async def check_resource_requirements(self) -> None: - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) - container = controller.find_container(self.config.container) + container = find_container(controller, self.config.container) assert container assert container.resources, "missing container resources" @@ -433,33 +422,34 @@ async def check_resource_requirements(self) -> None: for resource in servo.connectors.kubernetes.Resource.values(): current_state = None - container_requirements = container.get_resource_requirements(resource) - get_requirements = getattr(self.config, resource).get + container_requirements = ContainerHelper.get_resource_requirements( + container, resource + ) + get_requirements = cast( + Union[CPU, Memory], getattr(self.config, resource) + ).get for requirement in get_requirements: current_state = container_requirements.get(requirement) if current_state: break assert current_state, ( - f"{self.controller_type_name} {self.config_controller_name} target container {self.config.container} spec does not define the resource {resource}. " + f"{self.config.workload_kind} {self.config.workload_name} target container {self.config.container} spec does not define the resource {resource}. 
" f"At least one of the following must be specified: {', '.join(map(lambda req: req.resources_key, get_requirements))}" ) @servo.checks.require("Target container resources fall within optimization range") async def check_target_container_resources_within_limits(self) -> None: # Load the Controller - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) assert ( controller - ), f"failed to read {self.controller_type_name} '{self.config_controller_name}' in namespace '{self.config.namespace}'" + ), f"failed to read {self.config.workload_kind} '{self.config.workload_name}' in namespace '{self.config.namespace}'" # Find the target Container - target_container = next( - filter(lambda c: c.name == self.config.container, controller.containers), - None, - ) + target_container = find_container(controller, self.config.container) assert ( target_container ), f"failed to find container '{self.config.container}' when verifying resource limits" @@ -471,7 +461,9 @@ async def check_target_container_resources_within_limits(self) -> None: # Get resource requirements from container # TODO: This needs to reuse the logic from CanaryOptimization class (tuning_cpu, tuning_memory, etc properties) - cpu_resource_requirements = target_container.get_resource_requirements("cpu") + cpu_resource_requirements = ContainerHelper.get_resource_requirements( + target_container, Resource.cpu.value + ) cpu_resource_value = cpu_resource_requirements.get( next( filter( @@ -483,8 +475,8 @@ async def check_target_container_resources_within_limits(self) -> None: ) container_cpu_value = servo.connectors.kubernetes.Core.parse(cpu_resource_value) - memory_resource_requirements = target_container.get_resource_requirements( - "memory" + memory_resource_requirements = ContainerHelper.get_resource_requirements( + target_container, Resource.memory.value ) memory_resource_value = memory_resource_requirements.get( next( @@ -520,46 +512,40 @@ async def check_target_container_resources_within_limits(self) -> None: ), f"target container Memory value {container_memory_value.human_readable()} must be less than optimizable maximum {config_memory_max.human_readable()}" @servo.require( - '{self.controller_type_name} "{self.config_controller_name}" is ready' + '{self.config.workload_kind} "{self.config.workload_name}" is ready' ) async def check_controller_readiness(self) -> None: - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) - if not await controller.is_ready(): + if not self.workload_helper.is_ready(controller): raise RuntimeError( - f'{self.controller_type_name} "{controller.name}" is not ready' + f'{self.config.workload_name} "{controller.metadata.name}" is not ready' ) - @servo.checks.require("service") + @servo.checks.require("Service {self.config.service} is readable") async def check_opsani_dev_kubernetes_service(self) -> None: - await servo.connectors.kubernetes.Service.read( - self.config.service, self.config.namespace - ) + await ServiceHelper.read(self.config.service, self.config.namespace) - @servo.checks.warn("service type") + @servo.checks.warn("Service {self.config.service} has compatible type") async def check_opsani_dev_kubernetes_service_type(self) -> None: - service = await servo.connectors.kubernetes.Service.read( - 
self.config.service, self.config.namespace - ) - service_type = service.obj.spec.type + service = await ServiceHelper.read(self.config.service, self.config.namespace) + service_type = service.spec.type if not service_type in ("ClusterIP", "LoadBalancer", "NodePort"): raise ValueError( f"expected service type of ClusterIP, LoadBalancer, or NodePort but found {service_type}" ) - @servo.checks.check("service port") + @servo.checks.check("Service {self.config.service} has unambiguous target port") async def check_opsani_dev_kubernetes_service_port(self) -> None: - service = await servo.connectors.kubernetes.Service.read( - self.config.service, self.config.namespace - ) - if len(service.ports) > 1: + service = await ServiceHelper.read(self.config.service, self.config.namespace) + if len(service.spec.ports) > 1: if not self.config.port: raise ValueError( f"service defines more than one port: a `port` (name or number) must be specified in the configuration" ) - port = service.find_port(self.config.port) + port = ServiceHelper.find_port(service, self.config.port) if not port: if isinstance(self.config.port, str): raise LookupError( @@ -572,28 +558,27 @@ async def check_opsani_dev_kubernetes_service_port(self) -> None: else: raise RuntimeError(f"unknown port value: {self.config.port}") else: - port = service.ports[0] + port: kubernetes_asyncio.client.V1ServicePort = service.spec.ports[0] return ( f"Service Port: {port.name} {port.port}:{port.target_port}/{port.protocol}" ) - @servo.checks.check("Service routes traffic to {self.controller_type_name} Pods") + @servo.checks.check("Service routes traffic to {self.config.workload_name} Pods") async def check_service_routes_traffic_to_controller(self) -> None: - service = await servo.connectors.kubernetes.Service.read( - self.config.service, self.config.namespace - ) - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + service = await ServiceHelper.read(self.config.service, self.config.namespace) + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) # NOTE: The Service labels should be a subset of the controller labels - controller_labels = self._get_controller_service_selector(controller) - delta = dict(set(service.selector.items()) - set(controller_labels.items())) + controller_labels: dict[str, str] = controller.spec.selector.match_labels + service_labels: dict[str, str] = service.spec.selector + delta = dict(set(service_labels.items()) - set(controller_labels.items())) if delta: desc = " ".join(map("=".join, delta.items())) raise RuntimeError( - f"Service selector does not match {self.controller_type_name} labels. Missing labels: {desc}" + f"Service selector does not match {self.config.workload_kind} labels. 
Missing labels: {desc}" ) ## @@ -611,12 +596,12 @@ async def check_prometheus_config_map(self) -> None: config = None for name in names: try: - config = await servo.connectors.kubernetes.ConfigMap.read( - name, namespace - ) - if config: - break - except kubernetes_asyncio.client.exceptions.ApiException as e: + async with kubernetes_asyncio.client.ApiClient() as api: + corev1 = kubernetes_asyncio.client.CoreV1Api(api) + config = await corev1.read_namespaced_config_map(name, namespace) + if config: + break + except kubernetes_asyncio.client.ApiException as e: if e.status != 404 or e.reason != "Not Found": raise @@ -633,9 +618,9 @@ async def check_prometheus_sidecar_exists(self) -> None: f"no servo pod is running in namespace '{self.config.namespace}'" ) - if not pod.get_container("prometheus"): + if not find_container(workload=pod, name="prometheus"): raise servo.checks.CheckError( - f"no 'prometheus' container found in pod '{pod.name}' in namespace '{self.config.namespace}'" + f"no 'prometheus' container found in pod '{pod.metadata.name}' in namespace '{self.config.namespace}'" ) @servo.checks.check("Prometheus sidecar is ready") @@ -644,8 +629,8 @@ async def check_prometheus_sidecar_is_ready(self) -> None: if pod is None: raise servo.checks.CheckError(f"no servo pod was found") - if not await pod.is_ready(): - raise servo.checks.CheckError(f"pod '{pod.name}' is not ready") + if not PodHelper.is_ready(pod): + raise servo.checks.CheckError(f"pod '{pod.metadata.name}' is not ready") @servo.checks.warn("Prometheus sidecar is stable") async def check_prometheus_restart_count(self) -> None: @@ -653,12 +638,13 @@ async def check_prometheus_restart_count(self) -> None: if pod is None: raise servo.checks.CheckError(f"no servo pod was found") - container = pod.get_container("prometheus") + container = find_container(workload=pod, name="prometheus") assert container, "could not find a Prometheus sidecar container" - restart_count = await container.get_restart_count() + # TODO PodHelper.get_restart_count + restart_count = PodHelper.get_restart_count(pod, container_name="prometheus") assert ( restart_count == 0 - ), f"container 'prometheus' in pod '{pod.name}' has restarted {restart_count} times" + ), f"container 'prometheus' in pod '{pod.metadata.name}' has restarted {restart_count} times" @servo.checks.require("Prometheus has container port on 9090") async def check_prometheus_container_port(self) -> None: @@ -666,43 +652,33 @@ async def check_prometheus_container_port(self) -> None: if pod is None: raise servo.checks.CheckError(f"failed: no servo pod was found") - container = pod.get_container("prometheus") + container = find_container(workload=pod, name="prometheus") assert container, "could not find Prometheus sidecar container" assert ( - len(container.obj.ports) == 1 - ), f"expected 1 container port but found {len(container.obj.ports)}" - port = container.obj.ports[0].container_port + len(container.ports) == 1 + ), f"expected 1 container port but found {len(container.ports)}" + port: int = container.ports[0].container_port assert ( port == 9090 ), f"expected Prometheus container port on 9090 but found {port}" @servo.checks.require("Prometheus is accessible") async def check_prometheus_is_accessible(self) -> str: - pod = await self._read_servo_pod() - if pod is None: - raise servo.checks.CheckError(f"no servo pod was found") - - container = pod.get_container("prometheus") - assert container, "could not find a Prometheus sidecar container" - assert ( - len(container.obj.ports) == 1 - ), 
f"expected 1 container port but found {len(container.obj.ports)}" - client = servo.connectors.prometheus.Client( base_url=self.config.prometheus_base_url ) await client.list_targets() return f"Prometheus is accessible at {self.config.prometheus_base_url}" - async def _read_servo_pod(self) -> Optional[servo.connectors.kubernetes.Pod]: + async def _read_servo_pod(self) -> Optional[kubernetes_asyncio.client.V1Pod]: return await self._read_servo_pod_from_env() or next( reversed(await self._list_servo_pods()), None ) async def _read_servo_pod_from_env( self, - ) -> Optional[servo.connectors.kubernetes.Pod]: + ) -> Optional[kubernetes_asyncio.client.V1Pod]: """Reads the servo Pod from Kubernetes by referencing the `POD_NAME` and `POD_NAMESPACE` environment variables. @@ -714,74 +690,71 @@ async def _read_servo_pod_from_env( if None in (pod_name, pod_namespace): return None - return await servo.connectors.kubernetes.Pod.read(pod_name, pod_namespace) + return await PodHelper.read(pod_name, pod_namespace) - async def _list_servo_pods(self) -> List[servo.connectors.kubernetes.Pod]: + async def _list_servo_pods(self) -> list[kubernetes_asyncio.client.V1Pod]: """Lists all servo pods in the configured namespace. Returns: A list of servo pods in the configured namespace. """ - async with servo.connectors.kubernetes.Pod.preferred_client() as api_client: - label_selector = servo.connectors.kubernetes.selector_string( - {"app.kubernetes.io/name": "servo"} - ) - pod_list: servo.connectors.kubernetes.client.V1PodList = ( - await api_client.list_namespaced_pod( - namespace=self.config.namespace, label_selector=label_selector - ) - ) - - pods = [servo.connectors.kubernetes.Pod(p) for p in pod_list.items] - return pods + return await PodHelper.list_pods_with_labels( + namespace=self.config.namespace, + match_labels={"app.kubernetes.io/name": "servo"}, + ) ## # Kubernetes Controller edits - @servo.checks.check("{self.controller_type_name} PodSpec has expected annotations") + @servo.checks.check("{self.config.workload_name} PodSpec has expected annotations") async def check_controller_annotations(self) -> None: - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) assert ( controller - ), f"failed to read {self.controller_type_name} '{self.config_controller_name}' in namespace '{self.config.namespace}'" + ), f"failed to read {self.config.workload_kind} '{self.config.workload_name}' in namespace '{self.config.namespace}'" # Add optimizer annotation to the static Prometheus values required_annotations = PROMETHEUS_ANNOTATION_DEFAULTS.copy() required_annotations["servo.opsani.com/optimizer"] = self.config.optimizer.id # NOTE: Only check for annotation keys - annotations = controller.pod_template_spec.metadata.annotations or dict() + annotations: dict[str, str] = ( + controller.spec.template.metadata.annotations or dict() + ) actual_annotations = set(annotations.keys()) delta = set(required_annotations.keys()).difference(actual_annotations) if delta: annotations = dict(map(lambda k: (k, required_annotations[k]), delta)) patch = {"spec": {"template": {"metadata": {"annotations": annotations}}}} patch_json = json.dumps(patch, indent=None) - controller_patch_target = self._get_controller_patch_target(controller) # NOTE: custom resources don't support strategic merge type. 
json merge is acceptable for both cases because the patch json doesn't contain lists - command = f"kubectl --namespace {self.config.namespace} patch {controller_patch_target} --type='merge' -p '{patch_json}'" + command = ( + f"kubectl --namespace {self.config.namespace}" + f" patch {self.config.workload_kind} {self.config.workload_name}" + f" --type='merge' -p '{patch_json}'" + ) desc = ", ".join(sorted(delta)) raise servo.checks.CheckError( - f"{self.controller_type_name} '{controller.name}' is missing annotations: {desc}", + f"{self.config.workload_kind} '{controller.metadata.name}' is missing annotations: {desc}", hint=f"Patch annotations via: `{command}`", remedy=lambda: _stream_remedy_command(command), ) - @servo.checks.check("{self.controller_type_name} PodSpec has expected labels") + @servo.checks.check("{self.config.workload_kind} PodSpec has expected labels") async def check_controller_labels(self) -> None: - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) assert ( controller - ), f"failed to read {self.controller_type_name} '{self.config_controller_name}' in namespace '{self.config.namespace}'" + ), f"failed to read {self.config.workload_kind} '{self.config.workload_name}' in namespace '{self.config.namespace}'" - labels = controller.pod_template_spec.metadata.labels + labels = controller.spec.template.metadata.labels assert ( labels - ), f"{self.controller_type_name} '{controller.name}' does not have any labels" + ), f"{self.config.workload_kind} '{controller.metadata.name}' does not have any labels" # Add optimizer label to the static values required_labels = ENVOY_SIDECAR_LABELS.copy() required_labels[ @@ -794,28 +767,30 @@ async def check_controller_labels(self) -> None: desc = ", ".join(sorted(map("=".join, delta.items()))) patch = {"spec": {"template": {"metadata": {"labels": delta}}}} patch_json = json.dumps(patch, indent=None) - controller_patch_target = self._get_controller_patch_target(controller) # NOTE: custom resources don't support strategic merge type. 
json merge is acceptable for both cases because the patch json doesn't contain lists - command = f"kubectl --namespace {self.config.namespace} patch {controller_patch_target} --type='merge' -p '{patch_json}'" + command = ( + f"kubectl --namespace {self.config.namespace}" + f" patch {self.config.workload_kind} {self.config.workload_name}" + f" --type='merge' -p '{patch_json}'" + ) raise servo.checks.CheckError( - f"{self.controller_type_name} '{controller.name}' is missing labels: {desc}", + f"{self.config.workload_kind} '{controller.metadata.name}' is missing labels: {desc}", hint=f"Patch labels via: `{command}`", remedy=lambda: _stream_remedy_command(command), ) - @servo.checks.check("{self.controller_type_name} has Envoy sidecar container") + @servo.checks.check("{self.config.workload_kind} has Envoy sidecar container") async def check_controller_envoy_sidecars(self) -> None: - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) assert ( controller - ), f"failed to read {self.controller_type_name} '{self.config_controller_name}' in namespace '{self.config.namespace}'" + ), f"failed to read {self.config.workload_kind} '{self.config.workload_name}' in namespace '{self.config.namespace}'" # Search the containers list for the sidecar - for container in controller.containers: - if container.name == "opsani-envoy": - return + if find_container(controller, "opsani-envoy"): + return port_switch = ( f" --port {self.config.port}" if self.config.port is not None else "" @@ -824,31 +799,33 @@ async def check_controller_envoy_sidecars(self) -> None: f"kubectl exec -n {self.config.namespace} -c servo {self._servo_resource_target} -- " f"servo --token-file /servo/opsani.token inject-sidecar --image {self.config.envoy_sidecar_image} " f"--namespace {self.config.namespace} --service {self.config.service}{port_switch} " - f"{self.controller_type_name.lower()}/{self.config_controller_name}" + f"{self.config.workload_kind.lower()}/{self.config.workload_name}" ) raise servo.checks.CheckError( - f"{self.controller_type_name} '{controller.name}' pod template spec does not include envoy sidecar container ('opsani-envoy')", + f"{self.config.workload_kind} '{controller.metadata.name}' pod template spec does not include envoy sidecar container ('opsani-envoy')", hint=f"Inject Envoy sidecar container via: `{command}`", remedy=lambda: _stream_remedy_command(command), ) @servo.checks.check("Pods have Envoy sidecar containers") async def check_pod_envoy_sidecars(self) -> None: - controller = await self.controller_class.read( - self.config_controller_name, self.config.namespace + controller = await self.workload_helper.read( + self.config.workload_name, self.config.namespace ) assert ( controller - ), f"failed to read {self.controller_type_name} '{self.config_controller_name}' in namespace '{self.config.namespace}'" + ), f"failed to read {self.config.workload_kind} '{self.config.workload_name}' in namespace '{self.config.namespace}'" pods_without_sidecars = [] - for pod in await controller.get_pods(): + for pod in await self.workload_helper.get_latest_pods(controller): # Search the containers list for the sidecar - if not pod.get_container("opsani-envoy"): + if not find_container(pod, "opsani-envoy"): pods_without_sidecars.append(pod) if pods_without_sidecars: - desc = ", ".join(map(operator.attrgetter("name"), pods_without_sidecars)) + desc = ", ".join( + 
map(operator.attrgetter("metadata.name"), pods_without_sidecars) + ) raise servo.checks.CheckError( f"pods '{desc}' do not have envoy sidecar container ('opsani-envoy')" ) @@ -858,16 +835,6 @@ async def check_pod_envoy_sidecars(self) -> None: @servo.check("Prometheus is discovering targets") async def check_prometheus_targets(self) -> None: - pod = await self._read_servo_pod() - if pod is None: - raise servo.checks.CheckError(f"no servo pod was found") - - container = pod.get_container("prometheus") - assert container, "could not find a Prometheus sidecar container" - assert ( - len(container.obj.ports) == 1 - ), f"expected 1 container port but found {len(container.obj.ports)}" - client = servo.connectors.prometheus.Client( base_url=self.config.prometheus_base_url ) @@ -914,13 +881,11 @@ async def check_envoy_sidecar_metrics(self) -> str: @servo.checks.require("Traffic is proxied through Envoy") async def check_service_proxy(self) -> str: proxy_service_port = ENVOY_SIDECAR_DEFAULT_PORT # TODO: move to configuration - service = await servo.connectors.kubernetes.Service.read( - self.config.service, self.config.namespace - ) + service = await ServiceHelper.read(self.config.service, self.config.namespace) if self.config.port: - port = service.find_port(self.config.port) + port = ServiceHelper.find_port(service, self.config.port) else: - port = service.ports[0] + port = service.spec.ports[0] # return if we are already proxying to Envoy if port.target_port == proxy_service_port: @@ -929,7 +894,7 @@ async def check_service_proxy(self) -> str: # patch the target port to pass traffic through Envoy patch = { "spec": { - "type": service.obj.spec.type, + "type": service.spec.type, "ports": [ { "protocol": "TCP", @@ -943,7 +908,7 @@ async def check_service_proxy(self) -> str: patch_json = json.dumps(patch, indent=None) command = f"kubectl --namespace {self.config.namespace} patch service {self.config.service} -p '{patch_json}'" raise servo.checks.CheckError( - f"service '{service.name}' is not routing traffic through Envoy sidecar on port {proxy_service_port}", + f"service '{service.metadata.name}' is not routing traffic through Envoy sidecar on port {proxy_service_port}", hint=f"Update target port via: `{command}`", remedy=lambda: _stream_remedy_command(command), ) @@ -1028,166 +993,6 @@ def _servo_resource_target(self) -> str: return "deployment/servo" -class OpsaniDevChecks(BaseOpsaniDevChecks): - """Opsani dev checks against standard kubernetes Deployments""" - - @property - def controller_type_name(self) -> str: - return "Deployment" - - @property - def config_controller_name(self) -> str: - return self.config.deployment - - @property - def controller_class(self) -> Type[servo.connectors.kubernetes.Deployment]: - return servo.connectors.kubernetes.Deployment - - @property - def required_permissions(self) -> List[servo.connectors.kubernetes.PermissionSet]: - return KUBERNETES_PERMISSIONS - - async def _get_port_forward_target(self) -> str: - return f"deploy/{self.config.deployment}" - - def _get_generated_controller_config( - self, config: servo.connectors.kubernetes.KubernetesConfiguration - ) -> servo.connectors.kubernetes.DeploymentConfiguration: - return config.deployments[0] - - def _get_controller_service_selector( - self, controller: servo.connectors.kubernetes.Deployment - ) -> Dict[str, str]: - return controller.match_labels - - def _get_controller_patch_target( - self, controller: servo.connectors.kubernetes.Deployment - ) -> str: - return f"Deployment {self.config_controller_name}" - - 
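The Deployment- and Rollout-specific subclasses removed in this hunk are replaced by configuration-driven dispatch: the rewritten checks read self.config.workload_kind and self.config.workload_name and route every Kubernetes call through self.workload_helper. The helper-selection logic itself is not visible in this diff, so the following is only a sketch of the idea, assuming the class-level helpers exported by servo.connectors.kubernetes_helpers and a kind value such as "Deployment" or "StatefulSet":

    # Hypothetical sketch -- not code from this PR. It illustrates how a single
    # checks class can resolve its helper from configuration instead of
    # subclassing per controller type (the real lookup lives on BaseOpsaniDevChecks).
    from servo.connectors.kubernetes_helpers import DeploymentHelper, StatefulSetHelper

    def workload_helper_for(kind: str) -> type:
        """Map a configured workload kind to the helper class that operates on it."""
        if kind.lower() == "deployment":
            return DeploymentHelper
        return StatefulSetHelper

Each check then loads its target the same way, e.g. controller = await workload_helper_for(kind).read(name, namespace), matching the self.workload_helper.read(self.config.workload_name, self.config.namespace) pattern used throughout the rewritten checks above.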
-class OpsaniDevRolloutChecks(BaseOpsaniDevChecks): - """Opsani dev checks against argoproj.io Rollouts""" - - @property - def controller_type_name(self) -> str: - return "Rollout" - - @property - def config_controller_name(self) -> str: - return self.config.rollout - - @property - def controller_class(self) -> Type[servo.connectors.kubernetes.Rollout]: - return servo.connectors.kubernetes.Rollout - - @property - def required_permissions(self) -> List[servo.connectors.kubernetes.PermissionSet]: - return KUBERNETES_PERMISSIONS + [ - servo.connectors.kubernetes.PermissionSet( - group="argoproj.io", - resources=["rollouts", "rollouts/status"], - verbs=["get", "list", "watch", "update", "patch"], - ) - ] - - async def _get_port_forward_target(self) -> str: - # NOTE rollouts don't support kubectl port-forward, have to target the current replicaset instead - rollout = await servo.connectors.kubernetes.Rollout.read( - self.config.rollout, self.config.namespace - ) - assert ( - rollout - ), f"failed to read rollout '{self.config.rollout}' in namespace '{self.config.namespace}'" - assert ( - rollout.status - ), f"unable to verify envoy proxy. rollout '{self.config.rollout}' in namespace '{self.config.namespace}' has no status" - assert ( - rollout.status.current_pod_hash - ), f"unable to verify envoy proxy. rollout '{self.config.rollout}' in namespace '{self.config.namespace}' has no currentPodHash" - return f"replicaset/{rollout.name}-{rollout.status.current_pod_hash}" - - def _get_generated_controller_config( - self, config: servo.connectors.kubernetes.KubernetesConfiguration - ) -> servo.connectors.kubernetes.RolloutConfiguration: - return config.rollouts[0] - - def _get_controller_service_selector( - self, controller: servo.connectors.kubernetes.Rollout - ) -> Dict[str, str]: - match_labels = dict(controller.match_labels) - assert ( - controller.status - ), f"unable to determine service selector. rollout '{self.config.rollout}' in namespace '{self.config.namespace}' has no status" - assert ( - controller.status.current_pod_hash - ), f"unable to determine service selector. 
rollout '{self.config.rollout}' in namespace '{self.config.namespace}' has no currentPodHash" - match_labels["rollouts-pod-template-hash"] = controller.status.current_pod_hash - return match_labels - - def _get_controller_patch_target( - self, controller: servo.connectors.kubernetes.Rollout - ) -> str: - if controller.workload_ref_controller: - return f"{controller.workload_ref_controller.obj.kind} {controller.workload_ref_controller.name}" - - return f"Rollout {self.config_controller_name}" - - @servo.checks.require("Rollout Selector and PodSpec has opsani_role label") - async def check_rollout_selector_labels(self) -> None: - if os.environ.get("POD_NAME") and os.environ.get("POD_NAMESPACE"): - return # Setting owner reference to servo should prevent tuning pod from being adopted by the rollout controller - - rollout = await servo.connectors.kubernetes.Rollout.read( - self.config.rollout, self.config.namespace - ) - assert ( - rollout - ), f"failed to read Rollout '{self.config.rollout}' in namespace '{self.config.namespace}'" - - spec_patch = {} - match_labels = rollout.match_labels or dict() - opsani_role_selector = match_labels.get("opsani_role") - if opsani_role_selector is None or opsani_role_selector == "tuning": - opsani_role_selector = "mainline" - spec_patch["selector"] = { - "matchLabels": {"opsani_role": opsani_role_selector} - } - - labels = rollout.pod_template_spec.metadata.labels or dict() - opsani_role_label = labels.get("opsani_role") - if ( - opsani_role_label is None - or opsani_role_label == "tuning" - or opsani_role_label != opsani_role_selector - ): - spec_patch["template"] = { - "metadata": {"labels": {"opsani_role": opsani_role_selector}} - } - - if spec_patch: # Check failed if spec needs patching - patch = {"spec": spec_patch} - patch_json = json.dumps(patch, indent=None) - # NOTE: custom resources don't support strategic merge type. json merge is acceptable because the patch json doesn't contain lists - command = f"kubectl --namespace {self.config.namespace} patch rollout {self.config.rollout} --type='merge' -p '{patch_json}'" - replicasets = [f"rs/{rollout.name}-{rollout.status.current_pod_hash}"] - if ( - rollout.status.stable_RS - and rollout.status.stable_RS != rollout.status.current_pod_hash - ): - replicasets.append(f"rs/{rollout.name}-{rollout.status.stable_RS}") - raise servo.checks.CheckError( - ( - f"Rollout '{self.config.rollout}' has missing/mismatched opsani_role selector and/or label." - ' Label opsani_role with value != "tuning" is required to prevent the rollout controller from adopting and destroying the tuning pod' - ), - hint=( - f"NOTE: Running this patch will require that you manually scale down or delete the replicaset(s) ({', '.join(replicasets)})" - f" orphaned by the selector update. 
Patch selector and labels via: `{command}`" - ), - ) - - @servo.metadata( description="Optimize a single service via a tuning instance and an Envoy sidecar", version="2.0.0", @@ -1202,6 +1007,7 @@ class OpsaniDevConnector(servo.BaseConnector): @servo.on_event() async def attach(self, servo_: servo.Servo) -> None: + # FIXME figure out why servo.events.MetaClass is screwing with Servo type hinting await servo_.add_connector( "opsani-dev:kubernetes", servo.connectors.kubernetes.KubernetesConnector( @@ -1243,12 +1049,9 @@ async def check( matching: Optional[servo.CheckFilter], halt_on: Optional[servo.ErrorSeverity] = servo.ErrorSeverity.critical, ) -> List[servo.Check]: - if self.config.deployment: - checks_class = OpsaniDevChecks - elif self.config.rollout: - checks_class = OpsaniDevRolloutChecks - - return await checks_class.run(self.config, matching=matching, halt_on=halt_on) + return await OpsaniDevChecks.run( + self.config, matching=matching, halt_on=halt_on + ) async def _stream_remedy_command(command: str) -> None: diff --git a/servo/types/kubernetes.py b/servo/types/kubernetes.py index 90ee2efd2..4067d33f1 100644 --- a/servo/types/kubernetes.py +++ b/servo/types/kubernetes.py @@ -1,6 +1,47 @@ import enum +class Resource(str, enum.Enum): + memory = "memory" + cpu = "cpu" + + @classmethod + def values(cls) -> list[str]: + """ + Return a list of strings that identifies all resource values. + """ + return list(map(lambda rsrc: rsrc.value, cls.__members__.values())) + + +class ResourceRequirement(enum.Enum): + """ + The ResourceRequirement enumeration determines how optimization values are submitted to the + Kubernetes scheduler to allocate core compute resources. Requests establish the lower bounds + of the CPU and memory necessary for an application to execute while Limits define the upper + bounds for resources that can be consumed by a given Pod. The Opsani engine can determine + optimal values for these settings by identifying performant, low cost configurations that meet + target SLOs and/or maximizing performance while identifying the point of diminishing returns + on further resourcing. + """ + + request = "request" + limit = "limit" + + @property + def resources_key(self) -> str: + """ + Return a string value for accessing resource requirements within a Kubernetes Container representation. 
+ """ + if self == ResourceRequirement.request: + return "requests" + elif self == ResourceRequirement.limit: + return "limits" + else: + raise NotImplementedError( + f'missing resources_key implementation for resource requirement "{self}"' + ) + + class ContainerLogOptions(str, enum.Enum): previous = "previous" current = "current" diff --git a/setup.cfg b/setup.cfg index 65c6139d0..1c4144356 100644 --- a/setup.cfg +++ b/setup.cfg @@ -11,6 +11,7 @@ filterwarnings = ignore: Use 'content=<...>' to upload raw bytes/text content.:DeprecationWarning ignore: unclosed file <_io.TextIOWrapper name=.*:ResourceWarning ignore: unclosed file <_io.FileIO name=.*:ResourceWarning + ignore: unclosed file <_io.BufferedReader name='/tmp/pytest.*'.*:ResourceWarning ignore: unclosed str: # NOTE no point in even trying to recover from this due to asyncio xdist parallelization coordination hell pytest.xfail("Minikube failed start") + if exit_code == 80 and any( + "Exiting due to GUEST_START" in line for line in stderr + ): + # https://github.com/kubernetes/minikube/issues/13621 + pytest.xfail("Minikube failed start (CA)") + raise RuntimeError( f"failed running minikube: exited with status code {exit_code}: {stderr}" ) @@ -667,11 +674,8 @@ def fastapi_app() -> fastapi.FastAPI: ForwardingTarget = Union[ str, kubetest.objects.Pod, - servo.connectors.kubernetes.Pod, kubetest.objects.Deployment, - servo.connectors.kubernetes.Deployment, kubetest.objects.Service, - servo.connectors.kubernetes.Service, ] @@ -709,18 +713,11 @@ async def kubectl_ports_forwarded( def _identifier_for_target(target: ForwardingTarget) -> str: if isinstance(target, str): return target - elif isinstance( - target, (kubetest.objects.Pod, servo.connectors.kubernetes.Pod) - ): + elif isinstance(target, kubetest.objects.Pod): return f"pod/{target.name}" - elif isinstance( - target, - (kubetest.objects.Deployment, servo.connectors.kubernetes.Deployment), - ): + elif isinstance(target, kubetest.objects.Deployment): return f"deployment/{target.name}" - elif isinstance( - target, (kubetest.objects.Service, servo.connectors.kubernetes.Service) - ): + elif isinstance(target, kubetest.objects.Service): return f"service/{target.name}" else: raise TypeError(f"unknown target: {repr(target)}") @@ -739,11 +736,16 @@ def _identifier_for_target(target: ForwardingTarget) -> str: await event.wait() - # Check if the sockets are open + # Check the sockets can be connected to + # TODO/FIXME add fault tolerance for error upgrading connection: unable to upgrade connection: pod does not exist for local_port, _ in ports: a_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - if a_socket.connect_ex(("localhost", local_port)) != 0: - raise RuntimeError(f"port forwarding failed: port {local_port} is not open") + if (h_errno := a_socket.connect_ex(("localhost", local_port))) != 0: + if task.done(): + debug(task.result()) + raise RuntimeError( + f"port forwarding failed: port {local_port} connect failed (errno {h_errno})" + ) try: if len(ports) == 1: @@ -763,7 +765,7 @@ def _identifier_for_target(target: ForwardingTarget) -> str: @pytest.fixture() async def kube_port_forward( - kube, + kube: kubetest.client.TestClient, unused_tcp_port_factory: Callable[[], int], kubeconfig, kubecontext: Optional[str], diff --git a/tests/connectors/kube_metrics_test.py b/tests/connectors/kube_metrics_test.py index dbf0134cc..921532aa8 100644 --- a/tests/connectors/kube_metrics_test.py +++ b/tests/connectors/kube_metrics_test.py @@ -6,10 +6,12 @@ import kubetest.client import pytest 
+from kubernetes_asyncio.client import V1Deployment + import servo from servo.runner import ServoRunner -from servo.connectors.kubernetes import Deployment +from servo.connectors.kubernetes_helpers import dict_to_selector, DeploymentHelper from servo.connectors.kube_metrics import * from servo.connectors.kube_metrics import ( _append_data_point, @@ -17,12 +19,6 @@ _get_target_resource_container, _name_to_metric, ) -from tests.connectors.kubernetes_test import namespace - - -@pytest.fixture -def kubecontext() -> str: - return "metrics-server" @pytest.fixture @@ -68,14 +64,23 @@ async def test_describe(kube_metrics_connector: KubeMetricsConnector): ] -async def _wait_for_scrape(namespace: str, deployment: Deployment): +async def _try_wait_for_scrape(namespace: str, deployment: V1Deployment) -> None: + try: + await asyncio.wait_for( + _wait_for_scrape(namespace=namespace, deployment=deployment), timeout=60 + ) + except asyncio.TimeoutError as te: + pytest.xfail("Metrics server scrape failed") + + +async def _wait_for_scrape(namespace: str, deployment: V1Deployment): async with kubernetes_asyncio.client.ApiClient() as api: cust_obj_api = kubernetes_asyncio.client.CustomObjectsApi(api) while True: await asyncio.sleep(1) result = await cust_obj_api.list_namespaced_custom_object( - label_selector=deployment.label_selector, + label_selector=dict_to_selector(deployment.spec.selector.match_labels), namespace=namespace, **METRICS_CUSTOM_OJBECT_CONST_ARGS, ) @@ -93,12 +98,10 @@ async def _wait_for_scrape(namespace: str, deployment: Deployment): loguru.logger.info("Coninuing wait for scrape") -# TODO group minikube fixture into file scope when xdist supports fixture scoping -@pytest.mark.minikube_profile.with_args("metrics-server") +@pytest.mark.integration +@pytest.mark.usefixtures("kubernetes_asyncio_config") @pytest.mark.applymanifests("../manifests", files=["fiber-http-opsani-dev.yaml"]) async def test_periodic_measure( - kubeconfig: str, - minikube: str, kube: kubetest.client.TestClient, servo_runner: ServoRunner, ): @@ -111,20 +114,14 @@ async def test_periodic_measure( name="fiber-http", namespace=kube.namespace, container="fiber-http", - context=minikube, - kubeconfig=kubeconfig, ) ) await connector.attach(servo_=servo_runner.servo) - deployment = await Deployment.read("fiber-http", kube.namespace) - - await asyncio.wait_for( - _wait_for_scrape(namespace=kube.namespace, deployment=deployment), timeout=60 - ) + deployment = await DeploymentHelper.read("fiber-http", kube.namespace) + await _try_wait_for_scrape(namespace=kube.namespace, deployment=deployment) await connector.periodic_measure( - target_resource=deployment, target_metrics=MAIN_METRICS, datapoints_dicts=datapoints_dicts, ) @@ -133,13 +130,12 @@ async def test_periodic_measure( assert m in datapoints_dicts -@pytest.mark.minikube_profile.with_args("metrics-server") +@pytest.mark.integration +@pytest.mark.usefixtures("kubernetes_asyncio_config") @pytest.mark.applymanifests( "../manifests", files=["fiber-http-opsani-dev_no_resource_limits.yaml"] ) async def test_periodic_measure_no_limits( - kubeconfig: str, - minikube: str, kube: kubetest.client.TestClient, servo_runner: ServoRunner, ): @@ -152,20 +148,14 @@ async def test_periodic_measure_no_limits( name="fiber-http", namespace=kube.namespace, container="fiber-http", - context=minikube, - kubeconfig=kubeconfig, ) ) await connector.attach(servo_=servo_runner.servo) - deployment = await Deployment.read("fiber-http", kube.namespace) - - await asyncio.wait_for( - 
_wait_for_scrape(namespace=kube.namespace, deployment=deployment), timeout=60 - ) + deployment = await DeploymentHelper.read("fiber-http", kube.namespace) + await _try_wait_for_scrape(namespace=kube.namespace, deployment=deployment) await connector.periodic_measure( - target_resource=deployment, target_metrics=MAIN_METRICS, datapoints_dicts=datapoints_dicts, ) @@ -175,13 +165,12 @@ async def test_periodic_measure_no_limits( assert m not in datapoints_dicts -@pytest.mark.minikube_profile.with_args("metrics-server") +@pytest.mark.integration +@pytest.mark.usefixtures("kubernetes_asyncio_config") @pytest.mark.applymanifests( "../manifests", files=["fiber-http-opsani-dev_no_resource_requests.yaml"] ) async def test_periodic_measure_no_requests( - kubeconfig: str, - minikube: str, kube: kubetest.client.TestClient, servo_runner: ServoRunner, ): @@ -194,20 +183,14 @@ async def test_periodic_measure_no_requests( name="fiber-http", namespace=kube.namespace, container="fiber-http", - context=minikube, - kubeconfig=kubeconfig, ) ) await connector.attach(servo_=servo_runner.servo) - deployment = await Deployment.read("fiber-http", kube.namespace) - - await asyncio.wait_for( - _wait_for_scrape(namespace=kube.namespace, deployment=deployment), timeout=60 - ) + deployment = await DeploymentHelper.read("fiber-http", kube.namespace) + await _try_wait_for_scrape(namespace=kube.namespace, deployment=deployment) await connector.periodic_measure( - target_resource=deployment, target_metrics=MAIN_METRICS, datapoints_dicts=datapoints_dicts, ) @@ -255,44 +238,34 @@ def test_append_data_point(): } -@pytest.mark.minikube_profile.with_args("metrics-server") +@pytest.mark.integration +@pytest.mark.usefixtures("kubernetes_asyncio_config") @pytest.mark.applymanifests("../manifests", files=["fiber-http-opsani-dev.yaml"]) # async def test_periodic_measure(kubeconfig: str, minikube: str, kube: kubetest.client.TestClient, servo_runner: ServoRunner): -async def test_get_target_resource( - kubeconfig: str, kubecontext: str, minikube: str, kube: kubetest.client.TestClient -): +async def test_get_target_resource(kube: kubetest.client.TestClient): kube.wait_for_registered() - await kubernetes_asyncio.config.load_kube_config( - config_file=str(kubeconfig), context=kubecontext - ) + await kubernetes_asyncio.config.load_kube_config() assert await _get_target_resource( KubeMetricsConfiguration( name="fiber-http", namespace=kube.namespace, container="fiber-http", - context=minikube, - kubeconfig=kubeconfig, ) ) -@pytest.mark.minikube_profile.with_args("metrics-server") +@pytest.mark.integration +@pytest.mark.usefixtures("kubernetes_asyncio_config") @pytest.mark.applymanifests("../manifests", files=["fiber-http-opsani-dev.yaml"]) -async def test_get_target_resource_container( - kubeconfig: str, kubecontext: str, minikube: str, kube: kubetest.client.TestClient -): +async def test_get_target_resource_container(kube: kubetest.client.TestClient): kube.wait_for_registered() - await kubernetes_asyncio.config.load_kube_config( - config_file=str(kubeconfig), context=kubecontext - ) - deployment = await Deployment.read("fiber-http", kube.namespace) + await kubernetes_asyncio.config.load_kube_config() + deployment = await DeploymentHelper.read("fiber-http", kube.namespace) assert _get_target_resource_container( KubeMetricsConfiguration( name="fiber-http", namespace=kube.namespace, container="fiber-http", - context=minikube, - kubeconfig=kubeconfig, ), target_resource=deployment, ) @@ -306,7 +279,6 @@ def test_name_to_metric(): 
@pytest.mark.integration @pytest.mark.usefixtures("kubernetes_asyncio_config") -# @pytest.mark.applymanifests("../manifests/kube_metrics", files=["role.yaml", "role-binding.yaml"]) @pytest.mark.applymanifests("../manifests", files=["fiber-http-opsani-dev.yaml"]) class TestKubeMetricsConnectorIntegration: @pytest.fixture @@ -317,10 +289,6 @@ def kube_metrics_config(self, kube: kubetest.client.TestClient): container="fiber-http", ) - @pytest.fixture - def kubecontext(self) -> str: - return None # override file level fixture for EKS - async def test_checks(self, kube_metrics_config: KubeMetricsConfiguration) -> None: checks = await KubeMetricsChecks.run(kube_metrics_config) assert all(c.success for c in checks), debug(checks) @@ -331,12 +299,8 @@ async def test_measure( kube_metrics_connector: KubeMetricsConnector, ) -> None: kube.wait_for_registered() - deployment = await Deployment.read("fiber-http", kube.namespace) - - await asyncio.wait_for( - _wait_for_scrape(namespace=kube.namespace, deployment=deployment), - timeout=60, - ) + deployment = await DeploymentHelper.read("fiber-http", kube.namespace) + await _try_wait_for_scrape(namespace=kube.namespace, deployment=deployment) kube_metrics_connector.config.metric_collection_frequency = servo.Duration("1s") result = await kube_metrics_connector.measure() diff --git a/tests/connectors/kubernetes_test.py b/tests/connectors/kubernetes_test.py index 4e8e10995..0bdf6096c 100644 --- a/tests/connectors/kubernetes_test.py +++ b/tests/connectors/kubernetes_test.py @@ -14,7 +14,16 @@ import re import respx import traceback -from kubernetes_asyncio import client +from kubernetes_asyncio.client import ( + ApiClient, + V1Container, + V1ContainerPort, + V1ResourceRequirements, + V1EnvVar, + V1ServicePort, + VersionApi, + VersionInfo, +) from pydantic import BaseModel from pydantic.error_wrappers import ValidationError @@ -24,11 +33,9 @@ CPU, CanaryOptimization, CanaryOptimizationStrategyConfiguration, - Container, ContainerConfiguration, ContainerTagName, DefaultOptimizationStrategyConfiguration, - Deployment, DeploymentConfiguration, DNSLabelName, DNSSubdomainName, @@ -39,12 +46,16 @@ Memory, Core, OptimizationStrategy, - Pod, - ResourceRequirement, - Rollout, - RolloutConfiguration, ) -import servo +from servo.types.kubernetes import Resource, ResourceRequirement +from servo.connectors.kubernetes_helpers import ( + find_container, + get_containers, + ContainerHelper, + DeploymentHelper, + PodHelper, + ServiceHelper, +) from servo.errors import AdjustmentFailedError, AdjustmentRejectedError import servo.runner from servo.types.api import Adjustment, Component, Description @@ -597,17 +608,16 @@ def test_resource_key(self, requirement: ResourceRequirement, val) -> None: class TestContainer: @pytest.fixture - def container(self, mocker) -> Container: - stub_pod = mocker.stub(name="Pod") - container = Container(client.V1Container(name="fiber-http"), stub_pod) + def container(self) -> V1Container: + container = V1Container(name="fiber-http") - resources = client.V1ResourceRequirements() + resources = V1ResourceRequirements() resources.requests = {"cpu": "100m", "memory": "3G"} resources.limits = {"cpu": "15000m"} container.resources = resources - container.obj.env = [ - client.V1EnvVar(name="TEST1", value="TEST2"), + container.env = [ + V1EnvVar(name="TEST1", value="TEST2"), ] return container @@ -641,13 +651,15 @@ def container(self, mocker) -> Container: ) def test_get_resource_requirements( self, - container: Container, + container: V1Container, resource: 
str, requirement: ResourceRequirement, value, ) -> None: assert ( - all_requirements := container.get_resource_requirements(resource) + all_requirements := ContainerHelper.get_resource_requirements( + container, resource + ) ) is not None if requirement: assert all_requirements.get(requirement) == value @@ -680,21 +692,22 @@ def test_get_resource_requirements( ) def test_set_resource_requirements( self, - container: Container, + container: V1Container, resource: str, value: dict[ResourceRequirement, Optional[str]], resources_dict, ) -> None: - container.set_resource_requirements(resource, value) + ContainerHelper.set_resource_requirements(container, resource, value) assert container.resources.to_dict() == resources_dict def test_set_resource_requirements_handles_null_requirements_dict( - self, container: Container + self, container: V1Container ): - container.resources = client.V1ResourceRequirements() + container.resources = V1ResourceRequirements() - container.set_resource_requirements( - "cpu", + ContainerHelper.set_resource_requirements( + container, + Resource.cpu.value, {ResourceRequirement.request: "1000m", ResourceRequirement.limit: "1000m"}, ) assert container.resources.to_dict() == { @@ -702,16 +715,16 @@ def test_set_resource_requirements_handles_null_requirements_dict( "requests": {"cpu": "1000m"}, } - def test_get_environment_variable(self, container: Container): - assert container.get_environment_variable("TEST1") == "TEST2" + def test_get_environment_variable(self, container: V1Container): + assert ContainerHelper.get_environment_variable(container, "TEST1") == "TEST2" - def test_set_environment_variable(self, container: Container): - container.set_environment_variable("TEST1", "TEST3") - container.set_environment_variable("TEST4", "TEST5") + def test_set_environment_variable(self, container: V1Container): + ContainerHelper.set_environment_variable(container, "TEST1", "TEST3") + ContainerHelper.set_environment_variable(container, "TEST4", "TEST5") assert container.env == [ - client.V1EnvVar(name="TEST1", value="TEST3"), - client.V1EnvVar(name="TEST4", value="TEST5"), + V1EnvVar(name="TEST1", value="TEST3"), + V1EnvVar(name="TEST4", value="TEST5"), ] @@ -1078,22 +1091,26 @@ async def test_adjust_cpu_with_settlement(self, config): assert setting assert setting.value == 0.25 - async def test_adjust_cpu_at_non_zero_container_index(self, config): + async def test_adjust_cpu_at_non_zero_container_index( + self, config: KubernetesConfiguration + ): # Inject a sidecar at index zero - deployment = await servo.connectors.kubernetes.Deployment.read( - "fiber-http", config.namespace - ) + deployment = await DeploymentHelper.read("fiber-http", config.namespace) assert ( deployment ), f"failed loading deployment 'fiber-http' in namespace '{config.namespace}'" - async with deployment.rollout(timeout=config.timeout) as deployment_update: - await deployment_update.inject_sidecar( - "opsani-envoy", - "opsani/envoy-proxy:latest", - port="8480", - service_port=8091, - index=0, - ) + await DeploymentHelper.inject_sidecar( + deployment, + "opsani-envoy", + "opsani/envoy-proxy:latest", + port="8480", + service_port=8091, + index=0, + ) + await asyncio.wait_for( + DeploymentHelper.wait_until_ready(deployment), + timeout=config.timeout.total_seconds(), + ) connector = KubernetesConnector(config=config) adjustment = Adjustment( @@ -1249,12 +1266,14 @@ async def test_adjust_replicas(self, config): assert setting assert setting.value == 2 - async def test_read_pod(self, config, kube) -> None: + async 
def test_read_pod( + self, config: KubernetesConfiguration, kube: kubetest.client.TestClient + ) -> None: connector = KubernetesConnector(config=config) pods = kube.get_pods() pod_name = next(iter(pods.keys())) assert pod_name.startswith("fiber-http") - pod = await Pod.read(pod_name, kube.namespace) + pod = await PodHelper.read(pod_name, kube.namespace) assert pod ## @@ -1512,7 +1531,7 @@ async def test_bad_request_error_handled_gracefully( return_value=("memory", "256.0MiBGiB"), ) - tuning_config.deployments[0].on_failure = FailureMode.rollback + tuning_config.deployments[0].on_failure = FailureMode.shutdown connector = KubernetesConnector(config=tuning_config) adjustment = Adjustment( component_name="fiber-http/fiber-http-tuning", @@ -1520,7 +1539,7 @@ async def test_bad_request_error_handled_gracefully( value="256Mi", ) - # Catch info log messages + # Catch debug log messages messages = [] connector.logger.add(lambda m: messages.append(m.record["message"]), level=10) @@ -1588,9 +1607,9 @@ async def test_adjust_handle_error_respects_nested_config( description = await connector.adjust([adjustment]) debug(description) - deployment = await Deployment.read("fiber-http", kube.namespace) + deployment = await DeploymentHelper.read("fiber-http", kube.namespace) # check deployment was not scaled to 0 replicas (i.e., the outer-level 'shutdown' was overridden) - assert deployment.obj.spec.replicas != 0 + assert deployment.spec.replicas != 0 async def test_adjust_tuning_cpu_out_of_range(self, tuning_config): connector = KubernetesConnector(config=tuning_config) @@ -1819,7 +1838,7 @@ def kubetest_deployment_becomes_unready( ( "if [ $(cat /sys/fs/cgroup/memory/memory.limit_in_bytes) -gt 201326592 ]; " "then /bin/fiber-http; " - "else (/bin/fiber-http &); sleep 10s; kill $(jobs -p '%/bin/fiber-http'); " + "else (/bin/fiber-http &); sleep 10s; " "fi" ), ] @@ -2074,14 +2093,14 @@ async def test_get_resource_requirements_no_limits( ) -> None: servo.logging.set_level("DEBUG") - deployment = await Deployment.read("fiber-http", tuning_config.namespace) - await deployment.wait_until_ready() + deployment = await DeploymentHelper.read("fiber-http", tuning_config.namespace) + await DeploymentHelper.wait_until_ready(deployment) - pods = await deployment.get_pods() + pods = await DeploymentHelper.get_latest_pods(deployment) assert len(pods) == 1, "expected a fiber-http pod" pod = pods[0] - container = pod.get_container("fiber-http") - assert container.get_resource_requirements("cpu") == { + container = find_container(pod, "fiber-http") + assert ContainerHelper.get_resource_requirements(container, "cpu") == { servo.connectors.kubernetes.ResourceRequirement.request: "125m", servo.connectors.kubernetes.ResourceRequirement.limit: None, } @@ -2095,62 +2114,69 @@ async def test_set_resource_requirements_no_limits( ) -> None: servo.logging.set_level("DEBUG") - deployment = await Deployment.read("fiber-http", tuning_config.namespace) - await deployment.wait_until_ready() + deployment = await DeploymentHelper.read("fiber-http", tuning_config.namespace) + await asyncio.wait_for( + DeploymentHelper.wait_until_ready(deployment), timeout=300 + ) - pods = await deployment.get_pods() + pods = await DeploymentHelper.get_latest_pods(deployment) assert len(pods) == 1, "expected a fiber-http pod" pod = pods[0] - container = pod.get_container("fiber-http") - assert container.get_resource_requirements("cpu") == { + container = find_container(pod, "fiber-http") + assert ContainerHelper.get_resource_requirements(container, "cpu") == { 
servo.connectors.kubernetes.ResourceRequirement.request: "125m", servo.connectors.kubernetes.ResourceRequirement.limit: None, } # Set request and limit - container.set_resource_requirements( + ContainerHelper.set_resource_requirements( + container, "cpu", { servo.connectors.kubernetes.ResourceRequirement.request: "125m", servo.connectors.kubernetes.ResourceRequirement.limit: "250m", }, ) - container.get_resource_requirements("cpu") == { + assert ContainerHelper.get_resource_requirements(container, "cpu") == { servo.connectors.kubernetes.ResourceRequirement.request: "125m", servo.connectors.kubernetes.ResourceRequirement.limit: "250m", } # Set limit, leaving request alone - container.set_resource_requirements( - "cpu", {servo.connectors.kubernetes.ResourceRequirement.limit: "750m"} + ContainerHelper.set_resource_requirements( + container, + "cpu", + {servo.connectors.kubernetes.ResourceRequirement.limit: "750m"}, ) - assert container.get_resource_requirements("cpu") == { + assert ContainerHelper.get_resource_requirements(container, "cpu") == { servo.connectors.kubernetes.ResourceRequirement.request: "125m", servo.connectors.kubernetes.ResourceRequirement.limit: "750m", } # Set request, clearing limit - container.set_resource_requirements( + ContainerHelper.set_resource_requirements( + container, "cpu", { servo.connectors.kubernetes.ResourceRequirement.request: "250m", servo.connectors.kubernetes.ResourceRequirement.limit: None, }, ) - assert container.get_resource_requirements("cpu") == { + assert ContainerHelper.get_resource_requirements(container, "cpu") == { servo.connectors.kubernetes.ResourceRequirement.request: "250m", servo.connectors.kubernetes.ResourceRequirement.limit: None, } # Clear request and limit - container.set_resource_requirements( + ContainerHelper.set_resource_requirements( + container, "cpu", { servo.connectors.kubernetes.ResourceRequirement.request: None, servo.connectors.kubernetes.ResourceRequirement.limit: None, }, ) - assert container.get_resource_requirements("cpu") == { + assert ContainerHelper.get_resource_requirements(container, "cpu") == { servo.connectors.kubernetes.ResourceRequirement.request: None, servo.connectors.kubernetes.ResourceRequirement.limit: None, } @@ -2173,10 +2199,12 @@ async def test_initialize_tuning_pod_set_defaults_for_no_limits( await servo.connectors.kubernetes.KubernetesOptimizations.create(tuning_config) # Read the Tuning Pod and check resources - pod = await Pod.read("fiber-http-tuning", tuning_config.namespace) - container = pod.get_container("fiber-http") - cpu_requirements = container.get_resource_requirements("cpu") - memory_requirements = container.get_resource_requirements("memory") + pod = await PodHelper.read("fiber-http-tuning", tuning_config.namespace) + container = find_container(pod, "fiber-http") + cpu_requirements = ContainerHelper.get_resource_requirements(container, "cpu") + memory_requirements = ContainerHelper.get_resource_requirements( + container, "memory" + ) assert ( cpu_requirements[servo.connectors.kubernetes.ResourceRequirement.limit] @@ -2213,17 +2241,17 @@ async def test_no_cpu_limit( assert setting.value == 0.25 # Read the Tuning Pod and check resources - pod = await Pod.read("fiber-http-tuning", tuning_config.namespace) - container = pod.get_container("fiber-http") + pod = await PodHelper.read("fiber-http-tuning", tuning_config.namespace) + container = find_container(pod, "fiber-http") # CPU picks up the 1000m default and then gets adjust to 250m - assert container.get_resource_requirements("cpu") == { + 
assert ContainerHelper.get_resource_requirements(container, "cpu") == { servo.connectors.kubernetes.ResourceRequirement.request: "250m", servo.connectors.kubernetes.ResourceRequirement.limit: "1", } # Memory is untouched from the mainfest - assert container.get_resource_requirements("memory") == { + assert ContainerHelper.get_resource_requirements(container, "memory") == { servo.connectors.kubernetes.ResourceRequirement.request: "128Mi", servo.connectors.kubernetes.ResourceRequirement.limit: "128Mi", } @@ -2452,28 +2480,34 @@ async def test_preflight_cycle( assert adjusted_tuning_mem_setting.value.human_readable() == "1.0Gi" ## Read the Main Pod and check resources - main_deployment = await Deployment.read("fiber-http", tuning_config.namespace) - main_pods = await main_deployment.get_pods() - main_pod_container = main_pods[0].get_container("fiber-http") + main_deployment = await DeploymentHelper.read( + "fiber-http", tuning_config.namespace + ) + main_pods = await DeploymentHelper.get_latest_pods(main_deployment) + main_pod_container = find_container(main_pods[0], "fiber-http") ## CPU is set to 500m on both requirements - assert main_pod_container.get_resource_requirements("cpu") == { + assert ContainerHelper.get_resource_requirements(main_pod_container, "cpu") == { servo.connectors.kubernetes.ResourceRequirement.request: "125m", servo.connectors.kubernetes.ResourceRequirement.limit: "250m", } ## Read the Tuning Pod and check resources - tuning_pod = await Pod.read("fiber-http-tuning", tuning_config.namespace) - tuning_pod_container = tuning_pod.get_container("fiber-http") + tuning_pod = await PodHelper.read("fiber-http-tuning", tuning_config.namespace) + tuning_pod_container = find_container(tuning_pod, "fiber-http") ## CPU is set to 500m on both requirements - assert tuning_pod_container.get_resource_requirements("cpu") == { + assert ContainerHelper.get_resource_requirements( + tuning_pod_container, "cpu" + ) == { servo.connectors.kubernetes.ResourceRequirement.request: "500m", servo.connectors.kubernetes.ResourceRequirement.limit: "500m", } ## Memory is set to 1Gi on both requirements - assert tuning_pod_container.get_resource_requirements("memory") == { + assert ContainerHelper.get_resource_requirements( + tuning_pod_container, "memory" + ) == { servo.connectors.kubernetes.ResourceRequirement.request: "1Gi", servo.connectors.kubernetes.ResourceRequirement.limit: "1Gi", } @@ -2538,7 +2572,7 @@ async def test_preflight_cycle( @pytest.mark.usefixtures("kubernetes_asyncio_config") class TestSidecarInjection: @pytest.fixture(autouse=True) - async def _wait_for_manifests(self, kube, config): + async def _wait_for_manifests(self, kube: kubetest.client.TestClient, config): kube.wait_for_registered() config.timeout = "5m" @@ -2550,7 +2584,7 @@ def namespace(self, kube: kubetest.client.TestClient) -> str: "../manifests/sidecar_injection", files=["fiber-http_single_port.yaml"] ) @pytest.mark.parametrize( - "port, service", + "port, service_name", [ (None, "fiber-http"), (80, "fiber-http"), @@ -2558,17 +2592,13 @@ def namespace(self, kube: kubetest.client.TestClient) -> str: ], ) async def test_inject_single_port_deployment( - self, namespace: str, service: str, port: Union[str, int] + self, namespace: str, service_name: str, port: Union[str, int] ) -> None: - deployment = await servo.connectors.kubernetes.Deployment.read( - "fiber-http", namespace - ) - assert len(deployment.containers) == 1, "expected a single container" - service = await servo.connectors.kubernetes.Service.read( - "fiber-http", 
namespace - ) - assert len(service.ports) == 1 - port_obj = service.ports[0] + deployment = await DeploymentHelper.read("fiber-http", namespace) + assert len(get_containers(deployment)) == 1, "expected a single container" + service = await ServiceHelper.read(service_name, namespace) + assert len(service.spec.ports) == 1 + port_obj: V1ServicePort = service.spec.ports[0] if isinstance(port, int): assert port_obj.port == port @@ -2576,26 +2606,31 @@ async def test_inject_single_port_deployment( assert port_obj.name == port assert port_obj.target_port == 8480 - await deployment.inject_sidecar( - "opsani-envoy", ENVOY_SIDECAR_IMAGE_TAG, service="fiber-http", port=port + await DeploymentHelper.inject_sidecar( + deployment, + "opsani-envoy", + ENVOY_SIDECAR_IMAGE_TAG, + service=service_name, + port=port, ) # Examine new sidecar - await deployment.refresh() - assert len(deployment.containers) == 2, "expected an injected container" - sidecar_container = deployment.containers[1] + deployment = await DeploymentHelper.read("fiber-http", namespace) + containers = get_containers(deployment) + assert len(containers) == 2, "expected an injected container" + sidecar_container = containers[1] assert sidecar_container.name == "opsani-envoy" # Check ports and env assert sidecar_container.ports == [ - kubernetes_asyncio.client.V1ContainerPort( + V1ContainerPort( container_port=9980, host_ip=None, host_port=None, name="opsani-proxy", protocol="TCP", ), - kubernetes_asyncio.client.V1ContainerPort( + V1ContainerPort( container_port=9901, host_ip=None, host_port=None, @@ -2603,11 +2638,11 @@ async def test_inject_single_port_deployment( protocol="TCP", ), ] - assert sidecar_container.obj.env == [ - kubernetes_asyncio.client.V1EnvVar( + assert sidecar_container.env == [ + V1EnvVar( name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value="9980", value_from=None ), - kubernetes_asyncio.client.V1EnvVar( + V1EnvVar( name="OPSANI_ENVOY_PROXIED_CONTAINER_PORT", value="8480", value_from=None, @@ -2621,7 +2656,7 @@ async def test_inject_single_port_deployment( "../manifests/sidecar_injection", files=["fiber-http_multiple_ports.yaml"] ) @pytest.mark.parametrize( - "port, service, error", + "port, service_name, error", [ ( None, @@ -2637,19 +2672,15 @@ async def test_inject_single_port_deployment( async def test_inject_multiport_deployment( self, namespace: str, - service: str, + service_name: str, port: Union[str, int], error: Optional[Exception], ) -> None: - deployment = await servo.connectors.kubernetes.Deployment.read( - "fiber-http", namespace - ) - assert len(deployment.containers) == 1, "expected a single container" - service = await servo.connectors.kubernetes.Service.read( - "fiber-http", namespace - ) - assert len(service.ports) == 2 - port_obj = service.ports[0] + deployment = await DeploymentHelper.read("fiber-http", namespace) + assert len(get_containers(deployment)) == 1, "expected a single container" + service = await ServiceHelper.read(service_name, namespace) + assert len(service.spec.ports) == 2 + port_obj: V1ServicePort = service.spec.ports[0] if isinstance(port, int): assert port_obj.port == port @@ -2658,17 +2689,22 @@ async def test_inject_multiport_deployment( assert port_obj.target_port == 8480 try: - await deployment.inject_sidecar( - "opsani-envoy", ENVOY_SIDECAR_IMAGE_TAG, service="fiber-http", port=port + await DeploymentHelper.inject_sidecar( + deployment, + "opsani-envoy", + ENVOY_SIDECAR_IMAGE_TAG, + service=service_name, + port=port, ) except Exception as e: assert repr(e) == repr(error) # Examine new 
sidecar (if success is expected) if error is None: - await deployment.refresh() - assert len(deployment.containers) == 2, "expected an injected container" - sidecar_container = deployment.containers[1] + deployment = await DeploymentHelper.read("fiber-http", namespace) + containers = get_containers(deployment) + assert len(containers) == 2, "expected an injected container" + sidecar_container = containers[1] assert sidecar_container.name == "opsani-envoy" # Check ports and env @@ -2688,7 +2724,7 @@ async def test_inject_multiport_deployment( protocol="TCP", ), ] - assert sidecar_container.obj.env == [ + assert sidecar_container.env == [ kubernetes_asyncio.client.V1EnvVar( name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value="9980", @@ -2711,7 +2747,7 @@ async def test_inject_multiport_deployment( files=["fiber-http_multiple_ports_symbolic_targets.yaml"], ) @pytest.mark.parametrize( - "port, service", + "port, service_name", [ (None, "fiber-http"), (80, "fiber-http"), @@ -2719,18 +2755,14 @@ async def test_inject_multiport_deployment( ], ) async def test_inject_symbolic_target_port( - self, namespace: str, service: str, port: Union[str, int] + self, namespace: str, service_name: str, port: Union[str, int] ) -> None: """test_inject_by_source_port_name_with_symbolic_target_port""" - deployment = await servo.connectors.kubernetes.Deployment.read( - "fiber-http", namespace - ) - assert len(deployment.containers) == 1, "expected a single container" - service = await servo.connectors.kubernetes.Service.read( - "fiber-http", namespace - ) - assert len(service.ports) == 1 - port_obj = service.ports[0] + deployment = await DeploymentHelper.read("fiber-http", namespace) + assert len(get_containers(deployment)) == 1, "expected a single container" + service = await ServiceHelper.read(service_name, namespace) + assert len(service.spec.ports) == 1 + port_obj: V1ServicePort = service.spec.ports[0] if isinstance(port, int): assert port_obj.port == port @@ -2738,14 +2770,19 @@ async def test_inject_symbolic_target_port( assert port_obj.name == port assert port_obj.target_port == "collector" - await deployment.inject_sidecar( - "opsani-envoy", ENVOY_SIDECAR_IMAGE_TAG, service="fiber-http", port=port + await DeploymentHelper.inject_sidecar( + deployment, + "opsani-envoy", + ENVOY_SIDECAR_IMAGE_TAG, + service=service_name, + port=port, ) # Examine new sidecar - await deployment.refresh() - assert len(deployment.containers) == 2, "expected an injected container" - sidecar_container = deployment.containers[1] + deployment = await DeploymentHelper.read("fiber-http", namespace) + containers = get_containers(deployment) + assert len(containers) == 2, "expected an injected container" + sidecar_container = containers[1] assert sidecar_container.name == "opsani-envoy" # Check ports and env @@ -2765,7 +2802,7 @@ async def test_inject_symbolic_target_port( protocol="TCP", ), ] - assert sidecar_container.obj.env == [ + assert sidecar_container.env == [ kubernetes_asyncio.client.V1EnvVar( name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value="9980", value_from=None ), @@ -2796,9 +2833,9 @@ async def test_telemetry_hello( config: KubernetesConfiguration, servo_runner: servo.runner.Runner, ) -> None: - async with client.api_client.ApiClient() as api: - v1 = kubernetes_asyncio.client.VersionApi(api) - version_obj = await v1.get_code() + async with ApiClient() as api: + v1 = VersionApi(api) + version_obj: VersionInfo = await v1.get_code() expected = ( f'"telemetry": {{"servox.version": "{servo.__version__}", "servox.platform": 
"{platform.platform()}", ' @@ -2830,367 +2867,3 @@ async def test_telemetry_hello( assert request.called print(request.calls.last.request.content.decode()) assert expected in request.calls.last.request.content.decode() - - -## -# Tests against an ArgoCD rollout -@pytest.mark.integration -@pytest.mark.usefixtures("kubernetes_asyncio_config", "manage_rollout") -@pytest.mark.parametrize( - (), - [ - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/argo_rollouts/fiber-http-opsani-dev.yaml" - ) - ), - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/argo_rollouts/fiber-http-opsani-dev-workload-ref.yaml" - ) - ), - ], -) -class TestKubernetesConnectorRolloutIntegration: - @pytest.fixture - def namespace(self, kube: kubetest.client.TestClient) -> str: - return kube.namespace - - @pytest.fixture() - def _rollout_tuning_config( - self, tuning_config: KubernetesConfiguration - ) -> KubernetesConfiguration: - tuning_config.rollouts = [ - RolloutConfiguration.parse_obj(d) for d in tuning_config.deployments - ] - tuning_config.deployments = None - return tuning_config - - ## - # Canary Tests - async def test_create_rollout_tuning( - self, - _rollout_tuning_config: KubernetesConfiguration, - kube: kubetest.client.TestClient, - namespace: str, - ) -> None: - _rollout_tuning_config.rollouts[0].containers[ - 0 - ].static_environment_variables = {"FOO": "BAR"} - connector = KubernetesConnector(config=_rollout_tuning_config) - rol = await Rollout.read("fiber-http", namespace) - description = await connector.describe() - - assert description == Description( - components=[ - Component( - name="fiber-http/fiber-http", - settings=[ - CPU( - name="cpu", - type="range", - pinned=True, - value="125m", - min="125m", - max="875m", - step="125m", - request="125m", - limit="125m", - get=["request", "limit"], - set=["request", "limit"], - ), - Memory( - name="mem", - type="range", - pinned=True, - value=134217728, - min=134217728, - max=805306368, - step=33554432, - request=134217728, - limit=134217728, - get=["request", "limit"], - set=["request", "limit"], - ), - Replicas( - name="replicas", - type="range", - pinned=True, - value=1, - min=0, - max=99999, - step=1, - ), - EnvironmentEnumSetting( - name="INIT_MEMORY_SIZE", - type="enum", - pinned=True, - values=["32MB", "64MB", "128MB"], - ), - ], - ), - Component( - name="fiber-http/fiber-http-tuning", - settings=[ - CPU( - name="cpu", - type="range", - pinned=False, - value="125m", - min="125m", - max="875m", - step="125m", - request="125m", - limit="125m", - get=["request", "limit"], - set=["request", "limit"], - ), - Memory( - name="mem", - type="range", - pinned=False, - value=134217728, - min=134217728, - max=805306368, - step=33554432, - request=134217728, - limit=134217728, - get=["request", "limit"], - set=["request", "limit"], - ), - Replicas( - name="replicas", - type="range", - pinned=True, - value=1, - min=0, - max=1, - step=1, - ), - EnvironmentEnumSetting( - name="INIT_MEMORY_SIZE", - type="enum", - pinned=True, - values=["32MB", "64MB", "128MB"], - ), - ], - ), - ] - ) - - tuning_pod = kube.get_pods()["fiber-http-tuning"] - assert ( - tuning_pod.obj.metadata.annotations["opsani.com/opsani_tuning_for"] - == "fiber-http/fiber-http-tuning" - ) - assert tuning_pod.obj.metadata.labels["opsani_role"] == "tuning" - target_container = next( - filter(lambda c: c.name == "fiber-http", tuning_pod.obj.spec.containers) - ) - assert target_container.resources.requests == {"cpu": "125m", "memory": "128Mi"} 
- assert target_container.resources.limits == {"cpu": "125m", "memory": "128Mi"} - assert target_container.env == [ - kubernetes.client.models.V1EnvVar(name="FOO", value="BAR") - ] - - # verify tuning pod is registered as service endpoint - service = await servo.connectors.kubernetes.Service.read( - "fiber-http", namespace - ) - endpoints = await service.get_endpoints() - tuning_name = f"{_rollout_tuning_config.rollouts[0].name}-tuning" - tuning_endpoint = next( - filter( - lambda epa: epa.target_ref.name == tuning_name, - endpoints[0].subsets[0].addresses, - ), - None, - ) - if tuning_endpoint is None: - raise AssertionError( - f"Tuning pod {tuning_name} not contained in service endpoints: {endpoints}" - ) - - async def test_adjust_rol_tuning_cpu_with_settle( - self, _rollout_tuning_config, namespace - ): - # test_adjust_rollout_tuning_cpu_with_settlement - connector = KubernetesConnector(config=_rollout_tuning_config) - adjustment = Adjustment( - component_name="fiber-http/fiber-http-tuning", - setting_name="cpu", - value=".250", - ) - control = servo.Control(settlement="1s") - description = await connector.adjust([adjustment], control) - assert description is not None - setting = description.get_setting("fiber-http/fiber-http-tuning.cpu") - assert setting - assert setting.value == 0.25 - - async def test_adjust_rol_tuning_insufficient_rsrcs( - self, _rollout_tuning_config: KubernetesConfiguration, namespace - ) -> None: - # test_adjust_rollout_tuning_insufficient_resources - servo.logging.set_level("TRACE") - _rollout_tuning_config.timeout = "15s" - _rollout_tuning_config.cascade_common_settings(overwrite=True) - _rollout_tuning_config.rollouts[0].containers[0].memory.max = "256Gi" - connector = KubernetesConnector(config=_rollout_tuning_config) - - adjustment = Adjustment( - component_name="fiber-http/fiber-http-tuning", - setting_name="mem", - value="128Gi", # impossible right? - ) - with pytest.raises(AdjustmentRejectedError) as rejection_info: - description = await connector.adjust([adjustment]) - - rej_msg = str(rejection_info.value) - assert ( - "Insufficient memory." 
in rej_msg - or "Pod Node didn't have enough resource: memory" in rej_msg - ) - - -STANDARD_ROLLOUT_EXPECTED_PORTS = [ - servo.connectors.kubernetes.RolloutV1ContainerPort( - container_port=9980, - host_ip=None, - host_port=None, - name="opsani-proxy", - protocol="TCP", - ), - servo.connectors.kubernetes.RolloutV1ContainerPort( - container_port=9901, - host_ip=None, - host_port=None, - name="opsani-metrics", - protocol="TCP", - ), -] -STANDARD_ROLLOUT_EXPECTED_ENV = [ - servo.connectors.kubernetes.RolloutV1EnvVar( - name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value="9980", value_from=None - ), - servo.connectors.kubernetes.RolloutV1EnvVar( - name="OPSANI_ENVOY_PROXIED_CONTAINER_PORT", value="8480", value_from=None - ), - servo.connectors.kubernetes.RolloutV1EnvVar( - name="OPSANI_ENVOY_PROXY_METRICS_PORT", value="9901", value_from=None - ), -] -WORKLOAD_REF_ROLLOUT_EXPECTED_PORTS = [ - kubernetes_asyncio.client.V1ContainerPort( - container_port=9980, - host_ip=None, - host_port=None, - name="opsani-proxy", - protocol="TCP", - ), - kubernetes_asyncio.client.V1ContainerPort( - container_port=9901, - host_ip=None, - host_port=None, - name="opsani-metrics", - protocol="TCP", - ), -] -WORKLOAD_REF_ROLLOUT_EXPECTED_ENV = [ - kubernetes_asyncio.client.V1EnvVar( - name="OPSANI_ENVOY_PROXY_SERVICE_PORT", value="9980", value_from=None - ), - kubernetes_asyncio.client.V1EnvVar( - name="OPSANI_ENVOY_PROXIED_CONTAINER_PORT", value="8480", value_from=None - ), - kubernetes_asyncio.client.V1EnvVar( - name="OPSANI_ENVOY_PROXY_METRICS_PORT", value="9901", value_from=None - ), -] - - -@pytest.mark.integration -@pytest.mark.usefixtures("kubernetes_asyncio_config", "manage_rollout") -class TestRolloutSidecarInjection: - @pytest.fixture - def namespace(self, kube: kubetest.client.TestClient) -> str: - return kube.namespace - - @pytest.mark.parametrize( - "ports, env", - [ - pytest.param( - STANDARD_ROLLOUT_EXPECTED_PORTS, - STANDARD_ROLLOUT_EXPECTED_ENV, - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/argo_rollouts/fiber-http_single_port.yaml" - ), - ), - pytest.param( - WORKLOAD_REF_ROLLOUT_EXPECTED_PORTS, - WORKLOAD_REF_ROLLOUT_EXPECTED_ENV, - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/argo_rollouts/fiber-http-workload-ref_single_port.yaml" - ), - ), - ], - ) - @pytest.mark.parametrize( - "test_port, service", - [ - (None, "fiber-http"), - (80, "fiber-http"), - ("http", "fiber-http"), - ], - ) - async def test_inject_rollout( - # test_inject_single_port_rollout - self, - namespace: str, - service: str, - test_port: Union[str, int], - ports: List[ - Union[ - servo.connectors.kubernetes.RolloutV1ContainerPort, - kubernetes_asyncio.client.V1ContainerPort, - ] - ], - env: List[ - Union[ - servo.connectors.kubernetes.RolloutV1EnvVar, - kubernetes_asyncio.client.V1EnvVar, - ] - ], - ) -> None: - rollout = await servo.connectors.kubernetes.Rollout.read( - "fiber-http", namespace - ) - assert len(rollout.containers) == 1, "expected a single container" - service = await servo.connectors.kubernetes.Service.read( - "fiber-http", namespace - ) - assert len(service.ports) == 1 - port_obj = service.ports[0] - - if isinstance(test_port, int): - assert port_obj.port == test_port - elif isinstance(test_port, str): - assert port_obj.name == test_port - assert port_obj.target_port == 8480 - - await rollout.inject_sidecar( - "opsani-envoy", - ENVOY_SIDECAR_IMAGE_TAG, - service="fiber-http", - port=test_port, - ) - - # Examine new sidecar - await rollout.refresh() - assert 
len(rollout.containers) == 2, "expected an injected container" - sidecar_container = rollout.containers[1] - assert sidecar_container.name == "opsani-envoy" - - # Check ports and env - assert sidecar_container.ports == ports - assert sidecar_container.obj.env == env diff --git a/tests/connectors/opsani_dev_test.py b/tests/connectors/opsani_dev_test.py index 6b98a0daa..75697ad48 100644 --- a/tests/connectors/opsani_dev_test.py +++ b/tests/connectors/opsani_dev_test.py @@ -22,6 +22,8 @@ import devtools import httpx import kubernetes_asyncio +from kubernetes_asyncio.client import V1Deployment, V1Service, V1Pod +import kubetest.client import pydantic import pytest import pytz @@ -31,6 +33,7 @@ import servo import servo.cli import servo.connectors.kubernetes +from servo.connectors.kubernetes_helpers import DeploymentHelper, ServiceHelper import servo.connectors.opsani_dev import servo.connectors.prometheus @@ -39,7 +42,7 @@ def config(kube) -> servo.connectors.opsani_dev.OpsaniDevConfiguration: return servo.connectors.opsani_dev.OpsaniDevConfiguration( namespace=kube.namespace, - deployment="fiber-http", + workload_name="fiber-http", container="fiber-http", service="fiber-http", cpu=servo.connectors.kubernetes.CPU(min="125m", max="4000m", step="125m"), @@ -54,7 +57,7 @@ def config(kube) -> servo.connectors.opsani_dev.OpsaniDevConfiguration: def no_tuning_config(kube) -> servo.connectors.opsani_dev.OpsaniDevConfiguration: return servo.connectors.opsani_dev.OpsaniDevConfiguration( namespace=kube.namespace, - deployment="fiber-http", + workload_name="fiber-http", container="fiber-http", service="fiber-http", cpu=servo.connectors.kubernetes.CPU(min="125m", max="4000m", step="125m"), @@ -66,21 +69,6 @@ def no_tuning_config(kube) -> servo.connectors.opsani_dev.OpsaniDevConfiguration ) -@pytest.fixture -def rollout_config(kube) -> servo.connectors.opsani_dev.OpsaniDevConfiguration: - return servo.connectors.opsani_dev.OpsaniDevConfiguration( - namespace=kube.namespace, - rollout="fiber-http", - container="fiber-http", - service="fiber-http", - cpu=servo.connectors.kubernetes.CPU(min="125m", max="4000m", step="125m"), - memory=servo.connectors.kubernetes.Memory( - min="128 MiB", max="4.0 GiB", step="128 MiB" - ), - __optimizer__=servo.configuration.Optimizer(id="test.com/foo", token="12345"), - ) - - @pytest.fixture def checks( config: servo.connectors.opsani_dev.OpsaniDevConfiguration, @@ -95,21 +83,14 @@ def no_tuning_checks( return servo.connectors.opsani_dev.OpsaniDevChecks(config=no_tuning_config) -@pytest.fixture -def rollout_checks( - rollout_config: servo.connectors.opsani_dev.OpsaniDevConfiguration, -) -> servo.connectors.opsani_dev.OpsaniDevRolloutChecks: - return servo.connectors.opsani_dev.OpsaniDevRolloutChecks(config=rollout_config) - - class TestConfig: def test_generate(self) -> None: config = servo.connectors.opsani_dev.OpsaniDevConfiguration.generate() assert list(config.dict().keys()) == [ "description", "namespace", - "deployment", - "rollout", + "workload_name", + "workload_kind", "container", "service", "port", @@ -129,7 +110,7 @@ def test_generate_yaml(self) -> None: config = servo.connectors.opsani_dev.OpsaniDevConfiguration.generate() assert config.yaml(exclude_unset=True) == ( "namespace: default\n" - "deployment: app-deployment\n" + "workload_name: app-deployment\n" "container: main\n" "service: app\n" "cpu:\n" @@ -188,7 +169,7 @@ def test_generate_kubernetes_config(self) -> None: def test_generate_no_tuning_config(self) -> None: no_tuning_config = 
servo.connectors.opsani_dev.OpsaniDevConfiguration( namespace="test", - rollout="fiber-http", + workload_name="fiber-http", container="fiber-http", service="fiber-http", cpu=servo.connectors.kubernetes.CPU(min="125m", max="4000m", step="125m"), @@ -204,23 +185,6 @@ def test_generate_no_tuning_config(self) -> None: assert no_tuning_config.create_tuning_pod == False assert no_tuning_k_config.create_tuning_pod == False - def test_generate_rollout_config(self) -> None: - rollout_config = servo.connectors.opsani_dev.OpsaniDevConfiguration( - namespace="test", - rollout="fiber-http", - container="fiber-http", - service="fiber-http", - cpu=servo.connectors.kubernetes.CPU(min="125m", max="4000m", step="125m"), - memory=servo.connectors.kubernetes.Memory( - min="128 MiB", max="4.0 GiB", step="128 MiB" - ), - __optimizer__=servo.configuration.Optimizer( - id="test.com/foo", token="12345" - ), - ) - k_config = rollout_config.generate_kubernetes_config() - assert k_config.rollouts[0].namespace == "test" # validate config cascade - @pytest.mark.applymanifests( "../manifests/opsani_dev", @@ -236,7 +200,10 @@ class TestIntegration: class TestChecksOriginalState: @pytest.fixture(autouse=True) async def load_manifests( - self, kube, checks: servo.connectors.opsani_dev.OpsaniDevChecks, kubeconfig + self, + kube: kubetest.client.TestClient, + checks: servo.connectors.opsani_dev.OpsaniDevChecks, + kubeconfig, ) -> None: kube.wait_for_registered() checks.config.namespace = kube.namespace @@ -262,7 +229,7 @@ async def test_resource_exists( self, resource: str, checks: servo.connectors.opsani_dev.OpsaniDevChecks ) -> None: result = await checks.run_one(id=f"check_opsani_dev_kubernetes_{resource}") - assert result.success + assert result.success, f"Expected success but got: {result}" async def test_target_container_resources_within_limits( self, @@ -306,31 +273,31 @@ async def test_prometheus_configmap_exists( self, kube, checks: servo.connectors.opsani_dev.OpsaniDevChecks ) -> None: result = await checks.run_one(id=f"check_prometheus_config_map") - assert result.success + assert result.success, f"Expected success but got: {result}" async def test_prometheus_sidecar_exists( self, kube, checks: servo.connectors.opsani_dev.OpsaniDevChecks ) -> None: result = await checks.run_one(id=f"check_prometheus_sidecar_exists") - assert result.success + assert result.success, f"Expected success but got: {result}" async def test_prometheus_sidecar_is_ready( self, kube, checks: servo.connectors.opsani_dev.OpsaniDevChecks ) -> None: result = await checks.run_one(id=f"check_prometheus_sidecar_is_ready") - assert result.success + assert result.success, f"Expected success but got: {result}" async def test_check_prometheus_restart_count( self, kube, checks: servo.connectors.opsani_dev.OpsaniDevChecks ) -> None: result = await checks.run_one(id=f"check_prometheus_restart_count") - assert result.success + assert result.success, f"Expected success but got: {result}" async def test_check_prometheus_container_port( self, kube, checks: servo.connectors.opsani_dev.OpsaniDevChecks ) -> None: result = await checks.run_one(id=f"check_prometheus_container_port") - assert result.success + assert result.success, f"Expected success but got: {result}" @pytest.fixture def go_memstats_gc_sys_bytes(self) -> dict: @@ -441,12 +408,12 @@ async def test_no_tuning_process( # Connect the checks to our port forward interface no_tuning_checks.config.prometheus_base_url = prometheus_base_url - deployment = await servo.connectors.kubernetes.Deployment.read( - 
no_tuning_checks.config.deployment, no_tuning_checks.config.namespace + deployment = await DeploymentHelper.read( + no_tuning_checks.config.workload_name, no_tuning_checks.config.namespace ) assert ( deployment - ), f"failed loading deployment '{no_tuning_checks.config.deployment}' in namespace '{no_tuning_checks.config.namespace}'" + ), f"failed loading deployment '{no_tuning_checks.config.workload_name}' in namespace '{no_tuning_checks.config.namespace}'" prometheus_config = ( servo.connectors.prometheus.PrometheusConfiguration.generate( @@ -486,6 +453,9 @@ async def test_no_tuning_process( ) # Fill in the missing annotations + deployment = await DeploymentHelper.read( + no_tuning_checks.config.workload_name, no_tuning_checks.config.namespace + ) async with change_to_resource(deployment): await add_annotations_to_podspec_of_deployment( deployment, @@ -508,6 +478,9 @@ async def test_no_tuning_process( ), ) + deployment = await DeploymentHelper.read( + no_tuning_checks.config.workload_name, no_tuning_checks.config.namespace + ) async with change_to_resource(deployment): await add_labels_to_podspec_of_deployment( deployment, @@ -531,11 +504,15 @@ async def test_no_tuning_process( ) # servo.logging.set_level("DEBUG") + deployment = await DeploymentHelper.read( + no_tuning_checks.config.workload_name, no_tuning_checks.config.namespace + ) async with change_to_resource(deployment): servo.logger.info( - f"injecting Envoy sidecar to Deployment {deployment.name} PodSpec" + f"injecting Envoy sidecar to Deployment {deployment.metadata.name} PodSpec" ) - await deployment.inject_sidecar( + await DeploymentHelper.inject_sidecar( + deployment, "opsani-envoy", "opsani/envoy-proxy:latest", service="fiber-http", @@ -630,18 +607,17 @@ async def wait_for_targets_to_be_scraped() -> List[ ) # Update the port to point to the sidecar - service = await servo.connectors.kubernetes.Service.read( + service = await ServiceHelper.read( "fiber-http", no_tuning_checks.config.namespace ) - service.ports[0].target_port = envoy_proxy_port - async with change_to_resource(service): - await service.patch() + service.spec.ports[0].target_port = envoy_proxy_port + service = await ServiceHelper.patch(service) await wait_for_check_to_pass( functools.partial(no_tuning_checks.run_one, id=f"check_service_proxy") ) # Send traffic through the service and verify it shows up in Envoy - port = service.ports[0].port + port = service.spec.ports[0].port servo.logger.info( f"Sending test traffic through proxied Service fiber-http on port {port}" ) @@ -659,7 +635,7 @@ async def wait_for_targets_to_be_scraped() -> List[ kubernetes_config = no_tuning_checks.config.generate_kubernetes_config() no_tuning_opt = ( - await servo.connectors.kubernetes.DeploymentOptimization.create( + await servo.connectors.kubernetes.SaturationOptimization.create( config=kubernetes_config.deployments[0], timeout=kubernetes_config.timeout, ) @@ -750,325 +726,6 @@ async def test_check_resource_requirements_config_defaults( assert result.success, f"Expected success but got: {result}" -@pytest.mark.applymanifests( - "../manifests/opsani_dev", - files=[ - "service.yaml", - "prometheus.yaml", - ], -) -@pytest.mark.integration -@pytest.mark.usefixtures("kubeconfig", "kubernetes_asyncio_config") -class TestRolloutIntegration: - @pytest.fixture(autouse=True) - async def load_manifests( - self, - kube, - rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks, - kubeconfig, - manage_rollout # NOTE: rollout must be specified as a dependency, otherwise 
kube.wait_for_registered runs - # indefinitely waiting for the service to have endpoints from a rollout that hasn't been deployed yet - ) -> None: - kube.wait_for_registered() - rollout_checks.config.namespace = kube.namespace - - # Fake out the servo metadata in the environment - # These env vars are set by our manifests - pods = kube.get_pods(labels={"app.kubernetes.io/name": "servo"}) - assert pods, "servo is not deployed" - try: - os.environ["POD_NAME"] = list(pods.keys())[0] - os.environ["POD_NAMESPACE"] = kube.namespace - - yield - - finally: - os.environ.pop("POD_NAME", None) - os.environ.pop("POD_NAMESPACE", None) - - @pytest.mark.parametrize( - (), - [ - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout.yaml" - ) - ), - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout-workload-ref.yaml" - ) - ), - ], - ) - class TestChecksOriginalState: - @pytest.mark.parametrize("resource", ["controller", "container"]) - async def test_rollout_resource_exists( - self, - resource: str, - rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks, - ) -> None: - result = await rollout_checks.run_one( - id=f"check_opsani_dev_kubernetes_{resource}" - ) - assert result.success - - async def test_rollout_check_rsrc_limits( - self, - kube, - rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks, - rollout_config: servo.connectors.opsani_dev.OpsaniDevConfiguration, - ) -> None: - rollout_config.cpu.min = "125m" - rollout_config.cpu.max = "2000m" - rollout_config.memory.min = "128MiB" - rollout_config.memory.max = "4GiB" - result = await rollout_checks.run_one( - id=f"check_target_container_resources_within_limits" - ) - assert result.success, f"Expected success but got: {result}" - - async def test_rollout_check_rsrc_limits_fails( - self, - kube, - rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks, - rollout_config: servo.connectors.opsani_dev.OpsaniDevConfiguration, - ) -> None: - rollout_config.cpu.max = "5000m" - rollout_config.cpu.min = "4000m" - rollout_config.memory.min = "2GiB" - rollout_config.memory.max = "4GiB" - result = await rollout_checks.run_one( - id=f"check_target_container_resources_within_limits" - ) - assert result.exception - - async def test_service_routes_traffic_to_rollout( - self, - kube, - rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks, - ) -> None: - result = await rollout_checks.run_one( - id=f"check_service_routes_traffic_to_controller" - ) - assert result.success, f"Failed with message: {result.message}" - - async def test_rollout_check_resource_requirements( - self, rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks - ) -> None: - result = await rollout_checks.run_one(id=f"check_resource_requirements") - assert result.success, f"Expected success but got: {result}" - - async def test_check_rollout_selector_labels_pass( - self, rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks - ): - # simulate servo running outside of cluster - os.environ.pop("POD_NAME", None) - os.environ.pop("POD_NAMESPACE", None) - - result = await rollout_checks.run_one( - id=f"check_rollout_selector_labels", skip_requirements=True - ) - assert result.success, f"Expected success but got: {result}" - - class TestChecksOriginalStateCustomManifests: - @pytest.mark.parametrize( - (), - [ - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - 
"tests/manifests/opsani_dev/argo_rollouts/rollout_no_mem.yaml" - ) - ), - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout-workload-ref_no_mem.yaml" - ) - ), - ], - ) - async def test_rollout_check_mem_requirements_fails( - self, rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks - ): - result = await rollout_checks.run_one(id=f"check_resource_requirements") - assert result.exception, f"Expected exception but got: {result}" - - @pytest.mark.parametrize( - (), - [ - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout_no_cpu_limit.yaml" - ) - ), - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout-workload-ref_no_cpu_limit.yaml" - ) - ), - ], - ) - async def test_rollout_check_cpu_limit_fails( - self, rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks - ): - rollout_checks.config.cpu.get = [ - servo.connectors.kubernetes.ResourceRequirement.limit - ] - result = await rollout_checks.run_one(id=f"check_resource_requirements") - assert result.exception, f"Expected exception but got: {result}" - - @pytest.mark.parametrize( - (), - [ - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout_no_selector.yaml" - ) - ), - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout-workload-ref_no_selector.yaml" - ) - ), - ], - ) - async def test_check_rollout_selector_labels_fails( - self, rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks - ): - # simulate servo running outside of cluster - os.environ.pop("POD_NAME", None) - os.environ.pop("POD_NAMESPACE", None) - - result = await rollout_checks.run_one( - id=f"check_rollout_selector_labels", skip_requirements=True - ) - assert result.exception, f"Expected exception but got: {result}" - - @pytest.mark.parametrize( - (), - [ - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout_no_selector.yaml" - ) - ), - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout-workload-ref_no_selector.yaml" - ) - ), - ], - ) - async def test_check_rollout_selector_in_cluster( - self, rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks - ): - # servo running in cluster, owner reference will be set on tuning pod - result = await rollout_checks.run_one( - id=f"check_rollout_selector_labels", skip_requirements=True - ) - assert result.success, f"Expected success but got: {result}" - - # NOTE: Prometheus checks are redundant in this case, covered by standard integration tests - - @pytest.mark.parametrize( - (), - [ - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout.yaml" - ) - ), - pytest.param( - marks=pytest.mark.rollout_manifest.with_args( - "tests/manifests/opsani_dev/argo_rollouts/rollout-workload-ref.yaml" - ) - ), - ], - ) - class TestChecksUpdateState: - @pytest.fixture(autouse=True) - def set_kubeconfig_env_var( - self, kubeconfig: Union[pathlib.Path, pathlib.PurePath] - ) -> None: - with tests.helpers.environment_overrides({"KUBECONFIG": str(kubeconfig)}): - yield - - @pytest.fixture() - async def port_forward_prometheus_sidecar( - self, - kube_port_forward: Callable[[str, int], AsyncContextManager[str]], - 
rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks, - ) -> None: - async with kube_port_forward("deploy/servo", 9090) as prometheus_base_url: - # Connect the checks to our port forward interface - rollout_checks.config.prometheus_base_url = prometheus_base_url - yield - - async def test_rollout_check_annotations( - self, - rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks, - port_forward_prometheus_sidecar, - ) -> None: - servo.logging.set_level("TRACE") - result = await rollout_checks.run_one(id=f"check_controller_annotations") - assert result.exception, f"Expected exception but got: {result}" - assert ( - result.remedy - ), f"Expected failed result to have remedy. Result: {result}" - - rollout = await servo.connectors.kubernetes.Rollout.read( - rollout_checks.config.rollout, rollout_checks.config.namespace - ) - # NOTE in workload ref case, deployment is patched which doesn't immediately update - # the rollout's resource version causing change_to_resource to erroneously return early. - # wait for the resource version update before exiting the context to prevent test flakiness - pre_patch_resource_version = rollout.obj.metadata.resource_version - - async def wait_for_resource_version_update(): - while True: - await rollout.refresh() - if ( - rollout.obj.metadata.resource_version - != pre_patch_resource_version - ): - break - - async with change_to_resource(rollout): - await _run_remedy_from_check(result) - try: - await asyncio.wait_for( - wait_for_resource_version_update(), timeout=5 - ) - except asyncio.TimeoutError: - pytest.xfail("Rollout controller needs refresh, WIP") - - result = await rollout_checks.run_one(id=f"check_controller_annotations") - assert ( - result.success - ), f"Expected success after remedy was run but got: {result}" - - async def test_rollout_check_labels( - self, rollout_checks: servo.connectors.opsani_dev.OpsaniDevRolloutChecks - ) -> None: - result = await rollout_checks.run_one( - id=f"check_controller_labels", skip_requirements=True - ) - assert result.exception, f"Expected exception but got: {result}" - assert ( - result.remedy - ), f"Expected failed result to have remedy. 
Result: {result}" - await _run_remedy_from_check(result) - - result = await rollout_checks.run_one( - id=f"check_controller_labels", skip_requirements=True - ) - assert ( - result.success - ), f"Expected success after remedy was run but got: {result}" - - # TODO: port TestInstall class to rollouts by refactoring deployment specific helper code - - @pytest.mark.applymanifests( "../manifests/opsani_dev", files=[ @@ -1082,25 +739,27 @@ async def test_rollout_check_labels( class TestServiceMultiport: @pytest.fixture async def multiport_service( - self, kube, checks: servo.connectors.opsani_dev.OpsaniDevChecks + self, + kube: kubetest.client.TestClient, + checks: servo.connectors.opsani_dev.OpsaniDevChecks, ) -> None: kube.wait_for_registered() - service = await servo.connectors.kubernetes.Service.read( + service = await ServiceHelper.read( checks.config.service, checks.config.namespace ) assert service - assert len(service.ports) == 1 - assert service.find_port("http") + assert len(service.spec.ports) == 1 + assert ServiceHelper.find_port(service, "http") # Add a port port = kubernetes_asyncio.client.V1ServicePort(name="elite", port=31337) - service.obj.spec.ports.append(port) + service.spec.ports.append(port) - await service.patch() + service = await ServiceHelper.patch(service) - assert len(service.ports) == 2 - assert service.find_port("http") - assert service.find_port("elite") + assert len(service.spec.ports) == 2 + assert ServiceHelper.find_port(service, "http") + assert ServiceHelper.find_port(service, "elite") return service @@ -1126,7 +785,7 @@ async def test_resolve_port_by_name( ) -> None: checks.config.port = "elite" result = await checks.run_one(id=f"check_opsani_dev_kubernetes_service_port") - assert result.success + assert result.success, f"Expected success but got: {result}" assert result.message == "Service Port: elite 31337:31337/TCP" async def test_resolve_port_by_number( @@ -1137,7 +796,7 @@ async def test_resolve_port_by_number( ) -> None: checks.config.port = 80 result = await checks.run_one(id=f"check_opsani_dev_kubernetes_service_port") - assert result.success + assert result.success, f"Expected success but got: {result}" assert result.message == "Service Port: http 80:8480/TCP" async def test_cannot_resolve_port_by_name( @@ -1235,12 +894,12 @@ async def test_process( # Connect the checks to our port forward interface checks.config.prometheus_base_url = prometheus_base_url - deployment = await servo.connectors.kubernetes.Deployment.read( - checks.config.deployment, checks.config.namespace + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace ) assert ( deployment - ), f"failed loading deployment '{checks.config.deployment}' in namespace '{checks.config.namespace}'" + ), f"failed loading deployment '{checks.config.workload_name}' in namespace '{checks.config.namespace}'" prometheus_config = ( servo.connectors.prometheus.PrometheusConfiguration.generate( @@ -1278,6 +937,9 @@ async def test_process( ) # Fill in the missing annotations + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace + ) async with change_to_resource(deployment): await add_annotations_to_podspec_of_deployment( deployment, @@ -1298,6 +960,9 @@ async def test_process( ), ) + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace + ) async with change_to_resource(deployment): await add_labels_to_podspec_of_deployment( deployment, @@ -1321,11 +986,15 @@ async def test_process( 
) # servo.logging.set_level("DEBUG") + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace + ) async with change_to_resource(deployment): servo.logger.info( - f"injecting Envoy sidecar to Deployment {deployment.name} PodSpec" + f"injecting Envoy sidecar to Deployment {deployment.metadata.name} PodSpec" ) - await deployment.inject_sidecar( + await DeploymentHelper.inject_sidecar( + deployment, "opsani-envoy", "opsani/envoy-proxy:latest", service="fiber-http", @@ -1416,18 +1085,17 @@ async def wait_for_targets_to_be_scraped() -> List[ ) # Update the port to point to the sidecar - service = await servo.connectors.kubernetes.Service.read( + service = await ServiceHelper.read( "fiber-http", checks.config.namespace ) - service.ports[0].target_port = envoy_proxy_port - async with change_to_resource(service): - await service.patch() + service.spec.ports[0].target_port = envoy_proxy_port + await ServiceHelper.patch(service) await wait_for_check_to_pass( functools.partial(checks.run_one, id=f"check_service_proxy") ) # Send traffic through the service and verify it shows up in Envoy - port = service.ports[0].port + port = service.spec.ports[0].port servo.logger.info( f"Sending test traffic through proxied Service fiber-http on port {port}" ) @@ -1448,7 +1116,7 @@ async def wait_for_targets_to_be_scraped() -> List[ kubernetes_config = checks.config.generate_kubernetes_config() canary_opt = ( await servo.connectors.kubernetes.CanaryOptimization.create( - deployment_or_rollout_config=kubernetes_config.deployments[0], + workload_config=kubernetes_config.deployments[0], timeout=kubernetes_config.timeout, ) ) @@ -1536,12 +1204,12 @@ async def test_install_wait( # Connect the checks to our port forward interface checks.config.prometheus_base_url = prometheus_base_url - deployment = await servo.connectors.kubernetes.Deployment.read( - checks.config.deployment, checks.config.namespace + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace ) assert ( deployment - ), f"failed loading deployment '{checks.config.deployment}' in namespace '{checks.config.namespace}'" + ), f"failed loading deployment '{checks.config.workload_name}' in namespace '{checks.config.namespace}'" async def loop_checks() -> None: while True: @@ -1553,6 +1221,9 @@ async def loop_checks() -> None: servo.logger.critical( f"Attempting to remedy failing check: {devtools.pformat(next_failure)}" ) # , exception=next_failure.exception) + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace + ) await _remedy_check( next_failure.id, config=checks.config, @@ -1572,6 +1243,14 @@ async def loop_checks() -> None: ) +# TODO/FIXME The following tests are using the _remedy_check test helper instead of running the actual check remedies +# whose parallelization is what these tests were intended to cover. Ideally this is refactored to use the +# ChecksHelper.process_checks method but that requires refactoring of the check_controller_envoy_sidecars remedy. +# Said remedy currently uses a kubectl exec workaround instead of implementing the remedy in code, which makes it incompatible +# with being run by a servo not deployed inside of a kubernetes cluster.
Further complicating matters is the fact +# that remedies are being phased out which is why said refactor has not been prioritized at this time +# https://github.com/opsani/servox/blob/74ff31117b26eb13039d1d4ad6b1d430426695bc/servo/checks.py#L743 +# https://github.com/opsani/servox/blob/74ff31117b26eb13039d1d4ad6b1d430426695bc/servo/connectors/opsani_dev.py#L807 @pytest.mark.applymanifests( "../manifests/opsani_dev", files=[ @@ -1622,12 +1301,12 @@ async def test_checks_do_not_halt( # Connect the checks to our port forward interface checks.config.prometheus_base_url = prometheus_base_url - deployment = await servo.connectors.kubernetes.Deployment.read( - checks.config.deployment, checks.config.namespace + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace ) assert ( deployment - ), f"failed loading deployment '{checks.config.deployment}' in namespace '{checks.config.namespace}'" + ), f"failed loading deployment '{checks.config.workload_name}' in namespace '{checks.config.namespace}'" async def loop_checks() -> None: while True: @@ -1636,57 +1315,9 @@ async def loop_checks() -> None: servo.logger.info(f"{failures}") if failures: for failure in failures: - - await _remedy_check( - failure.id, - config=checks.config, - deployment=deployment, - kube_port_forward=kube_port_forward, - load_generator=load_generator, - checks=checks, + deployment = await DeploymentHelper.read( + checks.config.workload_name, checks.config.namespace ) - else: - break - - await asyncio.wait_for(loop_checks(), timeout=75.0) - - servo.logger.success("🥷 Opsani Dev is now deployed.") - servo.logger.critical( - "🔥 Now witness the firepower of this fully ARMED and OPERATIONAL battle station!" - ) - - @pytest.mark.namespace(create=False, name="test-checks") - @pytest.mark.xfail( - reason="Remedy flow does not complete in time with check halting" - ) - async def test_checks_timeout_with_halt( - self, - kube, - kubetest_teardown, - checks: servo.connectors.opsani_dev.OpsaniDevChecks, - kube_port_forward: Callable[[str, int], AsyncContextManager[str]], - load_generator: Callable[[], "LoadGenerator"], - tmp_path: pathlib.Path, - ) -> None: - servo.logging.set_level("INFO") - - async with kube_port_forward("deploy/servo", 9090) as prometheus_base_url: - # Connect the checks to our port forward interface - checks.config.prometheus_base_url = prometheus_base_url - - deployment = await servo.connectors.kubernetes.Deployment.read( - checks.config.deployment, checks.config.namespace - ) - assert ( - deployment - ), f"failed loading deployment '{checks.config.deployment}' in namespace '{checks.config.namespace}'" - - async def loop_checks() -> None: - while True: - results = await checks.run_all() - failures = list(filter(lambda r: r.success is False, results)) - if failures: - for failure in failures: await _remedy_check( failure.id, config=checks.config, @@ -1695,19 +1326,70 @@ async def loop_checks() -> None: load_generator=load_generator, checks=checks, ) - - # Replicate check-halting behavior, loop breaking on each failure - break else: break - await asyncio.wait_for(loop_checks(), timeout=75.0) + await asyncio.wait_for(loop_checks(), timeout=120.0) servo.logger.success("🥷 Opsani Dev is now deployed.") servo.logger.critical( "🔥 Now witness the firepower of this fully ARMED and OPERATIONAL battle station!" 
) + # TODO/FIXME this test is effectively identical to test_install_wait until refactored to use ChecksHelper.process_checks + # @pytest.mark.namespace(create=False, name="test-checks") + # @pytest.mark.xfail( + # reason="Remedy flow does not complete in time with check halting" + # ) + # async def test_checks_timeout_with_halt( + # self, + # kube, + # kubetest_teardown, + # checks: servo.connectors.opsani_dev.OpsaniDevChecks, + # kube_port_forward: Callable[[str, int], AsyncContextManager[str]], + # load_generator: Callable[[], "LoadGenerator"], + # tmp_path: pathlib.Path, + # ) -> None: + # servo.logging.set_level("INFO") + + # async with kube_port_forward("deploy/servo", 9090) as prometheus_base_url: + # # Connect the checks to our port forward interface + # checks.config.prometheus_base_url = prometheus_base_url + + # deployment = await DeploymentHelper.read( + # checks.config.workload_name, checks.config.namespace + # ) + # assert ( + # deployment + # ), f"failed loading deployment '{checks.config.workload_name}' in namespace '{checks.config.namespace}'" + + # async def loop_checks() -> None: + # while True: + # results = await checks.run_all() + # failures = list(filter(lambda r: r.success is False, results)) + # if failures: + # for failure in failures: + # await _remedy_check( + # failure.id, + # config=checks.config, + # deployment=deployment, + # kube_port_forward=kube_port_forward, + # load_generator=load_generator, + # checks=checks, + # ) + + # # Replicate check-halting behavior, loop breaking on each failure + # break + # else: + # break + + # await asyncio.wait_for(loop_checks(), timeout=75.0) + + # servo.logger.success("🥷 Opsani Dev is now deployed.") + # servo.logger.critical( + # "🔥 Now witness the firepower of this fully ARMED and OPERATIONAL battle station!" + # ) + ## # FIXME: Migrate these assertions into a better home and fix the line number mess @@ -1861,67 +1543,46 @@ async def assert_check( # TODO: Move these into library functions. Do we want replace/merge versions? 
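Editor's note: the two helpers defined below mutate the Deployment's pod template metadata in place and persist the change through DeploymentHelper.patch. As a reference point, here is a minimal standalone sketch of that read-mutate-patch pattern, assuming the DeploymentHelper API introduced elsewhere in this diff; the annotation key, workload name, namespace, and kubeconfig loading are illustrative only.

import asyncio

from kubernetes_asyncio import config

from servo.connectors.kubernetes_helpers import DeploymentHelper


async def annotate_pod_template(name: str, namespace: str) -> None:
    # Out-of-cluster auth for illustration; in-cluster code would use load_incluster_config()
    await config.load_kube_config()

    deployment = await DeploymentHelper.read(name, namespace)

    # Merge new annotations into the pod template metadata rather than replacing it
    annotations = deployment.spec.template.metadata.annotations or {}
    annotations.update({"example.opsani.com/annotated": "true"})
    deployment.spec.template.metadata.annotations = annotations

    # Persist the change; callers that need the rollout to settle can follow up with
    # DeploymentHelper.wait_until_ready(deployment), as the tests in this diff do
    await DeploymentHelper.patch(deployment)


if __name__ == "__main__":
    asyncio.run(annotate_pod_template("fiber-http", "default"))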
async def add_annotations_to_podspec_of_deployment( - deployment, annotations: Dict[str, str] + deployment: V1Deployment, annotations: Dict[str, str] ) -> None: servo.logger.info( - f"adding annotations {annotations} to PodSpec of Deployment '{deployment.name}'" + f"adding annotations {annotations} to PodSpec of Deployment '{deployment.metadata.name}'" ) - existing_annotations = deployment.pod_template_spec.metadata.annotations or {} + existing_annotations = deployment.spec.template.metadata.annotations or {} existing_annotations.update(annotations) - deployment.pod_template_spec.metadata.annotations = existing_annotations - await deployment.patch() + deployment.spec.template.metadata.annotations = existing_annotations + await DeploymentHelper.patch(deployment) -async def add_labels_to_podspec_of_deployment(deployment, labels: List[str]) -> None: +async def add_labels_to_podspec_of_deployment( + deployment: V1Deployment, labels: List[str] +) -> None: servo.logger.info( - f"adding labels {labels} to PodSpec of Deployment '{deployment.name}'" + f"adding labels {labels} to PodSpec of Deployment '{deployment.metadata.name}'" ) - existing_labels = deployment.pod_template_spec.metadata.labels or {} + existing_labels = deployment.spec.template.metadata.labels or {} existing_labels.update(labels) - deployment.pod_template_spec.metadata.labels = existing_labels - await deployment.patch() + deployment.spec.template.metadata.labels = existing_labels + await DeploymentHelper.patch(deployment) @contextlib.asynccontextmanager -async def change_to_resource(resource: servo.connectors.kubernetes.KubernetesModel): - if hasattr(resource, "observed_generation"): - observed_generation_prepatch = resource.observed_generation - metadata = resource.obj.metadata +async def change_to_resource(resource: V1Deployment): + metadata = resource.metadata + # allow the resource to be changed + yield - if isinstance(resource, servo.connectors.kubernetes.Deployment): - async with resource.rollout(): - yield - else: - # allow the resource to be changed - yield - - await resource.refresh() + resource = await DeploymentHelper.read( + resource.metadata.name, resource.metadata.namespace + ) # early exit if nothing changed - if resource.obj.metadata.resource_version == metadata.resource_version: + if resource.metadata.resource_version == metadata.resource_version: servo.logger.debug(f"exiting early: metadata resource version has not changed") return - if hasattr(resource, "observed_generation"): - while observed_generation_prepatch == resource.observed_generation: - await resource.refresh() - # wait for the change to roll out - if isinstance( - resource, - ( - servo.connectors.kubernetes.Deployment, - servo.connectors.kubernetes.Rollout, - servo.connectors.kubernetes.Pod, - ), - ): - await resource.wait_until_ready() - elif isinstance(resource, servo.connectors.kubernetes.Service): - pass - else: - servo.logger.warning( - f"no change observation strategy for Kubernetes resource of type `{resource.__class__.__name__}`" - ) + await asyncio.wait_for(DeploymentHelper.wait_until_ready(resource), timeout=300) class LoadGenerator(pydantic.BaseModel): @@ -2064,11 +1725,11 @@ async def _loop_check() -> servo.Check: async def _remedy_check( id: str, *, - config, - deployment, + config: servo.connectors.opsani_dev.OpsaniDevConfiguration, + deployment: V1Deployment, kube_port_forward, load_generator, - checks, + checks: servo.connectors.opsani_dev.OpsaniDevChecks, ) -> None: envoy_proxy_port = 
servo.connectors.opsani_dev.ENVOY_SIDECAR_DEFAULT_PORT servo.logger.warning(f"Remedying failing check '{id}'...") @@ -2107,10 +1768,13 @@ async def _remedy_check( servo.logger.critical("Step 3 - Inject Envoy sidecar container") async with change_to_resource(deployment): servo.logger.info( - f"injecting Envoy sidecar to Deployment {deployment.name} PodSpec" + f"injecting Envoy sidecar to Deployment {deployment.metadata.name} PodSpec" ) - await deployment.inject_sidecar( - "opsani-envoy", "opsani/envoy-proxy:latest", service="fiber-http" + await DeploymentHelper.inject_sidecar( + deployment, + "opsani-envoy", + "opsani/envoy-proxy:latest", + service="fiber-http", ) elif id in { @@ -2133,8 +1797,9 @@ async def _remedy_check( "Step 5 - Check that traffic metrics are coming in from Envoy" ) servo.logger.info(f"Sending test traffic to Envoy through deploy/fiber-http") + pods = await DeploymentHelper.get_latest_pods(deployment) async with kube_port_forward( - "deploy/fiber-http", envoy_proxy_port + pods[0].metadata.name, envoy_proxy_port ) as envoy_url: await load_generator(envoy_url).run_until( wait_for_check_to_pass( @@ -2147,18 +1812,15 @@ async def _remedy_check( servo.logger.critical("Step 6 - Proxy Service traffic through Envoy") # Update the port to point to the sidecar - service = await servo.connectors.kubernetes.Service.read( - "fiber-http", config.namespace - ) - service.ports[0].target_port = envoy_proxy_port - async with change_to_resource(service): - await service.patch() + service = await ServiceHelper.read("fiber-http", config.namespace) + service.spec.ports[0].target_port = envoy_proxy_port + await ServiceHelper.patch(service) elif id == "check_tuning_is_running": servo.logger.critical("Step 7 - Bring tuning Pod online") kubernetes_config = config.generate_kubernetes_config() canary_opt = await servo.connectors.kubernetes.CanaryOptimization.create( - deployment_or_rollout_config=kubernetes_config.deployments[0], + workload_config=kubernetes_config.deployments[0], timeout=kubernetes_config.timeout, ) await canary_opt.create_tuning_pod() diff --git a/tests/connectors/prometheus_test.py b/tests/connectors/prometheus_test.py index 93530b628..908d70e51 100644 --- a/tests/connectors/prometheus_test.py +++ b/tests/connectors/prometheus_test.py @@ -14,6 +14,7 @@ import respx import typer +import servo import servo.connectors.kubernetes import servo.connectors.prometheus import servo.errors @@ -653,7 +654,7 @@ async def test_fast_fail_passes( servo.logging.set_level("DEBUG") # Create a tuning instance canary_opt = await servo.connectors.kubernetes.CanaryOptimization.create( - deployment_or_rollout_config=tuning_config.deployments[0], + workload_config=tuning_config.deployments[0], timeout=tuning_config.timeout, ) async with canary_opt.temporary_tuning_pod() as _: @@ -765,7 +766,7 @@ async def test_fast_fail_fails( servo.logging.set_level("DEBUG") # Create a tuning instance canary_opt = await servo.connectors.kubernetes.CanaryOptimization.create( - deployment_or_rollout_config=tuning_config.deployments[0], + workload_config=tuning_config.deployments[0], timeout=tuning_config.timeout, ) diff --git a/tests/kubernetes_test.py b/tests/kubernetes_test.py index 4dd200da9..8700604bb 100644 --- a/tests/kubernetes_test.py +++ b/tests/kubernetes_test.py @@ -1,16 +1,23 @@ import asyncio import datetime import hashlib +from typing import cast import re import kubernetes_asyncio import kubernetes_asyncio.client +from kubernetes_asyncio.client import V1Container, V1OwnerReference import kubetest.client 
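Editor's note: the _remedy_check steps above inject the Envoy sidecar into the Deployment and then re-point the Service at the sidecar's proxy port. A condensed sketch of those two steps follows, assuming the DeploymentHelper and ServiceHelper APIs introduced in this diff; the resource names, image tag, and 9980 port value are illustrative rather than authoritative.

import asyncio

from kubernetes_asyncio import config

from servo.connectors.kubernetes_helpers import DeploymentHelper, ServiceHelper

ENVOY_PROXY_PORT = 9980  # assumed default service port for the opsani-envoy sidecar


async def proxy_service_through_envoy(namespace: str) -> None:
    await config.load_kube_config()

    # Inject the sidecar into the Deployment's pod template (remedy step 3 analogue)
    deployment = await DeploymentHelper.read("fiber-http", namespace)
    await DeploymentHelper.inject_sidecar(
        deployment,
        "opsani-envoy",
        "opsani/envoy-proxy:latest",
        service="fiber-http",
    )

    # Re-point the Service's target port at the sidecar (remedy step 6 analogue)
    service = await ServiceHelper.read("fiber-http", namespace)
    service.spec.ports[0].target_port = ENVOY_PROXY_PORT
    service = await ServiceHelper.patch(service)
    assert service.spec.ports[0].target_port == ENVOY_PROXY_PORT


if __name__ == "__main__":
    asyncio.run(proxy_service_through_envoy("default"))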
 import pydantic
 import pytest
 import servo
 import servo.connectors.kubernetes
+from servo.connectors.kubernetes_helpers import (
+    get_containers,
+    DeploymentHelper,
+    ServiceHelper,
+)
 import tests.helpers
 from servo.types.settings import _is_step_aligned, _suggest_step_aligned_values
@@ -122,123 +129,127 @@ def test_supports_nil_container_name() -> None:
 @pytest.mark.applymanifests("manifests", files=["fiber-http.yaml"])
 class TestSidecar:
-    async def test_inject_sidecar_by_port_number(self, kube) -> None:
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
-        assert len(deployment.containers) == 1
-        await deployment.inject_sidecar(
-            "whatever", "opsani/envoy-proxy:latest", port=8480
+    async def test_inject_sidecar_by_port_number(
+        self, kube: kubetest.client.TestClient
+    ) -> None:
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(deployment.spec.template.spec.containers) == 1
+        await DeploymentHelper.inject_sidecar(
+            deployment, "whatever", "opsani/envoy-proxy:latest", port=8480
         )
-        deployment_ = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
-        assert len(deployment_.containers) == 2
+        deployment_ = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(deployment_.spec.template.spec.containers) == 2

-    async def test_inject_sidecar_by_port_number_string(self, kube) -> None:
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
-        assert len(deployment.containers) == 1
-        await deployment.inject_sidecar(
-            "whatever", "opsani/envoy-proxy:latest", port="8480"
+    async def test_inject_sidecar_by_port_number_string(
+        self, kube: kubetest.client.TestClient
+    ) -> None:
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(deployment.spec.template.spec.containers) == 1
+        await DeploymentHelper.inject_sidecar(
+            deployment, "whatever", "opsani/envoy-proxy:latest", port="8480"
         )
-        deployment_ = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
-        assert len(deployment_.containers) == 2
+        deployment_ = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(get_containers(deployment_)) == 2

-    async def test_inject_sidecar_port_conflict(self, kube):
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+    async def test_inject_sidecar_port_conflict(self, kube: kubetest.client.TestClient):
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         with pytest.raises(
             ValueError,
             match="Port conflict: Deployment 'fiber-http' already exposes port 8480 through an existing container",
         ):
-            await deployment.inject_sidecar(
-                "whatever", "opsani/envoy-proxy:latest", port=8481, service_port=8480
+            await DeploymentHelper.inject_sidecar(
+                deployment,
+                "whatever",
+                "opsani/envoy-proxy:latest",
+                port=8481,
+                service_port=8480,
             )

-    async def test_inject_sidecar_by_service(self, kube) -> None:
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+    async def test_inject_sidecar_by_service(
+        self, kube: kubetest.client.TestClient
+    ) -> None:
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)

-        assert len(deployment.containers) == 1
-        await deployment.inject_sidecar(
-            "whatever", "opsani/envoy-proxy:latest", service="fiber-http"
+        assert len(deployment.spec.template.spec.containers) == 1
+        await DeploymentHelper.inject_sidecar(
+            deployment, "whatever", "opsani/envoy-proxy:latest", service="fiber-http"
         )
-        deployment_ = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
-        assert len(deployment_.containers) == 2
+        deployment_ = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(deployment_.spec.template.spec.containers) == 2

-    async def test_inject_sidecar_by_service_and_port_number(self, kube) -> None:
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+    async def test_inject_sidecar_by_service_and_port_number(
+        self, kube: kubetest.client.TestClient
+    ) -> None:
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)

-        assert len(deployment.containers) == 1
-        await deployment.inject_sidecar(
-            "whatever", "opsani/envoy-proxy:latest", service="fiber-http", port=80
+        assert len(deployment.spec.template.spec.containers) == 1
+        await DeploymentHelper.inject_sidecar(
+            deployment,
+            "whatever",
+            "opsani/envoy-proxy:latest",
+            service="fiber-http",
+            port=80,
         )
-        deployment_ = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
-        assert len(deployment_.containers) == 2
+        deployment_ = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(deployment_.spec.template.spec.containers) == 2

-    async def test_inject_sidecar_by_service_and_port_name(self, kube) -> None:
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+    async def test_inject_sidecar_by_service_and_port_name(
+        self, kube: kubetest.client.TestClient
+    ) -> None:
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)

         # NOTE: This can generate a 409 Conflict failure under CI
         for _ in range(3):
             try:
                 # change the container port so we don't conflict
-                deployment.obj.spec.template.spec.containers[0].ports[
+                deployment.spec.template.spec.containers[0].ports[
                     0
                 ].container_port = 9999
-                await deployment.replace()
+                await DeploymentHelper.patch(deployment)
                 break
             except kubernetes_asyncio.client.exceptions.ApiException as e:
                 if e.status == 409 and e.reason == "Conflict":
                     # If we have a conflict, just load the existing object and continue
-                    await deployment.refresh()
+                    deployment = await DeploymentHelper.read(
+                        "fiber-http", kube.namespace
+                    )

-        assert len(deployment.containers) == 1
-        await deployment.inject_sidecar(
-            "whatever", "opsani/envoy-proxy:latest", service="fiber-http", port="http"
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(deployment.spec.template.spec.containers) == 1
+        await DeploymentHelper.inject_sidecar(
+            deployment,
+            "whatever",
+            "opsani/envoy-proxy:latest",
+            service="fiber-http",
+            port="http",
         )
-        deployment_ = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
-        assert len(deployment_.containers) == 2
+        deployment_ = await DeploymentHelper.read("fiber-http", kube.namespace)
+        assert len(deployment_.spec.template.spec.containers) == 2

-    async def test_inject_sidecar_invalid_service_name(self, kube) -> None:
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+    async def test_inject_sidecar_invalid_service_name(
+        self, kube: kubetest.client.TestClient
+    ) -> None:
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         with pytest.raises(ValueError, match="Unknown Service 'invalid'"):
-            await deployment.inject_sidecar(
-                "whatever", "opsani/envoy-proxy:latest", service="invalid"
+            await DeploymentHelper.inject_sidecar(
+                deployment, "whatever", "opsani/envoy-proxy:latest", service="invalid"
             )

-    async def test_inject_sidecar_port_not_in_given_service(self, kube) -> None:
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+    async def test_inject_sidecar_port_not_in_given_service(
+        self, kube: kubetest.client.TestClient
+    ) -> None:
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         with pytest.raises(
             ValueError,
             match="Port 'invalid' does not exist in the Service 'fiber-http'",
         ):
-            await deployment.inject_sidecar(
+            await DeploymentHelper.inject_sidecar(
+                deployment,
                 "whatever",
                 "opsani/envoy-proxy:latest",
                 service="fiber-http",
@@ -407,16 +418,14 @@ async def test_check_resource_requirements_configured_get(
         self, config: servo.connectors.kubernetes.KubernetesConfiguration, kube
     ) -> None:
         # Zero out the CPU setting for requests and Memory setting for limits
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         assert deployment
-        container = deployment.containers[0]
+        container: V1Container = deployment.spec.template.spec.containers[0]
         container.resources = kubernetes_asyncio.client.V1ResourceRequirements(
             limits={"memory": None}, requests={"cpu": None}
         )
-        await deployment.patch()
-        await deployment.wait_until_ready()
+        await DeploymentHelper.patch(deployment)
+        await DeploymentHelper.wait_until_ready(deployment)

         # Update resource config to require limits for CPU and requests for memory
         config.deployments[0].containers[0].cpu.get = [
@@ -443,16 +452,14 @@ async def test_check_resource_requirements_fail(
         self, config: servo.connectors.kubernetes.KubernetesConfiguration, kube
     ) -> None:
         # Zero out the CPU settings
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         assert deployment
-        container = deployment.containers[0]
+        container: V1Container = deployment.spec.template.spec.containers[0]
         container.resources = kubernetes_asyncio.client.V1ResourceRequirements(
             limits={"cpu": None}, requests={"cpu": None}
         )
-        await deployment.patch()
-        await deployment.wait_until_ready()
+        await DeploymentHelper.patch(deployment)
+        await DeploymentHelper.wait_until_ready(deployment)

         # Fail the check because the CPU isn't limited
         results = await servo.connectors.kubernetes.KubernetesChecks.run(
@@ -475,16 +482,14 @@ async def test_check_resource_requirements_cpu_config_mismatch(
         self, config: servo.connectors.kubernetes.KubernetesConfiguration, kube
     ) -> None:
         # Zero out the CPU setting for requests
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         assert deployment
-        container = deployment.containers[0]
+        container: V1Container = deployment.spec.template.spec.containers[0]
         container.resources = kubernetes_asyncio.client.V1ResourceRequirements(
             requests={"cpu": None}
         )
-        await deployment.patch()
-        await deployment.wait_until_ready()
+        await DeploymentHelper.patch(deployment)
+        await DeploymentHelper.wait_until_ready(deployment)

         # Update resource config to require requests
         config.deployments[0].containers[0].cpu.get = [
@@ -512,16 +517,14 @@ async def test_check_resource_requirements_mem_config_mismatch(
         self, config: servo.connectors.kubernetes.KubernetesConfiguration, kube
     ) -> None:
         # Zero out the Memory setting for requests
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         assert deployment
-        container = deployment.containers[0]
+        container: V1Container = deployment.spec.template.spec.containers[0]
         container.resources = kubernetes_asyncio.client.V1ResourceRequirements(
             requests={"memory": None}
         )
-        await deployment.patch()
-        await deployment.wait_until_ready()
+        await DeploymentHelper.patch(deployment)
+        await DeploymentHelper.wait_until_ready(deployment)

         # Update resource config to require requests
         config.deployments[0].containers[0].memory.get = [
@@ -546,20 +549,22 @@ async def test_check_resource_requirements_mem_config_mismatch(
         ), failed_message

     async def test_deployments_are_ready(
-        self, config: servo.connectors.kubernetes.KubernetesConfiguration, kube
+        self,
+        config: servo.connectors.kubernetes.KubernetesConfiguration,
+        kube: kubetest.client.TestClient,
     ) -> None:
         # Set the CPU request implausibly high to force it into pending
-        deployment = await servo.connectors.kubernetes.Deployment.read(
-            "fiber-http", kube.namespace
-        )
+        deployment = await DeploymentHelper.read("fiber-http", kube.namespace)
         assert deployment
-        container = deployment.containers[0]
+        container: V1Container = deployment.spec.template.spec.containers[0]
         container.resources = kubernetes_asyncio.client.V1ResourceRequirements(
             limits={"cpu": None}, requests={"cpu": "500"}
         )
-        await deployment.patch()
+        await DeploymentHelper.patch(deployment)
         try:
-            await asyncio.wait_for(deployment.wait_until_ready(), timeout=2.0)
+            await asyncio.wait_for(
+                DeploymentHelper.wait_until_ready(deployment), timeout=2.0
+            )
         except asyncio.TimeoutError:
             pass

@@ -583,30 +588,29 @@ async def test_deployments_are_ready(
 @pytest.mark.applymanifests("manifests", files=["fiber-http.yaml"])
 class TestService:
     @pytest.fixture(autouse=True)
-    async def wait(self, kube) -> None:
+    async def wait(self, kube: kubetest.client.TestClient) -> None:
         kube.wait_for_registered()
         await asyncio.sleep(0.0001)

     async def test_read_service(self, kube: kubetest.client.TestClient) -> None:
-        svc = await servo.connectors.kubernetes.Service.read(
-            "fiber-http", kube.namespace
-        )
+        svc = await ServiceHelper.read("fiber-http", kube.namespace)
         assert svc
-        assert svc.obj.metadata.name == "fiber-http"
-        assert svc.obj.metadata.namespace == kube.namespace
-
-    async def test_patch_service(self, kube: kubetest.client.TestClient) -> None:
-        svc = await servo.connectors.kubernetes.Service.read(
-            "fiber-http", kube.namespace
-        )
-        assert svc
-        sentinel_value = hashlib.blake2b(
-            str(datetime.datetime.now()).encode("utf-8"), digest_size=4
-        ).hexdigest()
-        svc.obj.metadata.labels["testing.opsani.com"] = sentinel_value
-        await svc.patch()
-        await svc.refresh()
-        assert svc.obj.metadata.labels["testing.opsani.com"] == sentinel_value
+        assert svc.metadata.name == "fiber-http"
+        assert svc.metadata.namespace == kube.namespace
+
+    # Tested code is unused/deprecated
+    # async def test_patch_service(self, kube: kubetest.client.TestClient) -> None:
+    #     svc = await servo.connectors.kubernetes.Service.read(
+    #         "fiber-http", kube.namespace
+    #     )
+    #     assert svc
+    #     sentinel_value = hashlib.blake2b(
+    #         str(datetime.datetime.now()).encode("utf-8"), digest_size=4
+    #     ).hexdigest()
+    #     svc.obj.metadata.labels["testing.opsani.com"] = sentinel_value
+    #     await svc.patch()
+    #     await svc.refresh()
+    #     assert svc.obj.metadata.labels["testing.opsani.com"] == sentinel_value


 @pytest.mark.applymanifests("manifests", files=["fiber-http.yaml"])
@@ -622,9 +626,7 @@ async def test_get_latest_pods(kube: kubetest.client.TestClient) -> None:
         kube_dep.name, kube_dep.namespace, kube_dep.obj
     )

-    servo_dep = await servo.connectors.kubernetes.Deployment.read(
-        "fiber-http", kube.namespace
-    )
+    servo_dep = await DeploymentHelper.read("fiber-http", kube.namespace)

     async def wait_for_new_replicaset():
         while len(kube.get_replicasets()) < 2:
@@ -633,10 +635,10 @@ async def wait_for_new_replicaset():
     await asyncio.wait_for(wait_for_new_replicaset(), timeout=2)

     for _ in range(10):
-        latest_pods = await servo_dep.get_latest_pods()
+        latest_pods = await DeploymentHelper.get_latest_pods(servo_dep)
         # Check the latest pods aren't from the old replicaset
         for pod in latest_pods:
-            for ow in pod.obj.metadata.owner_references:
+            for ow in cast(list[V1OwnerReference], pod.metadata.owner_references):
                 assert ow.name != old_rset.obj.metadata.name

         await asyncio.sleep(0.1)
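
For orientation, the snippet below is a minimal, illustrative sketch of the stateless helper API these tests now exercise in place of the old Deployment/Service wrapper objects. It is not part of the patch: the DeploymentHelper.read/inject_sidecar and get_containers signatures are assumed from the calls visible in the hunks above, "fiber-http" is simply the manifest this suite deploys, and cluster access is assumed to be available to kubernetes_asyncio.

import asyncio

import kubernetes_asyncio.config
from servo.connectors.kubernetes_helpers import DeploymentHelper, get_containers


async def add_envoy_sidecar(namespace: str) -> None:
    # Assumes a reachable cluster; load the local kubeconfig first.
    await kubernetes_asyncio.config.load_kube_config()

    # Helpers operate on plain V1Deployment models, so state is re-read
    # rather than refreshed in place on a wrapper object.
    deployment = await DeploymentHelper.read("fiber-http", namespace)
    assert len(get_containers(deployment)) == 1

    # Resolve the target port through the Service, as the tests above do.
    await DeploymentHelper.inject_sidecar(
        deployment,
        "envoy-sidecar",
        "opsani/envoy-proxy:latest",
        service="fiber-http",
        port=80,
    )

    # Re-reading the Deployment should show the injected container
    # alongside the original one, mirroring the assertions in the tests.
    updated = await DeploymentHelper.read("fiber-http", namespace)
    assert len(get_containers(updated)) == 2


asyncio.run(add_envoy_sidecar("default"))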