From 23393fa2a7939c67d1524df177764a45f131f962 Mon Sep 17 00:00:00 2001
From: Leonid Kalev
Date: Mon, 23 Aug 2021 12:15:08 +0000
Subject: [PATCH 1/5] modify on_failure: destroy behavior for deployments

connectors/kubernetes.py: since a deployment handled by the connector is
'foreign' (not created by the connector), the connector cannot auto-restore it
if it has been deleted. Therefore, it should not delete its
deployment-under-control when the on_failure setting is configured to
'destroy' (which is actually intended as 'destroy and re-create on the next
adjust'). This patch modifies the connector to set replicas=0 instead,
consistent with the behavior of servox's predecessor (servo-k8s).
---
 servo/connectors/kubernetes.py      | 14 ++++++++++++--
 tests/connectors/kubernetes_test.py | 12 ++++++------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/servo/connectors/kubernetes.py b/servo/connectors/kubernetes.py
index 5cf621e9f..6013d64be 100644
--- a/servo/connectors/kubernetes.py
+++ b/servo/connectors/kubernetes.py
@@ -1439,6 +1439,16 @@ async def delete(self, options:kubernetes_asyncio.client.V1DeleteOptions = None)
             body=options,
         )
 
+    async def scale_to_zero(self) -> None:
+        """This is used instead of 'delete' as the 'destroy' operation to handle 'on_failure: destroy'.
+        Since the Deployment object is used as a wrapper around an existing k8s object that we did not create,
+        it shouldn't be destroyed. Instead, the deployment's pods are destroyed by scaling it to 0 replicas.
+        """
+
+        await self.refresh()
+        self.replicas = 0
+        await self.patch()
+
     async def refresh(self) -> None:
         """Refresh the underlying Kubernetes Deployment resource."""
         async with self.api_client() as api_client:
@@ -2993,14 +3003,14 @@ async def rollback(self, error: Optional[Exception] = None) -> None:
 
     async def destroy(self, error: Optional[Exception] = None) -> None:
         """
-        Initiates the asynchronous deletion of the Deployment under optimization.
+        Initiates the asynchronous deletion of all pods in the Deployment under optimization.
 
         Args:
             error: An optional error that triggered the destruction.
         """
         self.logger.info(f"adjustment failed: destroying deployment...")
         await asyncio.wait_for(
-            self.deployment.delete(),
+            self.deployment.scale_to_zero(),
             timeout=self.timeout.total_seconds(),
         )
 
diff --git a/tests/connectors/kubernetes_test.py b/tests/connectors/kubernetes_test.py
index f6dedc070..895a0f8a5 100644
--- a/tests/connectors/kubernetes_test.py
+++ b/tests/connectors/kubernetes_test.py
@@ -1256,7 +1256,9 @@ async def test_adjust_handle_error_respects_nested_config(self, config: Kubernet
         description = await connector.adjust([adjustment])
         debug(description)
 
-        await Deployment.read("fiber-http", kube.namespace)
+        deployment = await Deployment.read("fiber-http", kube.namespace)
+        # check deployment was not scaled to 0 replicas (i.e., the outer-level 'destroy' was overridden)
+        assert deployment.obj.spec.replicas != 0
 
     # async def test_apply_no_changes(self):
     #     # resource_version stays the same and early exits
@@ -1494,11 +1496,9 @@ async def test_adjust_deployment_settlement_failed(
             except AssertionError as e:
                 raise e from rejection_info.value
 
-        # Validate deployment destroyed
-        with pytest.raises(kubernetes.client.exceptions.ApiException) as not_found_error:
-            kubetest_deployment_becomes_unready.refresh()
-
-        assert not_found_error.value.status == 404 and not_found_error.value.reason == "Not Found", str(not_found_error.value)
+        # Validate deployment scaled down to 0 instances
+        kubetest_deployment_becomes_unready.refresh()
+        assert kubetest_deployment_becomes_unready.obj.spec.replicas == 0
 
     async def test_adjust_tuning_never_ready(
         self,

From 633b1dd5ac5cf97904ce4347c016817499488f8d Mon Sep 17 00:00:00 2001
From: Leonid Kalev
Date: Mon, 30 Aug 2021 14:42:27 +0000
Subject: [PATCH 2/5] connectors/kubernetes: modifications per PR review

- rename 'destroy' config option to 'shutdown'
- modify tests to match
- fix method comment
---
 servo/connectors/kubernetes.py      | 24 ++++++++++++----------
 tests/connectors/kubernetes_test.py | 14 +++++++-------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/servo/connectors/kubernetes.py b/servo/connectors/kubernetes.py
index 6013d64be..d692e3f26 100644
--- a/servo/connectors/kubernetes.py
+++ b/servo/connectors/kubernetes.py
@@ -1440,7 +1440,7 @@ async def delete(self, options:kubernetes_asyncio.client.V1DeleteOptions = None)
         )
 
     async def scale_to_zero(self) -> None:
-        """This is used instead of 'delete' as the 'destroy' operation to handle 'on_failure: destroy'.
+        """his is used as a "soft" 'delete'/'destroy'.
         Since the Deployment object is used as a wrapper around an existing k8s object that we did not create,
         it shouldn't be destroyed. Instead, the deployment's pods are destroyed by scaling it to 0 replicas.
         """
@@ -2830,8 +2830,8 @@ async def handle_error(self, error: Exception) -> bool:
         elif self.on_failure == FailureMode.rollback:
             await self.rollback(error)
 
-        elif self.on_failure == FailureMode.destroy:
-            await self.destroy(error)
+        elif self.on_failure == FailureMode.shutdown:
+            await self.shutdown(error)
 
         else:
             # Trap any new modes that need to be handled
@@ -2857,9 +2857,9 @@ async def rollback(self, error: Optional[Exception] = None) -> None:
         ...
 
     @abc.abstractmethod
-    async def destroy(self, error: Optional[Exception] = None) -> None:
+    async def shutdown(self, error: Optional[Exception] = None) -> None:
         """
-        Asynchronously destroy the Optimization.
+        Asynchronously shut down the Optimization.
 
         Args:
             error: An optional exception that contextualizes the cause of the destruction.
@@ -3001,14 +3001,14 @@ async def rollback(self, error: Optional[Exception] = None) -> None:
             timeout=self.timeout.total_seconds(),
         )
 
-    async def destroy(self, error: Optional[Exception] = None) -> None:
+    async def shutdown(self, error: Optional[Exception] = None) -> None:
         """
         Initiates the asynchronous deletion of all pods in the Deployment under optimization.
 
         Args:
             error: An optional error that triggered the destruction.
         """
-        self.logger.info(f"adjustment failed: destroying deployment...")
+        self.logger.info(f"adjustment failed: shutting down deployment's pods...")
         await asyncio.wait_for(
             self.deployment.scale_to_zero(),
             timeout=self.timeout.total_seconds(),
@@ -3653,13 +3653,17 @@ async def destroy(self, error: Optional[Exception] = None) -> None:
 
         self.logger.success(f'destroyed tuning Pod "{self.tuning_pod_name}"')
 
+    async def shutdown(self, error: Optional[Exception] = None) -> None:
+        # (not called - see handle_error(), defined for completeness)
+        await self.destroy(error)
+
     async def handle_error(self, error: Exception) -> bool:
-        if self.on_failure == FailureMode.rollback or self.on_failure == FailureMode.destroy:
+        if self.on_failure == FailureMode.rollback or self.on_failure == FailureMode.shutdown:
             # Ensure that we chain any underlying exceptions that may occur
             try:
                 if self.on_failure == FailureMode.rollback:
                     self.logger.warning(
-                        f"cannot rollback a tuning Pod: falling back to destroy: {error}"
+                        f"cannot rollback a tuning Pod: falling back to shutdown: {error}"
                     )
 
                 await asyncio.wait_for(self.destroy(), timeout=self.timeout.total_seconds())
@@ -4032,7 +4036,7 @@ class FailureMode(str, enum.Enum):
     """
 
     rollback = "rollback"
-    destroy = "destroy"
+    shutdown = "shutdown"
     ignore = "ignore"
     exception = "exception"
 
diff --git a/tests/connectors/kubernetes_test.py b/tests/connectors/kubernetes_test.py
index 895a0f8a5..05b8c1917 100644
--- a/tests/connectors/kubernetes_test.py
+++ b/tests/connectors/kubernetes_test.py
@@ -1241,7 +1241,7 @@ async def test_adjust_tuning_cpu_with_settlement(self, tuning_config, namespace,
     async def test_adjust_handle_error_respects_nested_config(self, config: KubernetesConfiguration, kube: kubetest.client.TestClient):
         config.timeout = "3s"
-        config.on_failure = FailureMode.destroy
+        config.on_failure = FailureMode.shutdown
         config.cascade_common_settings(overwrite=True)
         config.deployments[0].on_failure = FailureMode.exception
         config.deployments[0].containers[0].memory.max = "256Gi"
@@ -1257,7 +1257,7 @@ async def test_adjust_handle_error_respects_nested_config(self, config: Kubernet
         debug(description)
 
         deployment = await Deployment.read("fiber-http", kube.namespace)
-        # check deployment was not scaled to 0 replicas (i.e., the outer-level 'destroy' was overridden)
+        # check deployment was not scaled to 0 replicas (i.e., the outer-level 'shutdown' was overridden)
         assert deployment.obj.spec.replicas != 0
 
     # async def test_apply_no_changes(self):
@@ -1475,7 +1475,7 @@ async def test_adjust_deployment_settlement_failed(
     ) -> None:
         config.timeout = "15s"
         config.settlement = "20s"
-        config.deployments[0].on_failure = FailureMode.destroy
+        config.deployments[0].on_failure = FailureMode.shutdown
         connector = KubernetesConnector(config=config)
 
         adjustment = Adjustment(
@@ -1507,7 +1507,7 @@ async def test_adjust_tuning_never_ready(
         self,
         kube: kubetest.client.TestClient
     ) -> None:
         tuning_config.timeout = "30s"
-        tuning_config.on_failure = FailureMode.destroy
+        tuning_config.on_failure = FailureMode.shutdown
         tuning_config.cascade_common_settings(overwrite=True)
         connector = KubernetesConnector(config=tuning_config)
@@ -1540,7 +1540,7 @@ async def test_adjust_tuning_oom_killed(
         kube: kubetest.client.TestClient
     ) -> None:
         tuning_config.timeout = "25s"
-        tuning_config.on_failure = FailureMode.destroy
+        tuning_config.on_failure = FailureMode.shutdown
         tuning_config.cascade_common_settings(overwrite=True)
 
         connector = KubernetesConnector(config=tuning_config)
@@ -1574,8 +1574,8 @@ async def test_adjust_tuning_settlement_failed(
     ) -> None:
         tuning_config.timeout = "25s"
         tuning_config.settlement = "15s"
-        tuning_config.on_failure = FailureMode.destroy
-        tuning_config.deployments[0].on_failure = FailureMode.destroy
+        tuning_config.on_failure = FailureMode.shutdown
+        tuning_config.deployments[0].on_failure = FailureMode.shutdown
 
         connector = KubernetesConnector(config=tuning_config)

From 0deeb2d5b7af7f0f529e52b28670db84b974f173 Mon Sep 17 00:00:00 2001
From: Leonid Kalev
Date: Tue, 31 Aug 2021 15:39:46 +0000
Subject: [PATCH 3/5] allow 'on_failure: destroy' to be used, with deprecation
 warning
---
 servo/connectors/kubernetes.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/servo/connectors/kubernetes.py b/servo/connectors/kubernetes.py
index d692e3f26..55eff91c2 100644
--- a/servo/connectors/kubernetes.py
+++ b/servo/connectors/kubernetes.py
@@ -4040,6 +4040,8 @@ class FailureMode(str, enum.Enum):
     ignore = "ignore"
     exception = "exception"
 
+    destroy = "destroy"  # deprecated, but accepted as "shutdown"
+
     @classmethod
     def options(cls) -> List[str]:
         """
@@ -4114,6 +4116,13 @@ class BaseKubernetesConfiguration(servo.BaseConfiguration):
         description="Time interval to wait before considering Kubernetes operations to have failed."
     )
 
+    @pydantic.validator("on_failure")
+    def validate_failure_mode(cls, v):
+        if v == FailureMode.destroy:
+            servo.logger.warning(f"Deprecated value 'destroy' used for 'on_failure', replacing with 'shutdown'")
+            return FailureMode.shutdown
+        return v
+
 
 StrategyTypes = Union[
     OptimizationStrategy,

From 99cf856a7fc841db34bfa2c4487dace2d5fb5372 Mon Sep 17 00:00:00 2001
From: Leonid Kalev
Date: Tue, 31 Aug 2021 17:42:57 +0000
Subject: [PATCH 4/5] cosmetic updates, add unit test for destroy->shutdown
 conversion
---
 servo/connectors/kubernetes.py      |  4 ++--
 tests/connectors/kubernetes_test.py | 30 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/servo/connectors/kubernetes.py b/servo/connectors/kubernetes.py
index 55eff91c2..e74441f09 100644
--- a/servo/connectors/kubernetes.py
+++ b/servo/connectors/kubernetes.py
@@ -1440,7 +1440,7 @@ async def delete(self, options:kubernetes_asyncio.client.V1DeleteOptions = None)
         )
 
     async def scale_to_zero(self) -> None:
-        """his is used as a "soft" 'delete'/'destroy'.
+        """This is used as a "soft" 'delete'/'destroy'.
         Since the Deployment object is used as a wrapper around an existing k8s object that we did not create,
         it shouldn't be destroyed. Instead, the deployment's pods are destroyed by scaling it to 0 replicas.
         """
@@ -3666,7 +3666,7 @@ async def handle_error(self, error: Exception) -> bool:
                         f"cannot rollback a tuning Pod: falling back to shutdown: {error}"
                     )
 
-                await asyncio.wait_for(self.destroy(), timeout=self.timeout.total_seconds())
+                await asyncio.wait_for(self.shutdown(), timeout=self.timeout.total_seconds())
 
                 # create a new canary against baseline
                 self.logger.info(
diff --git a/tests/connectors/kubernetes_test.py b/tests/connectors/kubernetes_test.py
index 05b8c1917..65a280b9f 100644
--- a/tests/connectors/kubernetes_test.py
+++ b/tests/connectors/kubernetes_test.py
@@ -394,6 +394,36 @@ def test_generate_emits_human_readable_values(
         assert len(matches) == 1, "expected only a single matching node"
         assert matches[0].node == expected_value
 
+    def test_failure_mode_destroy(self) -> None:
+        """test that the old 'destroy' setting is converted to 'shutdown'"""
+        config = servo.connectors.kubernetes.KubernetesConfiguration(
+            namespace="default",
+            description="Update the namespace, deployment, etc. to match your Kubernetes cluster",
+            on_failure=servo.connectors.kubernetes.FailureMode.destroy,
+            deployments=[
+                servo.connectors.kubernetes.DeploymentConfiguration(
+                    name="fiber-http",
+                    replicas=servo.Replicas(
+                        min=1,
+                        max=2,
+                    ),
+                    containers=[
+                        servo.connectors.kubernetes.ContainerConfiguration(
+                            name="fiber-http",
+                            cpu=servo.connectors.kubernetes.CPU(
+                                min="250m", max="4000m", step="125m"
+                            ),
+                            memory=servo.connectors.kubernetes.Memory(
+                                min="128MiB", max="4.0GiB", step="128MiB"
+                            ),
+                        )
+                    ],
+                )
+            ],
+        )
+        assert config.on_failure == FailureMode.shutdown
+        assert config.deployments[0].on_failure == FailureMode.shutdown
+
 
 class TestKubernetesConnector:
     pass

From 8f671e6a34d2e3f40946fa86bf6d7ac51f4b5165 Mon Sep 17 00:00:00 2001
From: Leonid Kalev
Date: Tue, 31 Aug 2021 18:00:20 +0000
Subject: [PATCH 5/5] (cosmetic) remove obsolete comment
---
 servo/connectors/kubernetes.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/servo/connectors/kubernetes.py b/servo/connectors/kubernetes.py
index e74441f09..77b794b92 100644
--- a/servo/connectors/kubernetes.py
+++ b/servo/connectors/kubernetes.py
@@ -3654,7 +3654,6 @@ async def destroy(self, error: Optional[Exception] = None) -> None:
         self.logger.success(f'destroyed tuning Pod "{self.tuning_pod_name}"')
 
     async def shutdown(self, error: Optional[Exception] = None) -> None:
-        # (not called - see handle_error(), defined for completeness)
         await self.destroy(error)
 
     async def handle_error(self, error: Exception) -> bool: