Skip to content

Commit

Permalink
GCP: allow stop/autostop for spot VMs. (#2877)
Browse files Browse the repository at this point in the history
* GCP: allow stop/autostop for spot VMs.

* CloudImplementationFeatures.STOP_SPOT_INSTANCE

* update core.py

* changes

* format

* refactoring for feature supported check

* remove special handling for TPU VM pod

* format

* Message

* add docstr

---------

Co-authored-by: Zhanghao Wu <[email protected]>
  • Loading branch information
concretevitamin and Michaelvll authored Dec 28, 2023
1 parent dc543b7 commit 20013e5
Show file tree
Hide file tree
Showing 16 changed files with 210 additions and 119 deletions.
1 change: 1 addition & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1940,6 +1940,7 @@ def check_can_clone_disk_and_override_task(
new_task_resources = []
original_cloud = handle.launched_resources.cloud
original_cloud.check_features_are_supported(
handle.launched_resources,
{clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})

assert original_cloud is not None, handle.launched_resources
Expand Down
18 changes: 16 additions & 2 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2205,7 +2205,7 @@ def provision_with_retries(
cloud_user = to_provision.cloud.get_current_user_identity()
# Skip if to_provision.cloud does not support requested features
to_provision.cloud.check_features_are_supported(
self._requested_features)
to_provision, self._requested_features)

config_dict = self._retry_zones(
to_provision,
Expand Down Expand Up @@ -3679,7 +3679,7 @@ def _teardown(self,
except filelock.Timeout as e:
raise RuntimeError(
f'Cluster {cluster_name!r} is locked by {lock_path}. '
'Check to see if it is still being launched.') from e
'Check to see if it is still being launched') from e

# --- CloudVMRayBackend Specific APIs ---

Expand Down Expand Up @@ -4278,6 +4278,20 @@ def set_autostop(self,
down: bool = False,
stream_logs: bool = True) -> None:
if idle_minutes_to_autostop is not None:

# Check if we're stopping spot
assert (handle.launched_resources is not None and
handle.launched_resources.cloud is not None), handle
if handle.launched_resources.use_spot:
# This can be triggered by, for example:
# sky launch --cloud aws --use-spot --cpus 2+ -i0 -y
# The cluster will be UP, the launch exited with code 1, and
# any stage after PRE_EXEC is not executed.
cloud = handle.launched_resources.cloud
cloud.check_features_are_supported(
handle.launched_resources,
{clouds.CloudImplementationFeatures.STOP})

code = autostop_lib.AutostopCodeGen.set_autostop(
idle_minutes_to_autostop, self.NAME, down)
returncode, _, stderr = self.run_on_head(handle,
Expand Down
13 changes: 10 additions & 3 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,16 @@ class AWS(clouds.Cloud):
)

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return dict()
def _unsupported_features_for_resources(
    cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Map each feature unsupported for `resources` to a reason string.

    Only STOP is ever reported, and only when `resources.use_spot` is
    truthy; otherwise the returned dict is empty.
    """
    unsupported: Dict[clouds.CloudImplementationFeatures, str] = {}
    if resources.use_spot:
        unsupported[clouds.CloudImplementationFeatures.STOP] = (
            'Stopping spot instances is currently not supported on'
            f' {cls._REPR}.')
    return unsupported

@classmethod
def max_cluster_name_length(cls) -> Optional[int]:
Expand Down
14 changes: 10 additions & 4 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,18 @@ class Azure(clouds.Cloud):
_INDENT_PREFIX = ' ' * 4

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return {
def _unsupported_features_for_resources(
cls, resources: 'resources.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
features = {
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {cls._REPR}.'),
(f'Migrating disk is currently not supported on {cls._REPR}.'),
}
if resources.use_spot:
features[clouds.CloudImplementationFeatures.STOP] = (
'Stopping spot instances is currently not supported on'
f' {cls._REPR}.')
return features

@classmethod
def max_cluster_name_length(cls) -> int:
Expand Down
61 changes: 35 additions & 26 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,11 @@ class CloudImplementationFeatures(enum.Enum):
Used by Cloud.check_features_are_supported().
Note: If any new feature is added, please check and update
NOTE: If any new feature is added, please check and update
_unsupported_features_for_resources in all clouds to make sure the
check_features_are_supported() works as expected.
"""
STOP = 'stop'
AUTOSTOP = 'autostop'
STOP = 'stop' # Includes both stop and autostop.
MULTI_NODE = 'multi-node'
CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster'
DOCKER_IMAGE = 'docker_image'
Expand Down Expand Up @@ -68,20 +67,6 @@ class Cloud:
_REPR = '<Cloud>'
_DEFAULT_DISK_TIER = 'medium'

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[CloudImplementationFeatures, str]:
"""The features not supported by the cloud implementation.
This method is used by check_features_are_supported() to check if the
cloud implementation supports all the requested features.
Returns:
A dict of {feature: reason} for the features not supported by the
cloud implementation.
"""
raise NotImplementedError

@classmethod
def max_cluster_name_length(cls) -> Optional[int]:
"""Returns the maximum length limit of a cluster name.
Expand Down Expand Up @@ -304,7 +289,8 @@ def get_feasible_launchable_resources(
CloudImplementationFeatures.MULTI_NODE)

try:
self.check_features_are_supported(resources_required_features)
self.check_features_are_supported(resources,
resources_required_features)
except exceptions.NotSupportedError:
# TODO(zhwu): The resources are now silently filtered out. We
# should have some logging telling the user why the resources
Expand Down Expand Up @@ -457,28 +443,35 @@ def need_cleanup_after_preemption(

@classmethod
def check_features_are_supported(
cls, requested_features: Set[CloudImplementationFeatures]) -> None:
cls, resources: 'resources_lib.Resources',
requested_features: Set[CloudImplementationFeatures]) -> None:
"""Errors out if the cloud does not support all requested features.
For instance, Lambda Cloud does not support autostop, so
Lambda.check_features_are_supported({
CloudImplementationFeatures.AUTOSTOP
For instance, Lambda Cloud does not support stop, so
Lambda.check_features_are_supported(to_provision, {
CloudImplementationFeatures.STOP
}) raises the exception.
Resources are also passed as some features may depend on the resources
requested. For example, some clouds support stopping normal instances,
but not spot instances, e.g., AWS; or, GCP supports stopping TPU VMs but
not TPU VM pods.
Raises:
exceptions.NotSupportedError: If the cloud does not support all the
requested features.
"""
unsupported_features2reason = cls._cloud_unsupported_features()
unsupported_features2reason = cls._unsupported_features_for_resources(
resources)

# Docker image is not compatible with ssh proxy command.
if skypilot_config.get_nested(
(str(cls._REPR).lower(), 'ssh_proxy_command'), None) is not None:
unsupported_features2reason.update({
CloudImplementationFeatures.DOCKER_IMAGE: (
f'Docker image is not supported in {cls._REPR} when proxy '
'command is set. Please remove proxy command in the config.'
),
f'Docker image is currently not supported on {cls._REPR} '
'when proxy command is set. Please remove proxy command in '
'the config.'),
})

unsupported_features = set(unsupported_features2reason.keys())
Expand All @@ -494,6 +487,22 @@ def check_features_are_supported(
f'The following features are not supported by {cls._REPR}:'
'\n\t' + table.get_string().replace('\n', '\n\t'))

@classmethod
def _unsupported_features_for_resources(
    cls, resources: 'resources_lib.Resources'
) -> Dict[CloudImplementationFeatures, str]:
    """Report which features cannot be used with the given resources.

    check_features_are_supported() calls this hook to learn which of the
    requested features are unavailable. The base implementation has no
    answer and always raises.

    Args:
        resources: The resources the feature check is performed against.

    Returns:
        A dict of {feature: reason}, one entry per unsupported feature.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    # The argument is part of the hook's signature but unused here.
    del resources
    raise NotImplementedError

@classmethod
def check_cluster_name_is_valid(cls, cluster_name: str) -> None:
"""Errors out on invalid cluster names not supported by cloud providers.
Expand Down
11 changes: 9 additions & 2 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,15 @@ class GCP(clouds.Cloud):
)

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
def _unsupported_features_for_resources(
    cls, resources: 'resources.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Map each feature unsupported for `resources` to a reason string.

    STOP is reported when tpu_utils.is_tpu_vm_pod(resources) is truthy;
    in every other case the returned dict is empty.
    """
    if not tpu_utils.is_tpu_vm_pod(resources):
        return {}
    reason = (
        'TPU VM pods cannot be stopped. Please refer to: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources'
    )
    return {clouds.CloudImplementationFeatures.STOP: reason}

@classmethod
Expand Down
21 changes: 14 additions & 7 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,27 @@ class IBM(clouds.Cloud):
_regions: List[clouds.Region] = []

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return {
def _unsupported_features_for_resources(
cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
features = {
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {cls._REPR}.'),
(f'Migrating disk is currently not supported on {cls._REPR}.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is not supported in {cls._REPR}. '
(f'Docker image is currently not supported on {cls._REPR}. '
'You can try running docker command inside the '
'`run` section in task.yaml.'),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
(f'Custom disk tier is not supported in {cls._REPR}.'),
(f'Custom disk tier is currently not supported on {cls._REPR}.'
),
clouds.CloudImplementationFeatures.OPEN_PORTS:
(f'Opening ports is not supported in {cls._REPR}.'),
(f'Opening ports is currently not supported on {cls._REPR}.'),
}
if resources.use_spot:
features[clouds.CloudImplementationFeatures.STOP] = (
'Stopping spot instances is currently not supported on'
f' {cls._REPR}.')
return features

@classmethod
def max_cluster_name_length(cls) -> Optional[int]:
Expand Down
7 changes: 3 additions & 4 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ class Kubernetes(clouds.Cloud):
# https://kubernetes.io/blog/2022/12/05/forensic-container-checkpointing-alpha/ # pylint: disable=line-too-long
clouds.CloudImplementationFeatures.STOP: 'Kubernetes does not '
'support stopping VMs.',
clouds.CloudImplementationFeatures.AUTOSTOP: 'Kubernetes does not '
'support stopping VMs.',
clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are '
'not supported in '
'Kubernetes.',
Expand All @@ -75,8 +73,9 @@ class Kubernetes(clouds.Cloud):
IMAGE_GPU = 'skypilot:gpu-ubuntu-2004'

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
def _unsupported_features_for_resources(
    cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Return the features this cloud does not support, with reasons.

    The answer does not depend on `resources`; it is always the contents
    of the class-level `_CLOUD_UNSUPPORTED_FEATURES` table.

    Returns:
        A fresh dict of {feature: reason}. Callers may modify it freely.
    """
    del resources  # Unused: the table is the same for all resources.
    # Hand out a copy: check_features_are_supported() calls .update() on
    # the returned dict, which must not leak into the class-level table.
    return dict(cls._CLOUD_UNSUPPORTED_FEATURES)

@classmethod
Expand Down
13 changes: 7 additions & 6 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,21 @@ class Lambda(clouds.Cloud):
# STOP: The Lambda cloud provider does not support stopping VMs (STOP covers autostop too).
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP: 'Lambda cloud does not support stopping VMs.',
clouds.CloudImplementationFeatures.AUTOSTOP: 'Lambda cloud does not support stopping VMs.',
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is not supported in {_REPR}.',
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER: f'Migrating disk is currently not supported on {_REPR}.',
clouds.CloudImplementationFeatures.DOCKER_IMAGE: (
f'Docker image is not supported in {_REPR}. '
f'Docker image is currently not supported on {_REPR}. '
'You can try running docker command inside the `run` section in task.yaml.'
),
clouds.CloudImplementationFeatures.SPOT_INSTANCE: f'Spot instances are not supported in {_REPR}.',
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
clouds.CloudImplementationFeatures.OPEN_PORTS: f'Opening ports is not supported in {_REPR}.',
clouds.CloudImplementationFeatures.OPEN_PORTS: f'Opening ports is currently not supported on {_REPR}.',
}

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
def _unsupported_features_for_resources(
    cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Return the features this cloud does not support, with reasons.

    The answer does not depend on `resources`; it is always the contents
    of the class-level `_CLOUD_UNSUPPORTED_FEATURES` table.

    Returns:
        A fresh dict of {feature: reason}. Callers may modify it freely.
    """
    del resources  # unused
    # Hand out a copy: check_features_are_supported() calls .update() on
    # the returned dict (e.g. to replace the DOCKER_IMAGE reason), and
    # that must not overwrite the class-level table.
    return dict(cls._CLOUD_UNSUPPORTED_FEATURES)

@classmethod
Expand Down
7 changes: 3 additions & 4 deletions sky/clouds/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ class Local(clouds.Cloud):
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.STOP:
('Local cloud does not support stopping instances.'),
clouds.CloudImplementationFeatures.AUTOSTOP:
('Local cloud does not support stopping instances.'),
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
('Migrating disk is not supported for Local.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
Expand All @@ -43,8 +41,9 @@ class Local(clouds.Cloud):
}

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
def _unsupported_features_for_resources(
    cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Return the features this cloud does not support, with reasons.

    The answer does not depend on `resources`; it is always the contents
    of the class-level `_CLOUD_UNSUPPORTED_FEATURES` table.

    Returns:
        A fresh dict of {feature: reason}. Callers may modify it freely.
    """
    del resources  # Unused: the table is the same for all resources.
    # Hand out a copy: check_features_are_supported() calls .update() on
    # the returned dict, which must not leak into the class-level table.
    return dict(cls._CLOUD_UNSUPPORTED_FEATURES)

@classmethod
Expand Down
18 changes: 12 additions & 6 deletions sky/clouds/oci.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,24 @@ class OCI(clouds.Cloud):
_INDENT_PREFIX = ' '

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return {
def _unsupported_features_for_resources(
cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
features = {
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {cls._REPR}.'),
(f'Migrating disk is currently not supported on {cls._REPR}.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is not supported in {cls._REPR}. '
(f'Docker image is currently not supported on {cls._REPR}. '
'You can try running docker command inside the '
'`run` section in task.yaml.'),
clouds.CloudImplementationFeatures.OPEN_PORTS:
(f'Opening ports is not supported in {cls._REPR}.'),
(f'Opening ports is currently not supported on {cls._REPR}.'),
}
if resources.use_spot:
features[clouds.CloudImplementationFeatures.STOP] = (
f'Stopping spot instances is currently not supported on '
f'{cls._REPR}.')
return features

@classmethod
def max_cluster_name_length(cls) -> Optional[int]:
Expand Down
18 changes: 12 additions & 6 deletions sky/clouds/scp.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,31 @@ class SCP(clouds.Cloud):
_CLOUD_UNSUPPORTED_FEATURES = {
clouds.CloudImplementationFeatures.MULTI_NODE: _MULTI_NODE,
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
(f'Migrating disk is not supported in {_REPR}.'),
(f'Migrating disk is currently not supported on {_REPR}.'),
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
(f'Docker image is not supported in {_REPR}. '
(f'Docker image is currently not supported on {_REPR}. '
'You can try running docker command inside the '
'`run` section in task.yaml.'),
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
(f'Spot instances are not supported in {_REPR}.'),
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
(f'Custom disk tiers are not supported in {_REPR}.'),
clouds.CloudImplementationFeatures.OPEN_PORTS:
(f'Opening ports is not supported in {_REPR}.'),
(f'Opening ports is currently not supported on {_REPR}.'),
}

_INDENT_PREFIX = ' '

@classmethod
def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return cls._CLOUD_UNSUPPORTED_FEATURES
def _unsupported_features_for_resources(
    cls, resources: 'resources_lib.Resources'
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Map each feature unsupported for `resources` to a reason string.

    Starts from the class-level `_CLOUD_UNSUPPORTED_FEATURES` table and
    adds STOP when `resources.use_spot` is truthy.

    Returns:
        A fresh dict of {feature: reason}; the class-level table is never
        modified.
    """
    # Work on a copy. Writing the STOP entry into the class-level dict
    # would make STOP look unsupported for every later call, including
    # calls for non-spot resources.
    features = dict(cls._CLOUD_UNSUPPORTED_FEATURES)
    if resources.use_spot:
        features[clouds.CloudImplementationFeatures.STOP] = (
            'Stopping spot instances is currently not supported on'
            f' {cls._REPR}.')
    return features

@classmethod
def max_cluster_name_length(cls) -> Optional[int]:
Expand Down
Loading

0 comments on commit 20013e5

Please sign in to comment.