
Commit 2c3d83a

Implement volumes force detach (#2242)
* Add comments on termination
* Simplify job termination and fix instance locking
* Fix tests
* Implement volume force detach
* Fix duration parameters parsing
* Filter out instances with detaching volumes
* Add docs on Force detach
1 parent d47ec67 commit 2c3d83a

File tree

25 files changed: +736 -161 lines changed


docs/docs/concepts/volumes.md

+13-1
@@ -84,7 +84,8 @@ Volume my-volume does not exist yet. Create the volume? [y/n]: y
 
 Once created, the volume can be attached to dev environments, tasks, and services.
 
-> When creating a network volume, `dstack` automatically creates an `ext4` filesystem on it.
+!!! info "Filesystem"
+    When creating a new network volume, `dstack` automatically creates an `ext4` filesystem on it.
 
 ### Attach a volume { #attach-network-volume }
 
@@ -137,6 +138,17 @@ and its contents will persist across runs.
 to `/workflow` (and sets that as the current working directory). Right now, `dstack` doesn't allow you to
 attach volumes to `/workflow` or any of its subdirectories.
 
+### Detach a volume { #detach-network-volume }
+
+`dstack` automatically detaches volumes from instances when a run stops.
+
+!!! info "Force detach"
+    In some clouds such as AWS, a volume may get stuck in the detaching state.
+    To fix this, you can abort the run, and `dstack` will force detach the volume.
+    `dstack` will also force detach the stuck volume automatically after `stop_duration`.
+    Note that force detaching a volume is a last-resort measure and may corrupt the file system.
+    Contact your cloud support if you experience volumes stuck in the detaching state.
+
 ### Manage volumes { #manage-network-volumes }
 
 #### List volumes
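
For illustration, a minimal Python sketch of the escalation the new docs describe: detach gracefully, poll, then force detach once `stop_duration` elapses. This is not the actual dstack server code; the helper name, the polling interval, and the `compute` object are assumptions.

# Hypothetical helper, not dstack's implementation.
# Assumes a backend `compute` exposing detach_volume(volume, instance_id, force=...)
# and is_volume_detached(volume, instance_id), as introduced by this commit.
import time

POLL_INTERVAL = 10  # seconds, illustrative value


def stop_run_volumes(compute, volumes, instance_id, stop_duration=300):
    for volume in volumes:
        compute.detach_volume(volume, instance_id)  # try a regular detach first
    deadline = time.monotonic() + stop_duration
    pending = list(volumes)
    while pending and time.monotonic() < deadline:
        pending = [v for v in pending if not compute.is_volume_detached(v, instance_id)]
        if pending:
            time.sleep(POLL_INTERVAL)
    for volume in pending:
        # Last resort, as the docs warn: a force detach may corrupt the file system.
        compute.detach_volume(volume, instance_id, force=True)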

src/dstack/_internal/cli/services/configurators/run.py

+3
@@ -99,9 +99,11 @@ def apply_configuration(
         backends=profile.backends,
         regions=profile.regions,
         instance_types=profile.instance_types,
+        reservation=profile.reservation,
         spot_policy=profile.spot_policy,
         retry_policy=profile.retry_policy,
         max_duration=profile.max_duration,
+        stop_duration=profile.stop_duration,
         max_price=profile.max_price,
         working_dir=conf.working_dir,
         run_name=conf.name,
@@ -110,6 +112,7 @@
         creation_policy=profile.creation_policy,
         termination_policy=profile.termination_policy,
         termination_policy_idle=profile.termination_idle_time,
+        idle_duration=profile.idle_duration,
     )
 
     print_run_plan(run_plan, offers_limit=configurator_args.max_offers)

src/dstack/_internal/cli/utils/run.py

+2
@@ -40,6 +40,8 @@ def print_run_plan(run_plan: RunPlan, offers_limit: int = 3):
 
     profile = run_plan.run_spec.merged_profile
     creation_policy = profile.creation_policy
+    # FIXME: This assumes the default idle_duration is the same for client and server.
+    # If the server changes idle_duration, old clients will see incorrect value.
     termination_policy, termination_idle_time = get_termination(
         profile, DEFAULT_RUN_TERMINATION_IDLE_TIME
     )

src/dstack/_internal/core/backends/aws/compute.py

+36-6
@@ -40,6 +40,7 @@
     VolumeAttachmentData,
     VolumeProvisioningData,
 )
+from dstack._internal.utils.common import get_or_error
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -630,17 +631,46 @@ def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentData
         logger.debug("Attached EBS volume %s to instance %s", volume.volume_id, instance_id)
         return VolumeAttachmentData(device_name=device_name)
 
-    def detach_volume(self, volume: Volume, instance_id: str):
+    def detach_volume(self, volume: Volume, instance_id: str, force: bool = False):
         ec2_client = self.session.client("ec2", region_name=volume.configuration.region)
 
         logger.debug("Detaching EBS volume %s from instance %s", volume.volume_id, instance_id)
-        ec2_client.detach_volume(
-            VolumeId=volume.volume_id,
-            InstanceId=instance_id,
-            Device=volume.attachment_data.device_name,
-        )
+        try:
+            ec2_client.detach_volume(
+                VolumeId=volume.volume_id,
+                InstanceId=instance_id,
+                Device=get_or_error(volume.attachment_data).device_name,
+                Force=force,
+            )
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "IncorrectState":
+                logger.info(
+                    "Skipping EBS volume %s detach since it's already detached", volume.volume_id
+                )
+                return
+            raise e
         logger.debug("Detached EBS volume %s from instance %s", volume.volume_id, instance_id)
 
+    def is_volume_detached(self, volume: Volume, instance_id: str) -> bool:
+        ec2_client = self.session.client("ec2", region_name=volume.configuration.region)
+
+        logger.debug("Getting EBS volume %s status", volume.volume_id)
+        response = ec2_client.describe_volumes(VolumeIds=[volume.volume_id])
+        volumes_infos = response.get("Volumes")
+        if len(volumes_infos) == 0:
+            logger.debug(
+                "Failed to check EBS volume %s status. Volume not found.", volume.volume_id
+            )
+            return True
+        volume_info = volumes_infos[0]
+        for attachment in volume_info["Attachments"]:
+            if attachment["InstanceId"] != instance_id:
+                continue
+            if attachment["State"] != "detached":
+                return False
+            return True
+        return True
+
 
 def get_maximum_efa_interfaces(ec2_client: botocore.client.BaseClient, instance_type: str) -> int:
     try:
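
For reference, the EBS attachment state that the new `is_volume_detached()` inspects can also be checked directly with boto3. The snippet below is only an illustration and not part of this commit; the region and volume ID are placeholders.

# Illustrative boto3 usage, not dstack code. Placeholder region and volume ID.
import boto3

ec2 = boto3.client("ec2", region_name="us-east-1")
volume_id = "vol-0123456789abcdef0"

# Each attachment reports a State: attaching, attached, detaching, or detached.
resp = ec2.describe_volumes(VolumeIds=[volume_id])
for attachment in resp["Volumes"][0].get("Attachments", []):
    print(attachment["InstanceId"], attachment["State"])

# Block until the volume has no attachments left, i.e. it is fully detached.
ec2.get_waiter("volume_available").wait(VolumeIds=[volume_id])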

src/dstack/_internal/core/backends/base/compute.py

+10-1
@@ -177,12 +177,21 @@ def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentData
         """
         raise NotImplementedError()
 
-    def detach_volume(self, volume: Volume, instance_id: str):
+    def detach_volume(self, volume: Volume, instance_id: str, force: bool = False):
         """
         Detaches a volume from the instance.
         """
         raise NotImplementedError()
 
+    def is_volume_detached(self, volume: Volume, instance_id: str) -> bool:
+        """
+        Checks if a volume was detached from the instance.
+        If `detach_volume()` may fail to detach volume,
+        this method should be overridden to check the volume status.
+        The caller will trigger force detach if the volume gets stuck detaching.
+        """
+        return True
+
     def _get_offers_cached_key(self, requirements: Optional[Requirements] = None) -> int:
         # Requirements is not hashable, so we use a hack to get arguments hash
         if requirements is None:
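
To make the new base-class contract concrete, here is a hedged sketch of how a hypothetical backend could override both methods. The `MyCloudCompute` class, its `client` attribute, and the import paths are assumptions; only the two method signatures come from the diff above.

# Hypothetical backend, invented for illustration.
from dstack._internal.core.backends.base.compute import Compute  # assumed import path
from dstack._internal.core.models.volumes import Volume  # assumed import path


class MyCloudCompute(Compute):
    def detach_volume(self, volume: Volume, instance_id: str, force: bool = False):
        # Ask the cloud API to detach, passing the force flag through if supported.
        self.client.detach_disk(volume.volume_id, instance_id, force=force)

    def is_volume_detached(self, volume: Volume, instance_id: str) -> bool:
        # Report True only once the cloud no longer lists the attachment, so the
        # caller knows whether to escalate to detach_volume(..., force=True).
        disk = self.client.get_disk(volume.volume_id)
        return instance_id not in disk.attached_instance_ids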

src/dstack/_internal/core/backends/gcp/compute.py

+1-1
@@ -641,7 +641,7 @@ def attach_volume(self, volume: Volume, instance_id: str) -> VolumeAttachmentData
         )
         return VolumeAttachmentData(device_name=device_name)
 
-    def detach_volume(self, volume: Volume, instance_id: str):
+    def detach_volume(self, volume: Volume, instance_id: str, force: bool = False):
         logger.debug(
             "Detaching persistent disk for volume %s from instance %s",
             volume.volume_id,

src/dstack/_internal/core/backends/local/compute.py

+1-1
@@ -97,5 +97,5 @@ def delete_volume(self, volume: Volume):
     def attach_volume(self, volume: Volume, instance_id: str):
         pass
 
-    def detach_volume(self, volume: Volume, instance_id: str):
+    def detach_volume(self, volume: Volume, instance_id: str, force: bool = False):
         pass

src/dstack/_internal/core/models/fleets.py

+1-1
@@ -178,7 +178,7 @@ class InstanceGroupParams(CoreModel):
         Field(
             description="Time to wait before terminating idle instances. Defaults to `5m` for runs and `3d` for fleets. Use `off` for unlimited duration"
         ),
-    ]
+    ] = None
     # Deprecated:
     termination_policy: Annotated[
         Optional[TerminationPolicy],

src/dstack/_internal/core/models/profiles.py

+43-9
@@ -15,6 +15,8 @@
 
 DEFAULT_INSTANCE_RETRY_DURATION = 60 * 60 * 24  # 24h
 
+DEFAULT_STOP_DURATION = 300
+
 
 class SpotPolicy(str, Enum):
     SPOT = "spot"
@@ -38,16 +40,27 @@ def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
     return Duration.parse(v)
 
 
-def parse_max_duration(v: Optional[Union[int, str]]) -> Optional[Union[str, int]]:
-    # TODO: [Andrey] Not sure this works (see `parse_idle_duration`)
-    if v == "off":
-        return v
+def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
+    return parse_off_duration(v)
+
+
+def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
+    return parse_off_duration(v)
+
+
+def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
+    if v == "off" or v is False:
+        return "off"
+    if v is True:
+        return None
     return parse_duration(v)
 
 
-def parse_idle_duration(v: Optional[Union[int, str]]) -> Optional[Union[str, int]]:
+def parse_idle_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
     if v is False:
         return -1
+    if v is True:
+        return None
     return parse_duration(v)
 
 
@@ -136,9 +149,24 @@ class ProfileParams(CoreModel):
         Field(description="The policy for resubmitting the run. Defaults to `false`"),
     ]
     max_duration: Annotated[
-        Optional[Union[Literal["off"], str, int]],
+        Optional[Union[Literal["off"], str, int, bool]],
         Field(
-            description="The maximum duration of a run (e.g., `2h`, `1d`, etc). After it elapses, the run is forced to stop. Defaults to `off`"
+            description=(
+                "The maximum duration of a run (e.g., `2h`, `1d`, etc)."
+                " After it elapses, the run is automatically stopped."
+                " Use `off` for unlimited duration. Defaults to `off`"
+            )
+        ),
+    ]
+    stop_duration: Annotated[
+        Optional[Union[Literal["off"], str, int, bool]],
+        Field(
+            description=(
+                "The maximum duration of a run's graceful stopping."
+                " After it elapses, the run is automatically force stopped."
+                " This includes force detaching volumes used by the run."
+                " Use `off` for unlimited duration. Defaults to `5m`"
+            )
         ),
     ]
     max_price: Annotated[
@@ -152,9 +180,12 @@ class ProfileParams(CoreModel):
         ),
     ]
     idle_duration: Annotated[
-        Optional[Union[Literal["off"], str, int]],
+        Optional[Union[Literal["off"], str, int, bool]],
        Field(
-            description="Time to wait before terminating idle instances. Defaults to `5m` for runs and `3d` for fleets. Use `off` for unlimited duration"
+            description=(
+                "Time to wait before terminating idle instances."
+                " Defaults to `5m` for runs and `3d` for fleets. Use `off` for unlimited duration"
+            )
         ),
     ]
     # Deprecated:
@@ -180,6 +211,9 @@ class ProfileParams(CoreModel):
     _validate_max_duration = validator("max_duration", pre=True, allow_reuse=True)(
         parse_max_duration
     )
+    _validate_stop_duration = validator("stop_duration", pre=True, allow_reuse=True)(
+        parse_stop_duration
+    )
     _validate_termination_idle_time = validator(
         "termination_idle_time", pre=True, allow_reuse=True
     )(parse_duration)
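
To make the parsing rules concrete, here are the values the validators above would produce for a few inputs, assuming `Duration.parse` converts `"5m"` to 300 seconds and passes plain ints through as seconds (as it does elsewhere in dstack):

# Expected behaviour of the duration validators above (assumptions noted in the lead-in).
from dstack._internal.core.models.profiles import parse_idle_duration, parse_off_duration

assert parse_off_duration("off") == "off"   # explicit off
assert parse_off_duration(False) == "off"   # False is shorthand for off
assert parse_off_duration(True) is None     # True means "use the default"
assert parse_off_duration("5m") == 300      # strings go through Duration.parse
assert parse_off_duration(90) == 90         # plain ints are treated as seconds

assert parse_idle_duration(False) == -1     # False disables idle termination
assert parse_idle_duration(True) is None    # True falls back to the default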

src/dstack/_internal/core/models/runs.py

+1
@@ -185,6 +185,7 @@ class JobSpec(CoreModel):
     image_name: str
     privileged: bool = False
     max_duration: Optional[int]
+    stop_duration: Optional[int] = None
     registry_auth: Optional[RegistryAuth]
     requirements: Requirements
     retry: Optional[Retry]

src/dstack/_internal/server/background/tasks/process_runs.py

+5
@@ -249,6 +249,11 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             JobStatus.TERMINATED,
             JobStatus.ABORTED,
         }:
+            # FIXME: This code does not expect JobStatus.TERMINATED status,
+            # so if a job transitions from RUNNING to TERMINATED,
+            # the run will transition to PENDING instead of TERMINATING.
+            # This may not be observed because process_runs is invoked more frequently
+            # than process_terminating_jobs and because most jobs usually transition to FAILED.
            pass  # unexpected, but let's ignore it
         else:
             raise ValueError(f"Unexpected job status {job_model.status}")

src/dstack/_internal/server/background/tasks/process_submitted_jobs.py

+20-10
@@ -23,6 +23,7 @@
     DEFAULT_POOL_NAME,
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
     CreationPolicy,
+    Profile,
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
@@ -52,6 +53,7 @@
 )
 from dstack._internal.server.services.jobs import (
     find_job,
+    get_instances_ids_with_detaching_volumes,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
@@ -171,16 +173,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         await session.commit()
         return
 
-    res = await session.execute(
-        select(PoolModel)
-        .where(
-            PoolModel.project_id == project.id,
-            PoolModel.name == (profile.pool_name or DEFAULT_POOL_NAME),
-            PoolModel.deleted == False,
-        )
-        .options(lazyload(PoolModel.instances))
-    )
-    pool = res.scalar_one()
+    pool = await _get_pool(session=session, project=project, profile=profile)
 
     # Submitted jobs processing happens in two steps (transactions).
     # First, the jobs gets an instance assigned (or no instance).
@@ -204,9 +197,13 @@
     # Start new transaction to see commited changes after lock
     await session.commit()
     async with get_locker().lock_ctx(InstanceModel.__tablename__, instances_ids):
+        # If another job freed the instance but is still trying to detach volumes,
+        # do not provision on it to prevent attaching volumes that are currently detaching.
+        detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
         # Refetch after lock
         res = await session.execute(
             select(InstanceModel).where(
+                InstanceModel.id.not_in(detaching_instances_ids),
                 InstanceModel.id.in_(instances_ids),
                 InstanceModel.deleted == False,
                 InstanceModel.job_id.is_(None),
@@ -331,6 +328,19 @@
     await session.commit()
 
 
+async def _get_pool(session: AsyncSession, project: ProjectModel, profile: Profile) -> PoolModel:
+    res = await session.execute(
+        select(PoolModel)
+        .where(
+            PoolModel.project_id == project.id,
+            PoolModel.name == (profile.pool_name or DEFAULT_POOL_NAME),
+            PoolModel.deleted == False,
+        )
+        .options(lazyload(PoolModel.instances))
+    )
+    return res.scalar_one()
+
+
 async def _assign_job_to_pool_instance(
     session: AsyncSession,
     pool_instances: List[InstanceModel],
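
The instance query above excludes ids returned by `get_instances_ids_with_detaching_volumes()`. As a self-contained illustration of the same SQLAlchemy filtering pattern (plain SQLAlchemy, not dstack code; the model and data are invented):

# Standalone example of excluding "still detaching" instances with not_in().
from sqlalchemy import Boolean, Column, Integer, String, create_engine, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Instance(Base):
    __tablename__ = "instances"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    deleted = Column(Boolean, default=False)
    job_id = Column(Integer, nullable=True)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all(
        [
            Instance(id=1, name="free"),
            Instance(id=2, name="detaching"),
            Instance(id=3, name="busy", job_id=42),
        ]
    )
    session.commit()
    detaching_ids = [2]  # in dstack this would come from get_instances_ids_with_detaching_volumes()
    rows = (
        session.execute(
            select(Instance).where(
                Instance.id.not_in(detaching_ids),
                Instance.deleted == False,  # noqa: E712
                Instance.job_id.is_(None),
            )
        )
        .scalars()
        .all()
    )
    print([i.name for i in rows])  # -> ['free']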
