Skip to content

Commit

Permalink
Merge pull request #532 from JuliaParallel/dead-workers
Browse files Browse the repository at this point in the history
Dead worker handling
  • Loading branch information
jpsamaroo authored Jun 17, 2024
2 parents 530805e + 4d123ac commit 684d80c
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
15 changes: 14 additions & 1 deletion src/sch/Sch.jl
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,20 @@ function cleanup_proc(state, p, log_sink)
delete!(WORKER_MONITOR_CHANS[wid], state.uid)
end
end
remote_do(_cleanup_proc, wid, state.uid, log_sink)

# If the worker process is still alive, clean it up
if wid in workers()
try
remotecall_wait(_cleanup_proc, wid, state.uid, log_sink)
catch ex
# We tolerate a ProcessExitedException, which means that the worker
# shut down halfway through cleanup.
if !(ex isa ProcessExitedException)
rethrow()
end
end
end

timespan_finish(ctx, :cleanup_proc, (;worker=wid), nothing)
end

Expand Down
13 changes: 11 additions & 2 deletions src/sch/dynamic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,18 @@ function safepoint(state)
if state.halt.set
# Force dynamic thunks and listeners to terminate
for (inp_chan,out_chan) in values(state.worker_chans)
close(inp_chan)
close(out_chan)
# Closing these channels can throw a ProcessExitedException if the
# worker has died; we tolerate that and rethrow anything else.
try
close(inp_chan)
close(out_chan)
catch ex
if !(ex isa ProcessExitedException)
rethrow()
end
end
end

# Throw out of scheduler
throw(SchedulerHaltedException())
end
Expand Down

0 comments on commit 684d80c

Please sign in to comment.