
Commit

Merge pull request #258 from JuliaParallel/jps/wavetoy-fix
Fix eager API thunk retention
jpsamaroo authored Aug 13, 2021
2 parents e26b2ef + 03f7aa4 commit 5a6c939
Showing 7 changed files with 217 additions and 148 deletions.
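
The substance of the fix: the scheduler's `thunk_dict` previously held strong references to every `Thunk`, so eagerly-submitted thunks (and their cached results) could never be collected. This commit stores `WeakThunk` wrappers in the scheduler and hands callers a `DRef` that keeps each thunk alive only as long as someone outside the scheduler still holds it. As a rough illustration of the weak-handle side of that pattern (a hypothetical sketch built on Base's `WeakRef`; Dagger's actual `WeakThunk` implementation differs in detail):

# Hypothetical sketch; not Dagger's real WeakThunk/unwrap_weak_checked.
mutable struct FakeThunk
    id::Int
    f::Any
end

struct FakeWeakThunk
    w::WeakRef
end
FakeWeakThunk(t::FakeThunk) = FakeWeakThunk(WeakRef(t))

# May return `nothing` once the thunk has been garbage-collected.
unwrap(wt::FakeWeakThunk) = wt.w.value

# Analogue of unwrap_weak_checked: fail loudly instead of using `nothing`.
function unwrap_checked(wt::FakeWeakThunk)
    t = unwrap(wt)
    t === nothing && error("Thunk was collected; the caller failed to preserve it")
    return t
end

t = FakeThunk(1, sum)
wt = FakeWeakThunk(t)              # scheduler-side weak handle
@assert unwrap_checked(wt).id == 1
# Once no strong reference to `t` remains, GC may reclaim it, and
# unwrap_checked(wt) will then throw instead of returning stale state.
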
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,6 +1,6 @@
name = "Dagger"
uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
version = "0.12.3"
version = "0.12.4"

[deps]
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
105 changes: 34 additions & 71 deletions src/sch/Sch.jl
@@ -1,11 +1,11 @@
module Sch

using Distributed
import MemPool: DRef
import MemPool: DRef, poolset

import ..Dagger
import ..Dagger: Context, Processor, Thunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, AnyScope
import ..Dagger: order, free!, dependents, noffspring, istask, inputs, unwrap_weak, affinity, tochunk, @dbg, @logmsg, timespan_start, timespan_end, unrelease, procs, move, capacity, chunktype, default_enabled, get_processors, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime
import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, AnyScope
import ..Dagger: order, free!, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, @dbg, @logmsg, timespan_start, timespan_end, unrelease, procs, move, capacity, chunktype, default_enabled, get_processors, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime

const OneToMany = Dict{Thunk, Set{Thunk}}

@@ -45,7 +45,7 @@ Fields:
- `cache::Dict{Thunk, Any}` - Maps from a finished `Thunk` to its cached result, often a DRef
- `running::Set{Thunk}` - The set of currently-running `Thunk`s
- `running_on::Dict{Thunk,OSProc}` - Map from `Thunk` to the OS process executing it
- `thunk_dict::Dict{Int, Any}` - Maps from thunk IDs to a `Thunk`
- `thunk_dict::Dict{Int, WeakThunk}` - Maps from thunk IDs to a `Thunk`
- `node_order::Any` - Function that returns the order of a thunk
- `worker_pressure::Dict{Int,Dict{Type,UInt64}}` - Cache of worker pressure
- `worker_capacity::Dict{Int,Dict{Type,UInt64}}` - Maps from worker ID to capacity
@@ -67,7 +67,7 @@ struct ComputeState
cache::WeakKeyDict{Thunk, Any}
running::Set{Thunk}
running_on::Dict{Thunk,OSProc}
thunk_dict::Dict{Int, Any}
thunk_dict::Dict{Int, WeakThunk}
node_order::Any
worker_pressure::Dict{Int,Dict{Type,UInt64}}
worker_capacity::Dict{Int,Dict{Type,UInt64}}
@@ -90,7 +90,7 @@ function start_state(deps::Dict, node_order, chan)
Dict{Thunk, Any}(),
Set{Thunk}(),
Dict{Thunk,OSProc}(),
Dict{Int, Thunk}(),
Dict{Int, WeakThunk}(),
node_order,
Dict{Int,Dict{Type,UInt64}}(),
Dict{Int,Dict{Type,UInt64}}(),
@@ -333,15 +333,22 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())

# setup thunk_dict mappings
for node in filter(istask, keys(deps))
state.thunk_dict[node.id] = node
state.thunk_dict[node.id] = WeakThunk(node)
for dep in deps[node]
state.thunk_dict[dep.id] = dep
state.thunk_dict[dep.id] = WeakThunk(dep)
end
end

# Initialize procs, pressure, and capacity
@sync for p in procs_to_use(ctx)
@async init_proc(state, p)
@async begin
try
init_proc(state, p)
catch err
@error "Error initializing worker $p" exception=(err,catch_backtrace())
remove_dead_proc!(ctx, state, p)
end
end
end

# setup dynamic listeners
@@ -357,6 +364,7 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())

# Loop while we still have thunks to execute
while !isempty(state.ready) || !isempty(state.running)
procs_state = assign_new_procs!(ctx, state, procs_state)
if !isempty(state.ready)
# Nothing running, so schedule up to N thunks, 1 per N workers
schedule!(ctx, state, procs_state)
@@ -407,14 +415,14 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
end
continue
else
if ctx.options.allow_errors || state.thunk_dict[thunk_id].options.allow_errors
if ctx.options.allow_errors || unwrap_weak_checked(state.thunk_dict[thunk_id]).options.allow_errors
thunk_failed = true
else
throw(res)
end
end
end
node = state.thunk_dict[thunk_id]
node = unwrap_weak_checked(state.thunk_dict[thunk_id])
if metadata !== nothing
state.worker_pressure[pid][typeof(proc)] = metadata.pressure
state.worker_loadavg[pid] = metadata.loadavg
@@ -572,7 +580,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx))

# Calculate scope
scope = AnyScope()
for input in unwrap_weak.(task.inputs)
for input in unwrap_weak_checked.(task.inputs)
chunk = if istask(input)
state.cache[input]
elseif input isa Chunk
@@ -741,69 +749,23 @@ function finish_task!(ctx, state, node, thunk_failed)
if node.cache
node.cache_ref = state.cache[node]
end
for dep in sort!(collect(get(()->Set{Thunk}(), state.waiting_data, node)), by=state.node_order)
dep_isready = false
if haskey(state.waiting, dep)
set = state.waiting[dep]
node in set && pop!(set, node)
dep_isready = isempty(set)
if dep_isready
delete!(state.waiting, dep)
end
else
dep_isready = true
end
if dep_isready
if !thunk_failed
push!(state.ready, dep)
end
end
end
if haskey(state.futures, node)
# Notify any listening thunks
for future in state.futures[node]
put!(future, state.cache[node]; error=thunk_failed)
end
delete!(state.futures, node)
end
schedule_dependents!(state, node, thunk_failed)
fill_registered_futures!(state, node, thunk_failed)

# Chunk clean-up
to_evict = Set{Chunk}()
for inp in filter(t->istask(t) || (t isa Chunk), unwrap_weak.(node.inputs))
if inp in keys(state.waiting_data)
w = state.waiting_data[inp]
if node in w
pop!(w, node)
end
if isempty(w)
delete!(state.waiting_data, inp)
if istask(inp) && haskey(state.cache, inp)
_node = state.cache[inp]
if _node isa Chunk
push!(to_evict, _node)
end
GC.@preserve inp begin
pop!(state.cache, inp)
if haskey(state.errored, inp)
pop!(state.errored, inp)
end
end
elseif inp isa Chunk
push!(to_evict, inp)
end
end
end
end
to_evict = cleanup_inputs!(state, node)
if haskey(state.waiting_data, node) && isempty(state.waiting_data[node])
delete!(state.waiting_data, node)
end
evict_all_chunks!(ctx, to_evict)
end

function evict_all_chunks!(ctx, to_evict)
if !isempty(to_evict)
@sync for w in map(p->p.pid, procs_to_use(ctx))
@async remote_do(evict_chunks!, w, to_evict)
end
end
end

function evict_chunks!(chunks::Set{Chunk})
for chunk in chunks
haskey(CHUNK_CACHE, chunk) && delete!(CHUNK_CACHE, chunk)
@@ -852,19 +814,20 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
end

ids = map(enumerate(thunk.inputs)) do (idx,x)
istask(x) ? unwrap_weak(x).id : -idx
istask(x) ? unwrap_weak_checked(x).id : -idx
end

data = map(thunk.inputs) do x
istask(x) ? state.cache[unwrap_weak(x)] : x
istask(x) ? state.cache[unwrap_weak_checked(x)] : x
end
toptions = thunk.options !== nothing ? thunk.options : ThunkOptions()
options = merge(ctx.options, toptions)
@assert (options.single == 0) || (gproc.pid == options.single)
sch_handle = SchedulerHandle(ThunkID(thunk.id), state.worker_chans[gproc.pid]...)
# TODO: Set `sch_handle.tid.ref` to the right `DRef`
sch_handle = SchedulerHandle(ThunkID(thunk.id, nothing), state.worker_chans[gproc.pid]...)
state.worker_pressure[gproc.pid][typeof(proc)] += util

# FIXME: De-dup common fields (log_sink, uid, etc.)
# TODO: De-dup common fields (log_sink, uid, etc.)
push!(to_send, (util, thunk.id, thunk.f, data, thunk.get_result,
thunk.persist, thunk.cache, thunk.meta, options, ids,
(log_sink=ctx.log_sink, profile=ctx.profile),
@@ -944,7 +907,7 @@ function do_task(to_proc, extra_util, thunk_id, f, data, send_result, persist, c
unlock(TASK_SYNC)
else
# Under-subscribed, calculate extra utilization and execute thunk
@debug "($(myid())) ($thunk_id) Using available $to_proc: $extra_util | $(real_util[])/$cap"
@debug "($(myid())) $f ($thunk_id) Using available $to_proc: $extra_util | $(real_util[])/$cap"
extra_util = if extra_util isa MaxUtilization
count(c->typeof(c)===typeof(to_proc), children(from_proc))
else
@@ -982,7 +945,7 @@ function do_task(to_proc, extra_util, thunk_id, f, data, send_result, persist, c
lock(TASK_SYNC) do
real_util[] -= extra_util
end
@debug "($(myid())) ($thunk_id) Releasing $(typeof(to_proc)): $extra_util | $(real_util[])/$cap"
@debug "($(myid())) $f ($thunk_id) Releasing $(typeof(to_proc)): $extra_util | $(real_util[])/$cap"
lock(TASK_SYNC) do
notify(TASK_SYNC)
end
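The `finish_task!` rewrite above replaces two long inline loops with helper calls (`schedule_dependents!`, `fill_registered_futures!`, `cleanup_inputs!`). For orientation, a simplified, hypothetical sketch of the dependents bookkeeping that was previously inlined (`waiting_data[node]` holds `node`'s dependents, `waiting[dep]` holds what `dep` is still blocked on; names and types are illustrative, not the extracted functions themselves):

# Hypothetical simplification of the logic now in schedule_dependents!:
# when `node` finishes, any dependent whose wait-set empties becomes ready.
function schedule_dependents_sketch!(waiting, waiting_data, ready, node, failed::Bool)
    for dep in get(()->Set{Any}(), waiting_data, node)
        dep_ready = true
        if haskey(waiting, dep)
            set = waiting[dep]
            delete!(set, node)                # `node` no longer blocks `dep`
            dep_ready = isempty(set)
            dep_ready && delete!(waiting, dep)
        end
        # Failed thunks do not enqueue their dependents for execution.
        dep_ready && !failed && push!(ready, dep)
    end
end
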
101 changes: 68 additions & 33 deletions src/sch/dynamic.jl
@@ -1,10 +1,12 @@
export SchedulerHaltedException
export sch_handle, halt!, exec!, get_dag_ids, add_thunk!

"Identifies a thunk by its ID."
"Identifies a thunk by its ID, and preserves the thunk in the scheduler."
struct ThunkID
id::Int
ref::Union{DRef,Nothing}
end
ThunkID(id::Int) = ThunkID(id, nothing)

"A handle to the scheduler, used by dynamic thunks."
struct SchedulerHandle
@@ -52,7 +54,13 @@ function dynamic_listener!(ctx, state)
if !(unwrap_nested_exception(err) isa Union{SchedulerHaltedException,
ProcessExitedException,
InvalidStateException})
@error exception=(err,catch_backtrace())
iob = IOContext(IOBuffer(), :color=>true)
println(iob, "Error in sending dynamic request:")
Base.showerror(iob, err)
Base.show_backtrace(iob, catch_backtrace())
println(iob)
seek(iob.io, 0)
write(stderr, iob)
end
break
end
@@ -69,7 +77,13 @@ function dynamic_listener!(ctx, state)
if !(unwrap_nested_exception(err) isa Union{SchedulerHaltedException,
ProcessExitedException,
InvalidStateException})
@error exception=(err,catch_backtrace())
iob = IOContext(IOBuffer(), :color=>true)
println(iob, "Error in sending dynamic result:")
Base.showerror(iob, err)
Base.show_backtrace(iob, catch_backtrace())
println(iob)
seek(iob.io, 0)
write(stderr, iob)
end
end
end
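
Both listener loops above replace a bare `@error` with buffered, colorized reporting: the error and backtrace are rendered into an in-memory buffer first, then written to `stderr` in a single call so concurrent tasks cannot interleave their output. A standalone sketch of the same pattern (hypothetical function name):

# Render the whole report (with color) in memory, then emit it atomically.
function report_error(prefix::String, err, bt)
    iob = IOContext(IOBuffer(), :color=>true)
    println(iob, prefix)
    Base.showerror(iob, err)
    Base.show_backtrace(iob, bt)
    println(iob)
    seek(iob.io, 0)
    write(stderr, iob)
end

try
    error("boom")
catch err
    report_error("Error in listener loop:", err, catch_backtrace())
end
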
@@ -110,25 +124,34 @@ end
"Waits on a thunk to complete, and fetches its result."
function Base.fetch(h::SchedulerHandle, id::ThunkID)
future = ThunkFuture(Future(1))
exec!(_register_future!, h, future, id.id)
exec!(_register_future!, h, future, id)
fetch(future; proc=thunk_processor())
end
"Waits on a thunk to complete, and fetches its result."
function register_future!(h::SchedulerHandle, id::ThunkID, future::ThunkFuture)
exec!(_register_future!, h, future, id.id)
end
function _register_future!(ctx, state, task, tid, (future, id))
tid != id || throw(DynamicThunkException("Cannot fetch own result"))
thunk = state.thunk_dict[id]
ownthunk = state.thunk_dict[tid]
dominates(target, t) = (t == target) || any(_t->dominates(target, _t), filter(istask, unwrap_weak.(t.inputs)))
!dominates(ownthunk, thunk) || throw(DynamicThunkException("Cannot fetch result of dominated thunk"))
# TODO: Assert that future will be fulfilled
if haskey(state.cache, thunk)
put!(future, state.cache[thunk]; error=state.errored[thunk])
else
futures = get!(()->ThunkFuture[], state.futures, thunk)
push!(futures, future)
register_future!(h::SchedulerHandle, id::ThunkID, future::ThunkFuture) =
exec!(_register_future!, h, future, id)
function _register_future!(ctx, state, task, tid, (future, id)::Tuple{ThunkFuture,ThunkID})
tid != id.id || throw(DynamicThunkException("Cannot fetch own result"))
GC.@preserve id begin
thunk = unwrap_weak_checked(state.thunk_dict[id.id])
ownthunk = unwrap_weak_checked(state.thunk_dict[tid])
function dominates(target, t)
t == target && return true
# N.B. Skips expired tasks
task_inputs = filter(istask, Dagger.unwrap_weak.(t.inputs))
if any(_t->dominates(target, _t), task_inputs)
return true
end
return false
end
!dominates(ownthunk, thunk) || throw(DynamicThunkException("Cannot fetch result of dominated thunk"))
# TODO: Assert that future will be fulfilled
if haskey(state.cache, thunk)
put!(future, state.cache[thunk]; error=state.errored[thunk])
else
futures = get!(()->ThunkFuture[], state.futures, thunk)
push!(futures, future)
end
end
nothing
end
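
The reworked `dominates` check above bans fetching the result of a thunk that (transitively) depends on the fetching thunk itself, which would deadlock; expired `WeakThunk` inputs are simply skipped. A hypothetical standalone sketch of the same reachability walk (NamedTuples stand in for `Thunk`s):

# Sketch: `target` dominates `t` if it is `t` or reachable via `t`'s task
# inputs, i.e. `t` cannot finish until `target` has, so fetching `t`'s
# result from inside `target` would deadlock.
istask_sketch(x) = x isa NamedTuple        # stand-in for Dagger.istask
function dominates_sketch(target, t)
    t === target && return true
    task_inputs = filter(istask_sketch, t.inputs)
    return any(x->dominates_sketch(target, x), task_inputs)
end

a = (id=1, inputs=Any[])
b = (id=2, inputs=Any[a])                  # `b` consumes `a`'s result
@assert dominates_sketch(a, b)
@assert !dominates_sketch(b, a)
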
@@ -146,27 +169,39 @@ get_dag_ids(h::SchedulerHandle) =
function _get_dag_ids(ctx, state, task, tid, _)
deps = Dict{ThunkID,Set{ThunkID}}()
for (id,thunk) in state.thunk_dict
thunk = unwrap_weak_checked(thunk)
# TODO: Get at `thunk_ref` for `thunk_id.ref`
thunk_id = ThunkID(id, nothing)
if haskey(state.waiting_data, thunk)
deps[ThunkID(id)] = Set(map(t->ThunkID(t.id), collect(state.waiting_data[thunk])))
deps[thunk_id] = Set(map(t->ThunkID(t.id, nothing), collect(state.waiting_data[thunk])))
else
deps[ThunkID(id)] = Set{ThunkID}()
deps[thunk_id] = Set{ThunkID}()
end
end
deps
end

"Adds a new Thunk to the DAG."
add_thunk!(f, h::SchedulerHandle, args...; future=nothing, kwargs...) =
ThunkID(exec!(_add_thunk!, h, f, args, kwargs, future))
function _add_thunk!(ctx, state, task, tid, (f, args, kwargs, future))
_args = map(arg->arg isa ThunkID ? Dagger.WeakThunk(state.thunk_dict[arg.id]) : arg, args)
thunk = Thunk(f, _args...; kwargs...)
state.thunk_dict[thunk.id] = thunk
@assert reschedule_inputs!(state, thunk)
if future !== nothing
# Ensure we attach a future before the thunk is scheduled
_register_future!(ctx, state, task, tid, (future, thunk.id))
add_thunk!(f, h::SchedulerHandle, args...; future=nothing, ref=nothing, kwargs...) =
exec!(_add_thunk!, h, f, args, kwargs, future, ref)
function _add_thunk!(ctx, state, task, tid, (f, args, kwargs, future, ref))
_args = map(arg->arg isa ThunkID ? state.thunk_dict[arg.id] : arg, args)
GC.@preserve _args begin
thunk = Thunk(f, _args...; kwargs...)
# Create a `DRef` to `thunk` so that the caller can preserve it
thunk_ref = poolset(thunk)
thunk_id = ThunkID(thunk.id, thunk_ref)
state.thunk_dict[thunk.id] = WeakThunk(thunk)
reschedule_inputs!(state, thunk)
if future !== nothing
# Ensure we attach a future before the thunk is scheduled
_register_future!(ctx, state, task, tid, (future, thunk_id))
end
if ref !== nothing
# Preserve the `EagerThunkFinalizer` through `thunk`
thunk.eager_ref = ref
end
put!(state.chan, RescheduleSignal())
return thunk_id
end
put!(state.chan, RescheduleSignal())
return thunk.id::Int
end
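
The net effect of the `dynamic.jl` changes: `add_thunk!` now returns a `ThunkID` whose `ref` field is a `DRef` (created via `MemPool.poolset`) pointing at the new `Thunk`, so the caller's ID itself preserves the thunk while the scheduler holds only a `WeakThunk`. A minimal sketch of how a `DRef` preserves a value (assuming MemPool is available; `poolset`/`poolget` are its API, the payload type is illustrative):

using MemPool: poolset, poolget

mutable struct Payload
    id::Int
end

p = Payload(42)
ref = poolset(p)        # DRef: a strong, pool-managed reference to `p`
p = nothing             # drop our own reference
# Any holder of `ref` keeps the value reachable and can recover it:
@assert poolget(ref).id == 42
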
8 changes: 4 additions & 4 deletions src/sch/eager.jl
@@ -74,14 +74,14 @@ function eager_thunk()
adjust_pressure!(h, Dagger.ThreadProc, -util)
while isopen(EAGER_THUNK_CHAN)
try
ev, future, uid, f, args, opts = take!(EAGER_THUNK_CHAN)
added_future, future, uid, ref, f, args, opts = take!(EAGER_THUNK_CHAN)
# preserve inputs until they enter the scheduler
tid = GC.@preserve args begin
args = map(x->x isa Dagger.EagerThunk ? ThunkID(EAGER_ID_MAP[x.uid]) : x, args)
add_thunk!(f, h, args...; future=future, opts...)
_args = map(x->x isa Dagger.EagerThunk ? ThunkID(EAGER_ID_MAP[x.uid], x.thunk_ref) : x, args)
add_thunk!(f, h, _args...; future=future, ref=ref, opts...)
end
EAGER_ID_MAP[uid] = tid.id
notify(ev)
put!(added_future, tid.ref)
catch err
iob = IOContext(IOBuffer(), :color=>true)
println(iob, "Error in eager listener:")
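In `eager.jl`, the submission handshake changes shape: instead of `notify`ing an event after registering the thunk, the listener now `put!`s the preserving `tid.ref` into a future supplied by the submitter, who retains it alongside the `EagerThunk`. A stripped-down sketch of that handshake (hypothetical names, plain `Distributed.Future` and `Channel`):

using Distributed

const SUBMIT_CHAN = Channel{Tuple{Future,Function,Tuple}}(32)

# Listener: construct the work item, then hand its preserving reference
# back through the future (previously this was just `notify(ev)`).
listener = @async for (added_future, f, args) in SUBMIT_CHAN
    ref = Ref((f, args))          # stand-in for poolset(thunk)
    put!(added_future, ref)       # the submitter now owns the strong reference
end

# Submitter: send work, then block until the reference arrives.
added_future = Future()
put!(SUBMIT_CHAN, (added_future, +, (1, 2)))
thunk_ref = fetch(added_future)   # kept alive for the life of the EagerThunk
close(SUBMIT_CHAN)
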
(Diffs for the remaining 3 changed files are not shown.)

2 comments on commit 5a6c939

@jpsamaroo (Member Author)


@JuliaRegistrator register()

@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/42841

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.12.4 -m "<description of version>" 5a6c9398b3a19c10d156ec4f0715548e7d778cd7
git push origin v0.12.4
