Skip to content

Commit

Permalink
Merge pull request #278 from JuliaParallel/jps/logging-overhaul
Browse files Browse the repository at this point in the history
Logging overhaul and new web dashboard
  • Loading branch information
jpsamaroo authored Sep 11, 2021
2 parents 6a1aa1f + 11fb9f6 commit f897e6a
Show file tree
Hide file tree
Showing 25 changed files with 1,940 additions and 183 deletions.
4 changes: 2 additions & 2 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
env:
JULIA_NUM_THREADS: 1
steps:
- label: Julia 1.x
- label: Julia 1.6
command: julia --project -e 'using Pkg; Pkg.build(); Pkg.test()'
timeout_in_minutes: 60
<<: *test
plugins:
- JuliaCI/julia#v1:
version: "1.5"
version: "1.6"
- JuliaCI/julia-test#v1:
# - JuliaCI/julia-coverage#v1:
# codecov: true
Expand Down
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "Dagger"
uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
version = "0.13.1"
version = "0.13.2"

[deps]
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Expand All @@ -24,9 +24,9 @@ Colors = "0.10, 0.11, 0.12"
MemPool = "0.3.4"
Requires = "1"
StatsBase = "0.28, 0.29, 0.30, 0.31, 0.32, 0.33"
Tables = "1.1"
TableOperations = "1"
julia = "1.3"
Tables = "1.1"
julia = "1.6"

[extras]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Expand Down
1 change: 0 additions & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
environment:
matrix:
- julia_version: 1.3
- julia_version: 1.6
- julia_version: nightly

Expand Down
54 changes: 46 additions & 8 deletions benchmarks/benchmark.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ if render == "live"
const live = true
using Luxor, ProfileSVG
using Mux
elseif render == "webdash"
const live = true
using DaggerWebDash
import DaggerWebDash: LinePlot, GanttPlot, GraphPlot, ProfileViewer
elseif render == "offline"
const live = false
using Luxor, ProfileSVG
Expand All @@ -43,6 +47,9 @@ const RENDERS = Dict{Int,Dict}()
const live_port = parse(Int, get(ENV, "BENCHMARK_LIVE_PORT", "8000"))

const graph = parse(Bool, get(ENV, "BENCHMARK_GRAPH", "0"))
# Only warn when graphing was actually requested alongside the webdash
# renderer; previously this fired on every webdash run, even with
# BENCHMARK_GRAPH unset. In the code visible here, `graph` is only consulted by
# the `render == "live"` path, so no further action is needed to "disable" it.
if graph && render == "webdash"
    @warn "BENCHMARK_GRAPH=1 is not compatible with BENCHMARK_RENDER=webdash; disabling graphing"
end
const profile = parse(Bool, get(ENV, "BENCHMARK_PROFILE", "0"))

_benches = get(ENV, "BENCHMARK", "cpu,cpu+dagger")
Expand Down Expand Up @@ -247,20 +254,46 @@ function main()
output_prefix = "result-$(np)workers-$(nt)threads-$(Dates.now())"

suites = Dict()
graph_opts = if graph && render != ""
(log_sink=Dagger.LocalEventLog(), log_file=output_prefix*".dot")
elseif render != ""
(log_sink=Dagger.LocalEventLog(),)
else
NamedTuple()
opts = (;profile=profile)
if render == "live"
opts = merge(opts, (;log_sink=Dagger.LocalEventLog()))
if graph
opts = merge(opts, (;log_file=output_prefix*".dot"))
end
elseif render == "webdash"
ml = Dagger.MultiEventLog()
ml[:core] = Dagger.Events.CoreMetrics()
ml[:id] = Dagger.Events.IDMetrics()
ml[:timeline] = Dagger.Events.TimelineMetrics()
profile && (ml[:profile] = DaggerWebDash.ProfileMetrics())
ml[:wsat] = Dagger.Events.WorkerSaturation()
ml[:loadavg] = Dagger.Events.CPULoadAverages()
ml[:bytes] = Dagger.Events.BytesAllocd()
ml[:mem] = Dagger.Events.MemoryFree()
ml[:esat] = Dagger.Events.EventSaturation()
ml[:psat] = Dagger.Events.ProcessorSaturation()
lw = Dagger.Events.LogWindow(20*10^9, :core)
d3r = DaggerWebDash.D3Renderer(live_port)
push!(d3r, GanttPlot(:core, :id, :timeline, :esat, :psat, "Overview"))
# TODO: push!(d3r, ProfileViewer(:core, :profile, "Profile Viewer"))
push!(d3r, LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks"))
push!(d3r, LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads"))
push!(d3r, LinePlot(:core, :bytes, "Allocated Bytes", "Bytes"))
push!(d3r, LinePlot(:core, :mem, "Available Memory", "% Free"))
push!(d3r, GraphPlot(:core, :id, :timeline, :profile, "DAG"))
push!(lw.creation_handlers, d3r)
push!(lw.deletion_handlers, d3r)
ml.aggregators[:logwindow] = lw
ml.aggregators[:d3r] = d3r
opts = merge(opts, (;log_sink=ml))
end
ctx = Context(collect((1:nw) .+ 1); profile=profile, graph_opts...)
ctx = Context(collect((1:nw) .+ 1); opts...)
for bench in benches
name = bench.name
println("creating $name benchmarks")
suites[name] = nmf_suite(ctx; dagger=bench.dagger, accel=bench.accel)
end
if render != ""
if render == "live" || render == "offline"
Dagger.show_gantt(ctx; width=1800, window_length=5, delay=2, port=live_port, live=live)
if live
# Make sure server code is compiled
Expand All @@ -269,6 +302,11 @@ function main()
run(pipeline(`curl -s localhost:$live_port/profile`; stdout=devnull))
@info "Rendering started on port $live_port"
end
elseif render == "webdash"
# Kick the webserver into gear
collect(ctx, delayed(identity)(1))
run(pipeline(`curl -s localhost:$live_port/index.html`; stdout=devnull))
@info "Rendering started on port $live_port"
end
res = Dict()
for bench in benches
Expand Down
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ makedocs(;
"Scopes" => "scopes.md",
"Dynamic Scheduler Control" => "dynamic.md",
"Logging and Graphing" => "logging.md",
"Scheduler Visualization" => "scheduler-visualization.md",
"Benchmarking" => "benchmarking.md",
"Scheduler Internals" => "scheduler-internals.md",
"Distributed Table" => "dtable.md",
Expand Down
107 changes: 100 additions & 7 deletions docs/src/logging.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,99 @@

Dagger's scheduler keeps track of the important and potentially expensive
actions it does, such as moving data between workers or executing thunks, and
tracks how much time and memory allocations these operations consume. Saving
this information somewhere accessible is disabled by default, but it's quite
easy to turn it on:
tracks how much time and memory allocations these operations consume, among
other things. Saving this information somewhere accessible is disabled by
default, but it's quite easy to turn it on, by setting a "log sink" in the
`Context` being used, as `ctx.log_sink`. A variety of log sinks are built into
Dagger; the `NoOpLog` is the default log sink when one isn't explicitly
specified, and disables logging entirely (to minimize overhead). There are
currently two other log sinks of interest; the first and newer of the two is
the `Dagger.MultiEventLog`, which generates multiple independent log streams,
one per "consumer" (details in the next section). The second and older sink is
the `Dagger.LocalEventLog`, which is explained later in this document. We
recommend that most users use the `MultiEventLog`, since it is far more
flexible and extensible, and is generally more performant.

## MultiEventLog

The `MultiEventLog` is intended to be configurable to exclude unnecessary
information, and to include any built-in or user-defined metrics. It stores a
set of "sub-log" streams internally, appending a single element to each of them
when an event is generated. This element can be called a "sub-event" (to
distinguish it from the higher-level "event" that Dagger creates), and is
created by a "consumer". A consumer is a function or callable struct that, when
called with the `Dagger.Event` object generated by Dagger, returns a sub-event
characterizing whatever information the consumer represents. For example, the
`Dagger.Events.BytesAllocd` consumer calculates the total bytes allocated and
live at any given time within Dagger, and returns the current value when
called. Let's construct one:

```julia
ctx = Context()
ml = Dagger.MultiEventLog()

# Add the BytesAllocd consumer to the log as `:bytes`
ml[:bytes] = Dagger.Events.BytesAllocd()

ctx.log_sink = ml
```

As we can see above, each consumer is identified by a unique name, given as a
`Symbol`. Now that a consumer has been attached to the log sink, we can
execute some Dagger tasks, and then collect the sub-events generated by
`BytesAllocd`:

```julia
# Using the lazy API, for explanatory purposes
collect(ctx, delayed(+)(1, delayed(*)(3, 4))) # Allocates 8 bytes
log = Dagger.get_logs!(ml)[1] # Get the logs for worker 1
@show log[:bytes]
```

You'll then see that 8 bytes are allocated and then freed during the process of
executing and completing those tasks.

Note that the `MultiEventLog` can also be used perfectly well when using
Dagger's eager API:

```julia
ctx = Dagger.Sch.eager_context()
ctx.log_sink = ml

a = Dagger.@spawn 3*4
Dagger.@spawn 1+a
```

There are a variety of other consumers built into Dagger, under the
`Dagger.Events` module:

```@docs
Dagger.Events.CoreMetrics
Dagger.Events.IDMetrics
Dagger.Events.TimelineMetrics
Dagger.Events.FullMetrics
Dagger.Events.BytesAllocd
Dagger.Events.CPULoadAverages
Dagger.Events.MemoryFree
Dagger.Events.EventSaturation
Dagger.Events.WorkerSaturation
Dagger.Events.ProcessorSaturation
```

The `MultiEventLog` also has a mechanism to call a set of functions, called
"aggregators", after all consumers have been executed; each aggregator is
passed the full set of log streams as a `Dict{Symbol,Vector{Any}}`. The only
one currently shipped with Dagger directly is the `LogWindow`:

```@docs
Dagger.Events.LogWindow
```

## LocalEventLog

The `LocalEventLog` is generally only useful when you want combined events
(event start and finish combined as a single unit), and only care about a few
simple built-in generated events. Let's attach one to our context:

```julia
ctx = Context()
Expand All @@ -13,10 +103,7 @@ ctx.log_sink = log
```

Now anytime `ctx` is used as the context for a scheduler, the scheduler will
log events into `log`. A `LocalEventLog` logs information in-memory, and does
so on each worker. The default log object is a `NoOpLog`, which doesn't store
events at all. The `FilterLog` exists to allow writing events to a
user-defined location (such as a database, file, or network socket).
log events into `log`.

Once sufficient data has been accumulated into a `LocalEventLog`, it can be
gathered to a single host via `Dagger.get_logs!(log)`. The result is a
Expand Down Expand Up @@ -48,3 +135,9 @@ which aren't `Thunk`s (such as operations on the `Dagger.DArray`) will be
properly rendered with input arguments (which normally aren't rendered because
a `Thunk` is dynamically generated from such operations by Dagger before
scheduling).

## FilterLog

The `FilterLog` exists to allow writing events to a user-defined location (such
as a database, file, or network socket). It is not currently tested or
documented.
78 changes: 78 additions & 0 deletions docs/src/scheduler-visualization.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Scheduler Visualization with DaggerWebDash

When working with Dagger, especially when working with its scheduler, it can be
helpful to visualize what Dagger is doing internally. To assist with this, a
web dashboard is available in the DaggerWebDash.jl package. This web dashboard
uses a web server running within each Dagger worker, along with event logging
information, to expose details about the scheduler. Information like worker and
processor saturation, memory allocations, profiling traces, and much more is
available in easy-to-interpret plots.

Using the dashboard is relatively simple and straightforward; if you run
Dagger's benchmarking script, it's enabled for you automatically if the
`BENCHMARK_RENDER` environment variable is set to `webdash`. This is the
easiest way to get started with the web dashboard for new users.

For manual usage, the following snippet of code will suffice:

```julia
ctx = Context() # or `ctx = Dagger.Sch.eager_context()` for eager API usage
ml = Dagger.MultiEventLog()

## Add some logging events of interest

ml[:core] = Dagger.Events.CoreMetrics()
ml[:id] = Dagger.Events.IDMetrics()
ml[:timeline] = Dagger.Events.TimelineMetrics()
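# ... likewise register the consumers referenced by the plots below
# (`:esat`, `:psat`, `:wsat`, `:loadavg`, `:bytes`, `:mem`)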

# (Optional) Enable profile flamegraph generation with ProfileSVG
ml[:profile] = DaggerWebDash.ProfileMetrics()
ctx.profile = true

# Create a LogWindow; necessary for real-time event updates
lw = Dagger.Events.LogWindow(20*10^9, :core)
ml.aggregators[:logwindow] = lw

# Create the D3Renderer server on port 8080
d3r = DaggerWebDash.D3Renderer(8080)

## Add some plots! Rendered top-down in order

# Show an overview of all generated events as a Gantt chart
push!(d3r, GanttPlot(:core, :id, :timeline, :esat, :psat, "Overview"))

# Show various numerical events as line plots over time
push!(d3r, LinePlot(:core, :wsat, "Worker Saturation", "Running Tasks"))
push!(d3r, LinePlot(:core, :loadavg, "CPU Load Average", "Average Running Threads"))
push!(d3r, LinePlot(:core, :bytes, "Allocated Bytes", "Bytes"))
push!(d3r, LinePlot(:core, :mem, "Available Memory", "% Free"))

# Show a graph rendering of compute tasks and data movement between them
# Note: Profile events are ignored if absent from the log
push!(d3r, GraphPlot(:core, :id, :timeline, :profile, "DAG"))

# TODO: Not yet functional
#push!(d3r, ProfileViewer(:core, :profile, "Profile Viewer"))

# Add the D3Renderer as a consumer of special events generated by LogWindow
push!(lw.creation_handlers, d3r)
push!(lw.deletion_handlers, d3r)

# D3Renderer is also an aggregator
ml.aggregators[:d3r] = d3r

ctx.log_sink = ml
# ... use `ctx`
```

Once the server has started, you can browse to `http://localhost:8080/` (if
running on your local machine) to view the plots in real time. The dashboard
also provides options at the top of the page to control the drawing speed, to
enable or disable reading updates from the server (disabling freezes the
display at the current instant), and to select which worker to look at. If
the connection to the server is lost for any reason, the dashboard will attempt
to reconnect at 5-second intervals. The dashboard can usually survive restarts
of the server perfectly well, although refreshing the page is usually a good
idea. Informational messages are also logged to the browser console for
debugging.
23 changes: 23 additions & 0 deletions lib/DaggerWebDash/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name = "DaggerWebDash"
uuid = "cfc5aa84-1a2a-41ab-b391-ede92ecae40c"
authors = ["Julian P Samaroo <[email protected]>"]
version = "0.1.0"

[deps]
Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
Mux = "a975b10e-0019-58db-a62f-e48ff68538c9"
ProfileSVG = "132c30aa-f267-4189-9183-c8a63c7e05e6"
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"

[compat]
Dagger = "0.13.2"
JSON3 = "1"
MemPool = "0.3"
Mux = "0.7"
ProfileSVG = "0.2"
StructTypes = "1"
julia = "1.6"
9 changes: 9 additions & 0 deletions lib/DaggerWebDash/src/DaggerWebDash.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module DaggerWebDash

using Dagger

include("core.jl")
include("profile.jl")
include("d3.jl")

end
Loading

4 comments on commit f897e6a

@jpsamaroo
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/44717

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.13.2 -m "<description of version>" f897e6aba62ff170207d9d869598475a76e3a798
git push origin v0.13.2

@jpsamaroo
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register subdir=lib/DaggerWebDash

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/44727

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a DaggerWebDash-v0.1.0 -m "<description of version>" f897e6aba62ff170207d9d869598475a76e3a798
git push origin DaggerWebDash-v0.1.0

Please sign in to comment.