Merge pull request #454 from JuliaParallel/jps/datadeps

Add spawn_datadeps for OMP-like task model
JuliaParallel · Jan 29, 2024 · 02d727d · 02d727d
2 parents 276e4f0 + 66e8c3b
commit 02d727d
Show file tree

Hide file tree

Showing 13 changed files with 952 additions and 12 deletions.
diff --git a/Manifest.toml b/Manifest.toml
@@ -2,7 +2,13 @@
 
 julia_version = "1.8.5"
 manifest_format = "2.0"
-project_hash = "8da7911e4788068aaea8c0ef8589d674bce0fb39"
+project_hash = "63ad89f514e49fbb0061c336a95c9098f89440c9"
+
+[[deps.ArnoldiMethod]]
+deps = ["LinearAlgebra", "Random", "StaticArrays"]
+git-tree-sha1 = "62e51b39331de8911e4a7ff6f5aaf38a5f4cc0ae"
+uuid = "ec485272-7323-5ecc-a04f-4719b315124d"
+version = "0.2.0"
 
 [[deps.Artifacts]]
 uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
@@ -12,9 +18,9 @@ uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
-git-tree-sha1 = "2118cb2765f8197b08e5958cdd17c165427425ee"
+git-tree-sha1 = "0d12ee16b3f62e4e33c3277773730a5b21a74152"
 uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-version = "1.19.0"
+version = "1.20.0"
 
 [[deps.ChangesOfVariables]]
 deps = ["InverseFunctions", "LinearAlgebra", "Test"]
@@ -23,26 +29,26 @@ uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
 version = "0.1.8"
 
 [[deps.Compat]]
-deps = ["Dates", "LinearAlgebra", "UUIDs"]
-git-tree-sha1 = "886826d76ea9e72b35fcd000e535588f7b60f21d"
+deps = ["Dates", "LinearAlgebra", "TOML", "UUIDs"]
+git-tree-sha1 = "75bd5b6fc5089df449b5d35fa501c846c9b6549b"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.10.1"
+version = "4.12.0"
 
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
 version = "1.0.1+0"
 
 [[deps.DataAPI]]
-git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c"
+git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe"
 uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
-version = "1.15.0"
+version = "1.16.0"
 
 [[deps.DataStructures]]
 deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "3dbd312d370723b6bb43ba9d02fc36abade4518d"
+git-tree-sha1 = "ac67408d9ddf207de5cfa9a97e114352430f01ed"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.18.15"
+version = "0.18.16"
 
 [[deps.Dates]]
 deps = ["Printf"]
@@ -58,11 +64,22 @@ git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
 uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 version = "0.9.3"
 
+[[deps.Graphs]]
+deps = ["ArnoldiMethod", "Compat", "DataStructures", "Distributed", "Inflate", "LinearAlgebra", "Random", "SharedArrays", "SimpleTraits", "SparseArrays", "Statistics"]
+git-tree-sha1 = "899050ace26649433ef1af25bc17a815b3db52b7"
+uuid = "86223c79-3864-5bf0-83f7-82e725a168b6"
+version = "1.9.0"
+
 [[deps.HashArrayMappedTries]]
 git-tree-sha1 = "2eaa69a7cab70a52b9687c8bf950a5a93ec895ae"
 uuid = "076d061b-32b6-4027-95e0-9a2c6f6d7e74"
 version = "0.2.0"
 
+[[deps.Inflate]]
+git-tree-sha1 = "ea8031dea4aff6bd41f1df8f2fdfb25b33626381"
+uuid = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
+version = "0.1.4"
+
 [[deps.InteractiveUtils]]
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
@@ -100,9 +117,9 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
 [[deps.MacroTools]]
 deps = ["Markdown", "Random"]
-git-tree-sha1 = "b211c553c199c111d998ecdaf7623d1b89b69f93"
+git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df"
 uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-version = "0.5.12"
+version = "0.5.13"
 
 [[deps.Markdown]]
 deps = ["Base64"]
@@ -184,6 +201,12 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 deps = ["Distributed", "Mmap", "Random", "Serialization"]
 uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
 
+[[deps.SimpleTraits]]
+deps = ["InteractiveUtils", "MacroTools"]
+git-tree-sha1 = "5d7e3f4e11935503d3ecaf7186eac40602e7d231"
+uuid = "699a6c99-e7fa-54fc-8d76-47d257e15c1d"
+version = "0.9.4"
+
 [[deps.Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
@@ -197,6 +220,17 @@ version = "1.2.1"
 deps = ["LinearAlgebra", "Random"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
+[[deps.StaticArrays]]
+deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore", "Statistics"]
+git-tree-sha1 = "f68dd04d131d9a8a8eb836173ee8f105c360b0c5"
+uuid = "90137ffa-7385-5640-81b9-e52037218182"
+version = "1.9.1"
+
+[[deps.StaticArraysCore]]
+git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
+uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
+version = "1.4.2"
+
 [[deps.Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

diff --git a/Project.toml b/Project.toml
@@ -5,6 +5,7 @@ version = "0.18.6"
 [deps]
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
@@ -23,6 +24,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [compat]
 DataStructures = "0.18"
+Graphs = "1"
 MacroTools = "0.5"
 MemPool = "0.4.6"
 PrecompileTools = "1.2"

diff --git a/docs/make.jl b/docs/make.jl
@@ -22,6 +22,7 @@ makedocs(;
         "Scopes" => "scopes.md",
         "Processors" => "processors.md",
         "Task Queues" => "task-queues.md",
+        "Datadeps" => "datadeps.md",
         "Option Propagation" => "propagation.md",
         "Logging and Graphing" => "logging.md",
         "Checkpointing" => "checkpointing.md",

diff --git a/docs/src/datadeps.md b/docs/src/datadeps.md
@@ -0,0 +1,96 @@
+# Datadeps (Data Dependencies)
+
+For many programs, the restriction that tasks cannot write to their arguments
+feels overly restrictive and makes certain kinds of programs (such as in-place
+linear algebra) hard to express efficiently in Dagger. Thankfully, there is a
+solution: `spawn_datadeps`. This function constructs a "datadeps region",
+within which tasks are allowed to write to their arguments, with parallelism
+controlled via dependencies specified via argument annotations. Let's look at
+a simple example to make things concrete:
+
+```julia
+A = rand(1000)
+B = rand(1000)
+C = zeros(1000)
+add!(X, Y) = X .+= Y
+Dagger.spawn_datadeps() do
+    Dagger.@spawn add!(InOut(B), In(A))
+    Dagger.@spawn copyto!(Out(C), In(B))
+end
+```
+
+In this example, we have two Dagger tasks being launched, one adding `A` into
+`B`, and the other copying `B` into `C`. The `add!` task is specifying that
+`A` is only being read from (`In` for "input"), and that `B` is being read
+from and written to (`InOut` for "input and output"). The `copyto!` task,
+similarly, is specifying that `B` is being read from, and `C` is only being
+written to (`Out` for "output").
+
+Without `spawn_datadeps` and `In`, `Out`, and `InOut`, the result of these
+tasks would be undefined; the two tasks could execute in parallel, or the
+`copyto!` could occur before the `add!`, resulting in all kinds of mayhem.
+However, `spawn_datadeps` changes things: because we have told Dagger how our
+tasks access their arguments, Dagger knows to control the parallelism and
+ordering, and ensure that `add!` executes and finishes before `copyto!`
+begins, ensuring that `copyto!` "sees" the changes to `B` before executing.
+
+There is another important aspect of `spawn_datadeps` that makes the above
+code work: if all of the `Dagger.@spawn` macros are removed, along with the
+dependency specifiers, the program would still produce the same results,
+without using Dagger. In other words, the parallel (Dagger) version of the
+program produces identical results to the serial (non-Dagger) version of the
+program. This is similar to using Dagger with purely functional tasks and
+without `spawn_datadeps` - removing `Dagger.@spawn` will still result in a
+correct (sequential and possibly slower) version of the program. Basically,
+`spawn_datadeps` will ensure that Dagger respects the ordering and
+dependencies of a program, while still providing parallelism, where possible.
+
+But where is the parallelism? The above example doesn't actually have any
+parallelism to exploit! Let's take a look at another example to see the
+datadeps model truly shine:
+
+```julia
+# Tree reduction of multiple arrays into the first array
+function tree_reduce!(op::Base.Callable, As::Vector{<:Array})
+    Dagger.spawn_datadeps() do
+        # Stack of sub-vectors still waiting to be reduced
+        to_reduce = Vector[]
+        push!(to_reduce, As)
+        while !isempty(to_reduce)
+            # Use a loop-local name: assigning to `As` here would rebind the
+            # captured function argument, which `return As[1]` reads below
+            group = pop!(to_reduce)
+            n = length(group)
+            if n == 2
+                # Accumulate the second array into the first, in place
+                Dagger.@spawn Base.mapreducedim!(identity, op, InOut(group[1]), In(group[2]))
+            elseif n > 2
+                # The stack is LIFO, so the two halves are popped (and their
+                # tasks spawned) before the pair that combines their results
+                push!(to_reduce, [group[1], group[div(n,2)+1]])
+                push!(to_reduce, group[1:div(n,2)])
+                push!(to_reduce, group[div(n,2)+1:end])
+            end
+        end
+    end
+    # `As` is still the caller's vector; its first array holds the result
+    return As[1]
+end
+
+As = [rand(1000) for _ in 1:1000]
+Bs = copy.(As)
+tree_reduce!(+, As)
+@assert isapprox(As[1], reduce((x,y)->x .+ y, Bs))
+```
+
+In the above implementation of `tree_reduce!` (which is designed to perform an
+elementwise reduction across a vector of arrays), we have a tree reduction
+operation where pairs of arrays are reduced, starting with neighboring pairs,
+and then reducing pairs of reduction results, etc. until the final result is in
+`As[1]`. We can see that the application of Dagger to this algorithm is simple -
+only the single `Base.mapreducedim!` call is passed to Dagger - yet due to the
+data dependencies and the algorithm's structure, there should be plenty of
+parallelism to be exploited across each of the parallel reductions at each
+"level" of the reduction tree. Specifically, any two `Dagger.@spawn` calls
+which access completely different pairs of arrays can execute in parallel,
+while any call which has an `In` on an array will wait for any previous call
+which has an `InOut` on that same array.
+
+Additionally, we can notice a powerful feature of this model - if the
+`Dagger.@spawn` macro is removed, the code still remains correct, but simply
+runs sequentially. This means that the structure of the program doesn't have to
+change in order to use Dagger for parallelization, which can make applying
+Dagger to existing algorithms quite effortless.
diff --git a/src/Dagger.jl b/src/Dagger.jl
@@ -34,6 +34,7 @@ include("queue.jl")
 include("thunk.jl")
 include("submission.jl")
 include("chunks.jl")
+include("memory-spaces.jl")
 
 # Task scheduling
 include("compute.jl")
@@ -42,6 +43,9 @@ include("utils/system_uuid.jl")
 include("utils/caching.jl")
 include("sch/Sch.jl"); using .Sch
 
+# Data dependency task queue
+include("datadeps.jl")
+
 # Array computations
 include("array/darray.jl")
 include("array/alloc.jl")