
Commit: Add documentation

avik-pal committed Apr 6, 2024
1 parent 6293e2a commit 5bd71be
Showing 7 changed files with 96 additions and 10 deletions.
6 changes: 4 additions & 2 deletions docs/make.jl
@@ -38,14 +38,16 @@ pages = [
"manual/freezing_model_parameters.md",
"manual/gpu_management.md",
"manual/migrate_from_flux.md",
"manual/weight_initializers.md"
"manual/weight_initializers.md",
"manual/distributed_utils.md"
],
"API Reference" => [
"Lux" => [
"api/Lux/layers.md",
"api/Lux/utilities.md",
"api/Lux/contrib.md",
"api/Lux/switching_frameworks.md"
"api/Lux/switching_frameworks.md",
"api/Lux/distributed_utils.md",
],
"Accelerator Support" => [
"api/Accelerator_Support/LuxAMDGPU.md",
9 changes: 6 additions & 3 deletions docs/src/.vitepress/config.mts
@@ -73,7 +73,8 @@ export default defineConfig({
{ text: 'Built-In Layers', link: '/api/Lux/layers' },
{ text: 'Utilities', link: '/api/Lux/utilities' },
{ text: 'Experimental', link: '/api/Lux/contrib' },
{ text: 'InterOp', link: '/api/Lux/switching_frameworks' }
{ text: 'InterOp', link: '/api/Lux/switching_frameworks' },
{ text: 'DistributedUtils', link: '/api/Lux/distributed_utils' }
]
},
{
@@ -146,7 +147,8 @@ export default defineConfig({
{ text: 'Freezing Model Parameters', link: '/manual/freezing_model_parameters' },
{ text: 'GPU Management', link: '/manual/gpu_management' },
{ text: 'Migrating from Flux to Lux', link: '/manual/migrate_from_flux' },
{ text: 'Initializing Weights', link: '/manual/weight_initializers' }]
{ text: 'Initializing Weights', link: '/manual/weight_initializers' },
{ text: 'Distributed Data Parallel Training', link: '/manual/distributed_utils' },]
},
"/api/": {
text: 'API Reference', collapsed: false, items: [
@@ -155,7 +157,8 @@
{ text: 'Built-In Layers', link: '/api/Lux/layers' },
{ text: 'Utilities', link: '/api/Lux/utilities' },
{ text: 'Experimental Features', link: '/api/Lux/contrib' },
{ text: 'Switching between Deep Learning Frameworks', link: '/api/Lux/switching_frameworks' }]
{ text: 'Switching between Deep Learning Frameworks', link: '/api/Lux/switching_frameworks' },
{ text: 'DistributedUtils', link: '/api/Lux/distributed_utils' }]
},
{
text: 'Accelerator Support', collapsed: false, items: [
58 changes: 58 additions & 0 deletions docs/src/api/Lux/distributed_utils.md
@@ -0,0 +1,58 @@
# Distributed Utils

!!! note

These functionalities are available via the `Lux.DistributedUtils` module.

```@meta
CurrentModule = Lux
```

## Index

```@index
Pages = ["distributed_utils.md"]
```

## Backends

```@docs
MPIBackend
NCCLBackend
```

## Initialization

```@docs
DistributedUtils.initialize
DistributedUtils.initialized
DistributedUtils.get_distributed_backend
```
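
For instance, a minimal initialization sketch (MPI.jl must be loaded so the MPI backend extension is active; the `NCCLBackend` path is analogous but additionally requires CUDA GPUs — treat this as an illustration, not a complete program):

```julia
using MPI, Lux                                   # loading MPI.jl enables the MPI backend
using Lux: DistributedUtils, MPIBackend

# One-time setup on every process, then fetch the backend handle.
DistributedUtils.initialize(MPIBackend)
backend = DistributedUtils.get_distributed_backend(MPIBackend)

DistributedUtils.initialized(MPIBackend)         # should now return true
```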

## Helper Functions

```@docs
DistributedUtils.local_rank
DistributedUtils.total_workers
```
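
A small sketch of how these helpers are typically used, assuming `backend` was obtained as in the initialization example above:

```julia
rank   = DistributedUtils.local_rank(backend)     # rank of this process (0 is the root)
nranks = DistributedUtils.total_workers(backend)  # number of participating processes

# Restrict logging to the root process to avoid duplicated output.
rank == 0 && @info "Distributed training enabled" nranks
```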

## Communication Primitives

```@docs
DistributedUtils.allreduce!
DistributedUtils.bcast!
DistributedUtils.reduce!
DistributedUtils.synchronize!!
```
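
A sketch of how the primitives compose, reusing `backend` and `rank` from the examples above; the exact call forms are indicative and follow the docstrings:

```julia
# Broadcast a buffer from the root rank to every other rank (in place).
weights = rank == 0 ? collect(Float32, 1:4) : zeros(Float32, 4)
DistributedUtils.bcast!(backend, weights; root=0)

# Sum a gradient-like buffer across all ranks; every rank receives the reduced result.
grads = fill(Float32(rank), 8)
DistributedUtils.allreduce!(backend, grads, +)
```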

## Optimisers.jl Integration

```@docs
DistributedUtils.DistributedOptimizer
```
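
A sketch of wrapping an Optimisers.jl rule; `ps` is a placeholder for already-synchronized model parameters, and Optimisers.jl must be loaded as the docstring's danger note requires:

```julia
using Optimisers

opt = DistributedUtils.DistributedOptimizer(backend, Optimisers.Adam(3.0f-4))
opt_state = Optimisers.setup(opt, ps)
# Synchronize the optimizer state so every rank starts from the same point.
opt_state = DistributedUtils.synchronize!!(backend, opt_state; root=0)
```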

## MLUtils.jl Integration

```@docs
DistributedUtils.DistributedDataContainer
```
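
A sketch of sharding a dataset; `X` is a placeholder array, and MLUtils.jl must be loaded as the docstring's danger note requires:

```julia
using MLUtils

X = randn(Float32, 10, 1024)   # 1024 observations; any MLUtils-compatible container works
dist_data = DistributedUtils.DistributedDataContainer(backend, X)

# Each rank now iterates only over its own shard of the observations.
loader = MLUtils.DataLoader(dist_data; batchsize=32, shuffle=true)
```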
15 changes: 15 additions & 0 deletions docs/src/manual/distributed_utils.md
@@ -0,0 +1,15 @@
# Distributed Data Parallel Training

!!! tip

For a fully functional example, see the
[ImageNet Training Example](https://github.com/LuxDL/Lux.jl/tree/main/examples/ImageNet)

Distributed data parallel (DDP) training using `Lux.DistributedUtils` is the spiritual successor to
[FluxMPI.jl](https://github.com/avik-pal/FluxMPI.jl), but has some key differences.

## Backends Supported

## Guide to Integrating DistributedUtils into your code
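
As a rough end-to-end sketch of the flow this guide will cover (the model, data, and hyperparameters below are placeholders; an MPI backend is assumed, with MPI.jl, Optimisers.jl, and MLUtils.jl loaded):

```julia
using MPI, MLUtils, Optimisers, Random, Lux
using Lux: DistributedUtils, MPIBackend

DistributedUtils.initialize(MPIBackend)
backend = DistributedUtils.get_distributed_backend(MPIBackend)

model = Chain(Dense(4 => 32, tanh), Dense(32 => 2))
ps, st = Lux.setup(Random.default_rng(), model)

# Start from identical parameters and states on every rank.
ps = DistributedUtils.synchronize!!(backend, ps)
st = DistributedUtils.synchronize!!(backend, st)

# Shard the data and let the optimizer average gradients across ranks.
data = DistributedUtils.DistributedDataContainer(backend, randn(Float32, 4, 256))
opt = DistributedUtils.DistributedOptimizer(backend, Optimisers.Adam(3.0f-4))
opt_state = DistributedUtils.synchronize!!(backend, Optimisers.setup(opt, ps))
```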

## Main Differences from `FluxMPI.jl`
6 changes: 3 additions & 3 deletions ext/LuxMPINCCLExt.jl
@@ -8,15 +8,15 @@ using Setfield: @set!

function DistributedUtils.__initialize(
::Type{NCCLBackend}; cuda_devices=nothing, amdgpu_devices=missing)
DistributedUtils.NCCL_Initialized[] = true
@assert amdgpu_devices===missing "`AMDGPU` is not supported by `NCCL`."
DistributedUtils.__initialize(Val(:MPI); cuda_devices, amdgpu_devices)
DistributedUtils.__initialize(MPIBackend; cuda_devices, amdgpu_devices)
DistributedUtils.NCCL_Initialized[] = true
return
end

function DistributedUtils.__get_distributed_backend(::Type{NCCLBackend})
unique_id = NCCL.UniqueID() # Generate on all ranks to know the type
mpi_backend = DistributedUtils.__get_distributed_backend(Val(:MPI))
mpi_backend = DistributedUtils.__get_distributed_backend(MPIBackend)
buf = [unique_id.internal...]
DistributedUtils.bcast!(mpi_backend, buf; root=0)
@set! unique_id.internal = Tuple(buf)
4 changes: 2 additions & 2 deletions src/distributed/backend.jl
@@ -4,7 +4,7 @@ abstract type AbstractLuxDistributedBackend end
MPIBackend(comm = nothing)
Create an MPI backend for distributed training. Users should not use this function directly.
Instead use [`DistributedUtils.get_distributed_backend(Val(:NCCL))`](@ref).
Instead use [`DistributedUtils.get_distributed_backend(MPIBackend)`](@ref).
"""
struct MPIBackend{C} <: AbstractLuxDistributedBackend
comm::C
@@ -21,7 +21,7 @@ end
NCCLBackend(comm = nothing, mpi_backend = nothing)
Create an NCCL backend for distributed training. Users should not use this function
directly. Instead use [`DistributedUtils.get_distributed_backend(Val(:NCCL))`](@ref).
directly. Instead use [`DistributedUtils.get_distributed_backend(NCCLBackend)`](@ref).
"""
struct NCCLBackend{C, M <: Union{Nothing, MPIBackend}} <: AbstractLuxDistributedBackend
comm::C
8 changes: 8 additions & 0 deletions src/distributed/public_api.jl
@@ -220,6 +220,10 @@ end
`data` must be compatible with `MLUtils` interface. The returned container is compatible
with `MLUtils` interface and is used to partition the dataset across the available
processes.
!!! danger
`MLUtils.jl` must be installed and loaded before using this.
"""
@concrete struct DistributedDataContainer
data
@@ -250,6 +254,10 @@ averages the gradients across the processes using Allreduce.
## Arguments
- `optimizer`: An Optimizer compatible with the Optimisers.jl package
!!! danger
`Optimisers.jl` must be installed and loaded before using this.
"""
function DistributedOptimizer(backend::AbstractLuxDistributedBackend, opt)
mod = Base.get_extension(@__MODULE__, :LuxOptimisersExt)