From 4dcc632183d3b9334a1a541bd04513cccf6bbabe Mon Sep 17 00:00:00 2001
From: Aleksandr Borzunov 
Date: Mon, 29 Nov 2021 22:38:28 +0300
Subject: [PATCH 01/19] deduplicate docs, fix typos, improve formatting

---
 docs/modules/optim.rst | 9 +-
 hivemind/optim/experimental/optimizer.py | 124 +++++++++++++----------
 hivemind/optim/grad_scaler.py | 8 +-
 3 files changed, 81 insertions(+), 60 deletions(-)

diff --git a/docs/modules/optim.rst b/docs/modules/optim.rst
index 8dd94b2b3..c1d53c20a 100644
--- a/docs/modules/optim.rst
+++ b/docs/modules/optim.rst
@@ -3,7 +3,10 @@
 .. raw:: html

- This module contains decentralized optimizers that wrap regular pytorch optimizers to collaboratively train a shared model. Depending on the exact type, optimizer may average model parameters with peers, exchange gradients, or follow a more complicated distributed training strategy.
+ This module contains decentralized optimizers that wrap your regular PyTorch Optimizer to train with peers.
+ Depending on the exact configuration, Optimizer may perform large synchronous updates equivalent to large-batch training,
+ or perform asynchronous local updates and average model parameters.
+

.. automodule:: hivemind.optim.experimental.optimizer @@ -13,7 +16,7 @@ ---------------------- .. autoclass:: Optimizer - :members: step, zero_grad, load_state_from_peers, param_groups, shutdown + :members: step, local_epoch, zero_grad, load_state_from_peers, param_groups, shutdown :member-order: bysource .. currentmodule:: hivemind.optim.grad_scaler @@ -26,7 +29,7 @@ .. raw:: html - CollaborativeOptimizer is a legacy version of hivemind.Optimizer. **For new projects, please use hivemind.Optimizer.** + CollaborativeOptimizer is a legacy version of hivemind.Optimizer. __For new projects, please use hivemind.Optimizer__. Currently, hivemind.Optimizer supports all the features of CollaborativeOptimizer and then some. CollaborativeOptimizer will still be supported for awhile, but will eventually be deprecated.

diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index 44b75b2a5..5c5b8b470 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -31,91 +31,100 @@ class Optimizer(torch.optim.Optimizer): """ Hivemind Optimizer wraps your regular PyTorch Optimizer for training collaboratively with peers. - By default, Optimizer is configured to be exactly **equivalent to synchronous training** with target_batch_size; + + By default, Optimizer is configured to be exactly **equivalent to synchronous training** with target_batch_size. There are advanced options make training semi-asynchronous (delay_optimizer_step and delay_gradient_averaging) - or even fully asynchronous (local_updates=True). However, these options require careful tuning. + or even fully asynchronous (use_local_updates=True). - :example: The Optimizer can be used as a drop-in replacement for your regular PyTorch Optimizer: + :example: The Optimizer can be used as a drop-in replacement for a regular PyTorch Optimizer: >>> model = transformers.AutoModel("albert-xxlarge-v2") >>> dht = hivemind.DHT(initial_peers=INITIAL_PEERS, start=True) - >>> opt = hivemind.Optimizer(dht, run_id="run_42", optimizer=lambda params: torch.optim.Adam(params, ...), - params=model.parameters(), target_batch_size=4096, batch_size_per_step=4) - >>> # alternative: opt = hivemind.Optimizer(dht, run_id="run_42", optimizer=torch.optim.Adam(model.parameters()) + >>> opt = hivemind.Optimizer(dht, run_id="run_42", params=model.parameters(), + >>> optimizer=lambda params: torch.optim.Adam(params, **other_options), + >>> target_batch_size=4096, batch_size_per_step=4) >>> while True: >>> loss = compute_loss_on_batch(model, batch_size=4) >>> opt.zero_grad() >>> loss.backward() >>> opt.step() # <-- train collaboratively with any peers that use the same prefix (run_42) - However, unlike regular optimizers, calling opt.step with hivemind.Optimizer can do one of the following: - - - accumulate a minibatch of gradients towards the (global) target batch size, without updating parameters yet; - - after accumulating the target batch size, all-reduce gradients with peers and perform optimizer step; - - if your peer lags behind the rest of the swarm, it will download latest state from other peers; - - :example: the optimizer has many keyword arguments that may be difficult to understand in one go. Here's quickstart - that will help you setup your first synchronous optimizer. 
-
- >>> hivemind.Optimizer(
- >>> dht=hivemind.DHT(initial_peers=ADDRESS_HERE, client_mode=TRUE_IF_BEHIND_FIREWALL_OR_UNRELIABLE, start=True),
- >>> run_id="a_unique_name_that_every_participant_will_see_when_training",
- >>> batch_size_per_step=ACTUAL_BATCH_SIZE_OF_THIS_PEER,
- >>> target_batch_size=LARGE_GLOBAL_BATCH, # global batch will be this or *slightly* larger due to stragglers;
- >>> # peers should finish averaging in roughly half the time they need to accumulate this batch between them
- >>> optimizer=lambda params: AnyPyTorchOptimizer(params, **config_that_makes_sense_for_target_batch_size),
- >>> # ^-- scale learning rate for your target_batch_size; good reference: https://arxiv.org/abs/1904.00962
- >>> offload_optimizer=True, # this saves GPU memory; large-batch training does not need optimizer that often
- >>> scheduler=lambda opt: AnyPytTorchScheduler(opt, **config_that_makes_sense_for_target_batch_size),
- >>> # scheduler.step will be called once every time peers collectively accumulate target_batch_size
- >>> matchmaking_time=15.0, averaging_timeout=60.0, # <-- if the network is fast reduce to 3-5s and 10-15s
- >>> # increase matchmaking_time if at least 25% of the time you see "averaged gradients with <...> peers",
- >>> # ... but N is less than 0.9x the actual number of peers. Increase averaging_timeout if half of the epochs
- >>> # ... print "Proceeding with local gradients" instead of "Averaged gradients with N peers"
+ By default, peers will perform the following steps:
+
+ * accumulate a minibatch of gradients towards the (global) target batch size, without updating parameters yet;
+ * after peers collectively accumulate target_batch_size, average gradients with peers and perform optimizer step;
+ * if your peer lags behind the rest of the swarm, it will download parameters and optimizer state from others;
+
+ Unlike regular training, your device may join midway through training, when other peers have already made some progress.
+ For this reason, any learning rate schedulers, curriculum and other **time-dependent features should be based on**
+ ``optimizer.local_epoch`` (and not the number of calls to opt.step). Otherwise, peers that joined training late
+ may end up having different learning rates. To do so automatically, specify ``scheduler=...`` parameter below.
+
+
+ :Configuration guide: This guide will help you set up your first collaborative training run. It covers the most
+ important basic options, but ignores features that require significant changes to the training code.
+
+ >>> dht = hivemind.DHT(initial_peers=INITIAL_PEERS, client_mode=IF_BEHIND_FIREWALL_OR_VERY_UNRELIABLE, start=True)
+ >>> opt = hivemind.Optimizer(
+ >>> dht=dht, run_id="a_unique_name_that_every_participant_will_see_when_training",
+ >>> batch_size_per_step=ACTUAL_BATCH_SIZE_OF_THIS_PEER, target_batch_size=LARGE_GLOBAL_BATCH,
+ >>> # ^--- Each global optimizer step will use gradients from 1x-1.1x of target_batch_size (due to latency);
+ >>> # It is recommended to train with very large batch sizes to reduce the % of time spent on communication.
+ >>>
+ >>> params=params, optimizer=lambda params: AnyPyTorchOptimizer(params, **hyperparams_for_target_batch_size),
+ >>> # tune learning rate for your target_batch_size.
Here's a good reference: https://arxiv.org/abs/1904.00962
+ >>> scheduler=lambda opt: AnyPyTorchScheduler(opt, **hyperparams_for_target_batch_size),
+ >>> # scheduler.step will be called automatically each time peers collectively accumulate target_batch_size
+ >>>
+ >>> offload_optimizer=True, # saves GPU memory, but increases RAM usage; generally a good practice to use this.
+ >>> delay_grad_averaging=OPTIONAL, delay_optimizer_step=OPTIONAL, # train faster, but with 1 round of staleness;
+ >>> # setting both to True is equivalent to Delayed Parameter Updates (see https://arxiv.org/abs/2101.06840)
+ >>>
 >>> grad_compression=hivemind.Float16Compression(), state_averaging_compression=hivemind.Float16Compression(),
- >>> # it is generally fine to use pure 16-bit or even lower precision during communication with no precaution;
- >>> # See hivemind/examples/albert for an example of mixed 8-bit compression.
- >>> delay_grad_averaging=SHOULD_I_USE_DPU, delay_optimizer_step=SHOULD_I_USE_DPU, # DPU stands for Delayed Para-
- >>> # -meter Updates, running allreduce and optimizer step in background. See https://arxiv.org/abs/2101.06840
- >>> verbose=True # periodically report the training progress to the console
+ >>> # ^-- it is usually fine to use pure 16-bit or even lower precision during communication with no precaution;
+ >>> # See hivemind/examples/albert for a working example of mixed 8/16-bit compression.
+ >>>
+ >>> matchmaking_time=15.0, # 3-5s for small local runs, 10-15s for training over the internet or with many peers
+ >>> averaging_timeout=60.0, # around 2x the actual time it takes to run all-reduce
+ >>> verbose=True # periodically report the training progress to the console (e.g. "Averaged with N peers")
 >>> ) # and you're done!

- :note: hivemind.Optimizer can be used the same way any other pytorch optimizer, but there is one caveat:
- learning rate schedulers, curriculum and other **time-dependent features should depend on Optimizer.local_epoch**
- (and not the number ot calls to opt.step). This is because peers are allowed to join midway through training,
- when others have already made some progress and changed their learning rates accordingly.

- :param dht: a running hivemind.DHT instance connected to other peers
+ :param dht: a running hivemind.DHT instance connected to other peers.
 :param run_id: a unique identifier of this training run, used as a common prefix for all DHT keys.
 **Note:** peers with the same run_id should *generally* train the same model and use compatible configurations.
 Some options can be safely changed by individual peers: ``batch_size_per_step``, ``client_mode``, ``auxiliary``,
 ``reuse_grad_buffers``, ``offload_optimizer``, and ``verbose``. In some cases, other options may also be tuned
 individually by each peer, but they should be changed with caution to avoid deadlocks or convergence issues.

- :param target_batch_size: global batch size that must be accumulated before the swarm transitions to the next epoch
- :param batch_size_per_step: before each call to .step, user should accumulate gradients over this many samples
+ :param target_batch_size: global batch size that must be accumulated before the swarm transitions to the next epoch.
+ The actual batch may be *slightly* larger due to asynchrony (e.g. peers submit more gradients in the last second).
+ :param batch_size_per_step: you should accumulate gradients over this many samples between calls to optimizer.step.
- :param optimizer: a callable(parameters) -> pytorch.optim.Optimizer or a pre-initialized PyTorch optimizer
- **Note:** some advanced options like offload_optimizer, delay_optimizer_step, or delay_grad_averaging are not
- supported if hivemind.optimizer is created with a pre-initialized optimizer and require optimizer factory
- :param params: parameters or param groups for the optimizer; required if optimizer is a callable(params)
+ :param params: parameters or param groups for the optimizer; required if optimizer is a callable(params).
+ :param optimizer: a callable(parameters) -> pytorch.optim.Optimizer or a pre-initialized PyTorch optimizer.
+ **Note:** some advanced options like offload_optimizer, delay_optimizer_step, or delay_grad_averaging require
+ the callable form and will not work if hivemind.Optimizer is created with a pre-existing PyTorch Optimizer.
 :param scheduler: callable(optimizer) -> PyTorch LRScheduler or a pre-initialized PyTorch scheduler.
 The learning rate scheduler will adjust learning rate based on global epoch, not the number of local calls to
 optimizer.step; this is required to keep different peers synchronized.

- :param matchmaking_time: when looking for group, wait for peers to join for up to this many seconds
- :param averaging_timeout: if an averaging step hangs for this long, it will be cancelled.
- :param load_state_timeout: wait for at most this many seconds before giving up on load_state_from_peers
+ :param matchmaking_time: when looking for group, wait for peers to join for up to this many seconds.
+ Increase if you see "averaged gradients with N peers" where N is below 0.9x the real size on >=25% of epochs.
+ When training with a low-latency network, decreasing matchmaking_time allows training with smaller batch sizes.
+ :param averaging_timeout: if an averaging step hangs for this long, it will be cancelled automatically.
+ Increase averaging_timeout if you see "Proceeding with local gradients" at least 25% of the time.
+ Do not set this timeout too high, as it may cause your optimizer to hang after some types of network errors.
+ :param load_state_timeout: wait for at most this many seconds before giving up on load_state_from_peers.
 :param reuse_grad_buffers: if True, use model's .grad buffers for gradient accumulation.
 This is more memory efficient, but it requires that the user does *NOT* call model/opt zero_grad at all
 :param offload_optimizer: offload the optimizer to host memory, saving GPU memory for parameters and gradients
 :param delay_optimizer_step: run optimizer in background, apply results in future .step; requires offload_optimizer
 :param delay_grad_averaging: average gradients in background; requires offload_optimizer and delay_optimizer_step
+
 :param delay_state_averaging: if enabled (default), average parameters and extra tensors in a background thread;
 if set to False, average parameters synchronously within the corresponding hivemind.Optimizer.step call.
- The above 3 options (offload_optimizer, delay_optimizer_step and delay_grad_averaging) require that the optimizer
- is created with: ``hivemind.Optimizer(..., optimizer=callable_optimizer_factory, params=model.parameters())``
 :param average_state_every: average state (parameters, chosen opt tensors) with peers every this many **epochs**.
 Increasing this reduces the communication overhead, but can cause parameters to diverge if too large.
@@ -315,6 +324,11 @@ def is_alive(self) -> bool:
 @property
 def local_epoch(self) -> int:
+ """
+ This worker's current epoch, kept synchronized with peers. If a peer's local_epoch lags behind others, it will
+ automatically re-synchronize by downloading state from another peer.
+ An epoch corresponds to accumulating target_batch_size across all active devices.
+ """
 return self.state_averager.local_epoch

 @property
@@ -335,10 +349,10 @@ def step(
 Update training progress after accumulating another local batch size.
 Depending on the configuration, this will report progress to peers, run global or local optimizer step,
 average parameters or schedule background tasks.

- :param closure: A closure that reevaluates the model and returns the loss
- :param batch_size: optional override for batch_size_per_step from init
- :param grad_scaler: if amp is enabled, this **must** be a hivemind-aware gradient scaler
- :note: this .step is different from normal pytorch optimizers in several key ways. See __init__ for details.
+ :param closure: A closure that reevaluates the model and returns the loss.
+ :param batch_size: optional override for batch_size_per_step from init.
+ :param grad_scaler: if amp is enabled, this **must** be a hivemind-aware gradient scaler.
+ :note: this .step is more complex than normal pytorch optimizers in several key ways. See __init__ for details.
 """
 if grad_scaler is not None and not isinstance(grad_scaler, GradScaler):
 raise ValueError("hivemind.Optimizer requires a hivemind-aware gradient scaler (hivemind.GradScaler)")
diff --git a/hivemind/optim/grad_scaler.py b/hivemind/optim/grad_scaler.py
index eb7527ca3..b3342c557 100644
--- a/hivemind/optim/grad_scaler.py
+++ b/hivemind/optim/grad_scaler.py
@@ -21,10 +21,14 @@ class GradScaler(TorchGradScaler):
 :note: if not using reuse_grad_buffers=True, one can and *should* train normally without this class,
 e.g. using standard PyTorch AMP or Apex. This custom GradScaler is more memory-efficient,
 but requires custom training code.

- GradScaler removes several:
+ hivemind.GradScaler makes 3 modifications to the regular PyTorch AMP:
+
 - bypass .unscale_ and .update calls in order to accumulate gradients over several steps
 - limit increasing gradient scale to only immediately after global optimizer steps
- - allow training with some or all master parameters in fp16
+ - allow training with some or all master parameters in float16
+
+ :note: The above modifications will be enabled automatically. One can (and should) use hivemind.GradScaler exactly
+ as regular ``torch.amp.GradScaler``.
""" def __init__(self, *args, **kwargs): From 5bac846d9c61defd89dc539f169775860bf2c990 Mon Sep 17 00:00:00 2001 From: justheuristic Date: Mon, 29 Nov 2021 22:56:39 +0300 Subject: [PATCH 02/19] switch quickstart.md to the new Optimizer --- docs/user/quickstart.md | 58 +++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/docs/user/quickstart.md b/docs/user/quickstart.md index f81e1af18..fb46d8d0d 100644 --- a/docs/user/quickstart.md +++ b/docs/user/quickstart.md @@ -47,26 +47,27 @@ model = nn.Sequential(nn.Conv2d(3, 16, (5, 5)), nn.MaxPool2d(2, 2), nn.ReLU(), nn.Flatten(), nn.Linear(32 * 5 * 5, 10)) opt = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9) - # Create DHT: a decentralized key-value storage shared between peers dht = hivemind.DHT(start=True) print("To join the training, use initial_peers =", [str(addr) for addr in dht.get_visible_maddrs()]) # Set up a decentralized optimizer that will average with peers in background -opt = hivemind.optim.DecentralizedOptimizer( - opt, # wrap the SGD optimizer defined above - dht, # use a DHT that is connected with other peers - average_parameters=True, # periodically average model weights in opt.step - average_gradients=False, # do not average accumulated gradients - prefix='my_cifar_run', # unique identifier of this collaborative run - target_group_size=16, # maximum concurrent peers for this run +opt = hivemind.Optimizer( + dht=dht, # use a DHT that is connected with other peers + run_id='my_cifar_run', # unique identifier of this collaborative run + optimizer=opt, # wrap the SGD optimizer defined above + use_local_updates=True, # perform optimizer steps with local gradients, average parameters in background + batch_size_per_step=32, # each call to opt.step adds this many samples towards the next epoch + target_batch_size=10000, # move to next epoch after peers collectively process this many samples + matchmaking_time=3.0, # when averaging parameters, gather peers in background for up to this many seconds + averaging_timeout=10.0, # give up on averaging if not successful in this many seconds verbose=True # print logs incessently ) -# Note: if you intend to use GPU, switch to it only after the decentralized optimizer is created +# Note: if you intend to use GPU, switch to it only after the decentralized optimizer is created with tqdm() as progressbar: while True: - for x_batch, y_batch in torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=256): + for x_batch, y_batch in torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=4): opt.zero_grad() loss = F.cross_entropy(model(x_batch), y_batch) loss.backward() @@ -78,7 +79,7 @@ with tqdm() as progressbar: As you can see, this code is regular PyTorch with one notable exception: it wraps your regular optimizer with a -`DecentralizedOptimizer`. This optimizer uses `DHT` to find other peers and tries to exchange weights them. When you run +`hivemind.Optimizer`. This optimizer uses `DHT` to find other peers and tries to exchange parameters them. When you run the code (please do so), you will see the following output: ```shell @@ -86,7 +87,7 @@ To join the training, use initial_peers = ['/ip4/127.0.0.1/tcp/XXX/p2p/YYY'] [...] Starting a new averaging round with current parameters. ``` -This is `DecentralizedOptimizer` telling you that it's looking for peers. Since there are no peers, we'll need to create +This is `hivemind.Optimizer` telling you that it's looking for peers. 
Since there are no peers, we'll need to create
them ourselves.

 Copy the entire script (or notebook) and modify this line:

@@ -123,26 +124,28 @@ model = nn.Sequential(nn.Conv2d(3, 16, (5, 5)), nn.MaxPool2d(2, 2), nn.ReLU(),
 opt = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

 # Create DHT: a decentralized key-value storage shared between peers
-dht = hivemind.DHT(initial_peers=[COPY_FROM_ANOTHER_PEER_OUTPUTS], start=True)
+dht = hivemind.DHT(initial_peers=[COPY_FROM_OTHER_PEERS_OUTPUTS], start=True)
 print("To join the training, use initial_peers =", [str(addr) for addr in dht.get_visible_maddrs()])

 # Set up a decentralized optimizer that will average with peers in background
-opt = hivemind.optim.DecentralizedOptimizer(
- opt, # wrap the SGD optimizer defined above
- dht, # use a DHT that is connected with other peers
- average_parameters=True, # periodically average model weights in opt.step
- average_gradients=False, # do not average accumulated gradients
- prefix='my_cifar_run', # unique identifier of this collaborative run
- target_group_size=16, # maximum concurrent peers for this run
+opt = hivemind.Optimizer(
+ dht=dht, # use a DHT that is connected with other peers
+ run_id='my_cifar_run', # unique identifier of this collaborative run
+ optimizer=opt, # wrap the SGD optimizer defined above
+ use_local_updates=True, # perform optimizer steps with local gradients, average parameters in background
+ batch_size_per_step=32, # each call to opt.step adds this many samples towards the next epoch
+ target_batch_size=10000, # move to next epoch after all peers collectively process this many samples
+ matchmaking_time=3.0, # when averaging parameters, gather peers in background for up to this many seconds
+ averaging_timeout=10.0, # give up on averaging if not successful in this many seconds
 verbose=True # print logs incessantly
 )
-opt.averager.load_state_from_peers()
+opt.load_state_from_peers()

 # Note: if you intend to use GPU, switch to it only after the decentralized optimizer is created
 with tqdm() as progressbar:
 while True:
- for x_batch, y_batch in torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=256):
+ for x_batch, y_batch in torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=32):
 opt.zero_grad()
 loss = F.cross_entropy(model(x_batch), y_batch)
 loss.backward()
@@ -166,17 +169,16 @@
 This message means that the optimizer has averaged model parameters with another peer during one of the calls to `opt.step()`. You can start more peers by replicating the same code as the second peer, using either the first or second peer as `initial_peers`.

-The only issue with this code is that each new peer starts with a different untrained network blends its un-trained
-parameters with other peers, reseting their progress. You can see this effect as a spike increase in training loss
-immediately after new peer joins training. To avoid this problem, the second peer can download the
-current model/optimizer state from an existing peer right before it begins training on minibatches:
+Each new peer starts with an untrained network and must download the latest training state before it can contribute.
+By default, a peer will automatically detect that it is out of sync and start ``Downloading parameters from peer <...>``.
+To avoid wasting the first optimizer step, one can manually download the latest model/optimizer state right before the peer begins training on minibatches:

```python
-opt.averager.load_state_from_peers()
+opt.load_state_from_peers()
```

 Congrats, you've just started a pocket-sized experiment with decentralized deep learning!

-However, this is just the bare minimum of what hivemind can do. In [this example](https://github.com/learning-at-home/hivemind/tree/master/examples/albert),
+However, this is just the basics of what hivemind can do. In [this example](https://github.com/learning-at-home/hivemind/tree/master/examples/albert),
 we show how to use a more advanced version of DecentralizedOptimizer to collaboratively train a large Transformer over the internet.

 If you want to learn more about each individual component,

From ce29189aae06d6fc8bd86f62b312799b865c3a45 Mon Sep 17 00:00:00 2001
From: justheuristic 
Date: Mon, 29 Nov 2021 23:00:46 +0300
Subject: [PATCH 03/19] fix markdown

---
 docs/modules/optim.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/modules/optim.rst b/docs/modules/optim.rst
index c1d53c20a..2ab5d1c3a 100644
--- a/docs/modules/optim.rst
+++ b/docs/modules/optim.rst
@@ -29,9 +29,9 @@
 .. raw:: html

- CollaborativeOptimizer is a legacy version of hivemind.Optimizer. __For new projects, please use hivemind.Optimizer__.
+ CollaborativeOptimizer is a legacy version of hivemind.Optimizer. For new projects please use hivemind.Optimizer.
 Currently, hivemind.Optimizer supports all the features of CollaborativeOptimizer and then some.
- CollaborativeOptimizer will still be supported for awhile, but will eventually be deprecated.
+ CollaborativeOptimizer will still be supported for a while, but eventually it will be deprecated.

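The grad_scaler.py changes in the first patch describe how the hivemind-aware ``GradScaler`` is meant to replace the stock PyTorch AMP scaler, but none of the patches show it inside a training loop. The sketch below is one plausible way to wire it together with ``reuse_grad_buffers=True`` as documented above; the toy model, synthetic data, run_id and batch sizes are illustrative assumptions, and the exact AMP call order should be double-checked against the official hivemind examples (e.g. examples/albert).

```python
import torch
import torch.nn.functional as F
import hivemind

# Toy setup; only the AMP + hivemind wiring matters here (assumes a CUDA device is available).
model = torch.nn.Linear(512, 10).cuda()
dht = hivemind.DHT(start=True)  # first peer; other peers would pass initial_peers=[...]

opt = hivemind.Optimizer(
    dht=dht, run_id="amp_demo_run", batch_size_per_step=32, target_batch_size=4096,
    params=model.parameters(), optimizer=lambda params: torch.optim.Adam(params),
    reuse_grad_buffers=True,  # gradients accumulate directly in .grad buffers, so zero_grad must NOT be called
    verbose=True,
)
scaler = hivemind.GradScaler()  # used in place of torch.cuda.amp.GradScaler, per the docstring in patch 01

while True:
    x_batch = torch.randn(32, 512, device="cuda")
    y_batch = torch.randint(0, 10, (32,), device="cuda")
    with torch.cuda.amp.autocast():
        loss = F.cross_entropy(model(x_batch), y_batch)
    scaler.scale(loss).backward()
    opt.step(grad_scaler=scaler)  # step() accepts a hivemind-aware scaler (see the step() docstring above)
    scaler.update()               # bypassed internally except right after global optimizer steps
    # note: no opt.zero_grad() here -- reuse_grad_buffers=True forbids it
```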
From 8c19c963e8fb86e794293143707f846aa8765542 Mon Sep 17 00:00:00 2001 From: justheuristic Date: Mon, 29 Nov 2021 23:06:15 +0300 Subject: [PATCH 04/19] reduce diff --- hivemind/optim/experimental/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index 5c5b8b470..e42e4c046 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -352,7 +352,7 @@ def step( :param closure: A closure that reevaluates the model and returns the loss. :param batch_size: optional override for batch_size_per_step from init. :param grad_scaler: if amp is enabled, this **must** be a hivemind-aware gradient scaler. - :note: this .step is more complex than normal pytorch optimizers in several key ways. See __init__ for details. + :note: this .step is different from normal pytorch optimizers in several key ways. See __init__ for details. """ if grad_scaler is not None and not isinstance(grad_scaler, GradScaler): raise ValueError("hivemind.Optimizer requires a hivemind-aware gradient scaler (hivemind.GradScaler)") From 1923f066be1394975331f7ef1d5f1263325bd90f Mon Sep 17 00:00:00 2001 From: justheuristic Date: Mon, 29 Nov 2021 23:12:35 +0300 Subject: [PATCH 05/19] hopefully improve RTD repr for compression --- hivemind/compression/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hivemind/compression/base.py b/hivemind/compression/base.py index 258779cef..b39609e23 100644 --- a/hivemind/compression/base.py +++ b/hivemind/compression/base.py @@ -65,6 +65,9 @@ def estimate_compression_ratio(self, info: CompressionInfo) -> float: """Estimate the compression ratio without doing the actual compression; lower ratio = better compression""" ... 
+ def __repr__(self): + return f"hivemind.{self.__class__.__name__}()" + class NoCompression(CompressionBase): """A dummy compression strategy that preserves the original tensor as is.""" From 4827db6309a9aecf972bae151509baebf994ca0d Mon Sep 17 00:00:00 2001 From: justheuristic Date: Mon, 29 Nov 2021 23:19:40 +0300 Subject: [PATCH 06/19] fix typo in example --- hivemind/optim/experimental/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index e42e4c046..a87cf5e37 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -40,7 +40,7 @@ class Optimizer(torch.optim.Optimizer): >>> model = transformers.AutoModel("albert-xxlarge-v2") >>> dht = hivemind.DHT(initial_peers=INITIAL_PEERS, start=True) - >>> opt = hivemind.Optimizer(dht, run_id="run_42", params=model.parameters(), + >>> opt = hivemind.Optimizer(dht=dht, run_id="run_42", params=model.parameters(), >>> optimizer=lambda params: torch.optim.Adam(params, **other_options), >>> target_batch_size=4096, batch_size_per_step=4) >>> while True: From edf9874d84e264d3f029901bcc12df19626bb533 Mon Sep 17 00:00:00 2001 From: justheuristic Date: Mon, 29 Nov 2021 23:50:46 +0300 Subject: [PATCH 07/19] batch size 32 --- docs/user/quickstart.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/quickstart.md b/docs/user/quickstart.md index fb46d8d0d..54200a573 100644 --- a/docs/user/quickstart.md +++ b/docs/user/quickstart.md @@ -67,7 +67,7 @@ opt = hivemind.Optimizer( # Note: if you intend to use GPU, switch to it only after the decentralized optimizer is created with tqdm() as progressbar: while True: - for x_batch, y_batch in torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=4): + for x_batch, y_batch in torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=32): opt.zero_grad() loss = F.cross_entropy(model(x_batch), y_batch) loss.backward() From 3dbf881abcacd6c7fd4834e13294cfae58e20edd Mon Sep 17 00:00:00 2001 From: justheuristic Date: Mon, 29 Nov 2021 23:51:44 +0300 Subject: [PATCH 08/19] paraphrase --- docs/user/quickstart.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/quickstart.md b/docs/user/quickstart.md index 54200a573..f039308b8 100644 --- a/docs/user/quickstart.md +++ b/docs/user/quickstart.md @@ -178,7 +178,7 @@ opt.load_state_from_peers() Congrats, you've just started a pocket-sized experiment with decentralized deep learning! -However, this is just the basics of what hivemind can do. In [this example](https://github.com/learning-at-home/hivemind/tree/master/examples/albert), +However, this is only the basics of what hivemind can do. In [this example](https://github.com/learning-at-home/hivemind/tree/master/examples/albert), we show how to use a more advanced version of DecentralizedOptimizer to collaboratively train a large Transformer over the internet. 
If you want to learn more about each individual component, From f3b8b85b2970a49b280edda2dac10d7b87df9ee9 Mon Sep 17 00:00:00 2001 From: justheuristic Date: Tue, 30 Nov 2021 17:25:09 +0300 Subject: [PATCH 09/19] Update hivemind/optim/experimental/optimizer.py Co-authored-by: Alexander Borzunov --- hivemind/optim/experimental/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index a87cf5e37..8b0a04811 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -30,7 +30,7 @@ class Optimizer(torch.optim.Optimizer): """ - Hivemind Optimizer wraps your regular PyTorch Optimizer for training collaboratively with peers. + hivemind.Optimizer wraps your regular PyTorch Optimizer for training collaboratively with peers. By default, Optimizer is configured to be exactly **equivalent to synchronous training** with target_batch_size. There are advanced options make training semi-asynchronous (delay_optimizer_step and delay_gradient_averaging) From d10ed571bd647d1068eb9b3f06068917d161e861 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Tue, 30 Nov 2021 19:07:19 +0300 Subject: [PATCH 10/19] reivew --- docs/modules/optim.rst | 7 ------- hivemind/optim/collaborative.py | 4 ++++ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/modules/optim.rst b/docs/modules/optim.rst index 2ab5d1c3a..a07984392 100644 --- a/docs/modules/optim.rst +++ b/docs/modules/optim.rst @@ -27,13 +27,6 @@ **CollaborativeOptimizer** -------------------------- -.. raw:: html - - CollaborativeOptimizer is a legacy version of hivemind.Optimizer. For new projects please use hivemind.Optimizer. - Currently, hivemind.Optimizer supports all the features of CollaborativeOptimizer and then some. - CollaborativeOptimizer will still be supported for a while, but eventually it will be deprecated. -

- .. automodule:: hivemind.optim.collaborative .. currentmodule:: hivemind.optim diff --git a/hivemind/optim/collaborative.py b/hivemind/optim/collaborative.py index d5869dce8..be7f925dd 100644 --- a/hivemind/optim/collaborative.py +++ b/hivemind/optim/collaborative.py @@ -57,6 +57,10 @@ class TrainingProgressSchema(BaseModel): class CollaborativeOptimizer(DecentralizedOptimizerBase): """ + :note: **For new projects please use hivemind.Optimizer**. CollaborativeOptimizer is an older version of that. + Currently, hivemind.Optimizer supports all the features of CollaborativeOptimizer and then some. + CollaborativeOptimizer will still be supported for a while, but eventually it will be deprecated. + An optimizer that performs model updates after collaboratively accumulating a target (large) batch size across peers These optimizers use DHT to track how much progress did the collaboration make towards target batch size. From f54bf9630371d150f34b43b41f4fb2c073c0ba4f Mon Sep 17 00:00:00 2001 From: justheuristic Date: Wed, 1 Dec 2021 02:43:21 +0300 Subject: [PATCH 11/19] fix bullets --- requirements-docs.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-docs.txt b/requirements-docs.txt index 1a5fa738d..25df3d480 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -1,3 +1,4 @@ recommonmark==0.5.0 sphinx_rtd_theme==0.4.3 +docutils==0.16 sphinx==4.2.0 From 2c858c247c549eaa8fe4aa0c539776d0a6bc5d9a Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Thu, 2 Dec 2021 13:58:19 +0300 Subject: [PATCH 12/19] review --- docs/user/quickstart.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user/quickstart.md b/docs/user/quickstart.md index f039308b8..5b6b927d1 100644 --- a/docs/user/quickstart.md +++ b/docs/user/quickstart.md @@ -142,7 +142,7 @@ opt = hivemind.Optimizer( opt.load_state_from_peers() -# Note: if you intend to use GPU, switch to it only after the decentralized optimizer is created +# Note: if you intend to use GPU, switch to it only after the optimizer is created with tqdm() as progressbar: while True: for x_batch, y_batch in torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=32): @@ -183,7 +183,7 @@ we show how to use a more advanced version of DecentralizedOptimizer to collabor If you want to learn more about each individual component, - Learn how to use `hivemind.DHT` using this basic [DHT tutorial](https://learning-at-home.readthedocs.io/en/latest/user/dht.html), -- Learn the underlying math behind DecentralizedOptimizer in - [(Li et al. 2020)](https://arxiv.org/abs/2005.00124) and [(Ryabinin et al. 2021)](https://arxiv.org/abs/2103.03239). +- Learn the underlying math behind hivemind.Optimizer in [Diskin et al., (2021)](https://arxiv.org/abs/2106.10207), + [Li et al. (2020)](https://arxiv.org/abs/2005.00124) and [Ryabinin et al. (2021)](https://arxiv.org/abs/2103.03239). 
- Read about setting up Mixture-of-Experts training in [this guide](https://learning-at-home.readthedocs.io/en/latest/user/moe.html), From c4011cf8cd0484d3366435afbcffa2110abfe202 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Thu, 2 Dec 2021 15:52:01 +0300 Subject: [PATCH 13/19] comment on epochs --- hivemind/optim/experimental/optimizer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index 8b0a04811..6f4849eae 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -60,6 +60,17 @@ class Optimizer(torch.optim.Optimizer): ``optimizer.local_epoch`` (and not the number ot calls to opt.step). Otherwise, peers that joined training late may end up having different learning rates. To do so automatically, specify ``scheduler=...`` parameter below. + :Note: hivemind.Optimizer uses the term `epoch` to describe a synchronization point. One epoch corresponds to the + time in which all participants collectively process some pre-defined number of samples (`target_batch_size`). + Like in PyTorch LR Scheduler, **epoch does not necessarily correspond to a full pass over the training data.** + At the end of epoch, peers perform synchronous actions such as averaging gradients for a global optimizer update, + (or just averaging parameters if using local updates). This ensures that adding or removing peers does not affect + per-epoch convergence. For instance, if the number of peers doubles, they will run all-reduce more frequently + to adjust for faster training. + + + + :Configuration guide: This guide will help you set up your first collaborative training run. It covers the most important basic options, but ignores features that require significant changes to the training code. From 6859078607a5668c0e896b029bbcfa2886b8efe8 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Thu, 2 Dec 2021 15:58:21 +0300 Subject: [PATCH 14/19] paraphrase --- hivemind/optim/experimental/optimizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index 6f4849eae..6c9d6696c 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -60,8 +60,8 @@ class Optimizer(torch.optim.Optimizer): ``optimizer.local_epoch`` (and not the number ot calls to opt.step). Otherwise, peers that joined training late may end up having different learning rates. To do so automatically, specify ``scheduler=...`` parameter below. - :Note: hivemind.Optimizer uses the term `epoch` to describe a synchronization point. One epoch corresponds to the - time in which all participants collectively process some pre-defined number of samples (`target_batch_size`). + :Note: hivemind.Optimizer uses the term ``epoch`` to describe intervals between synchronizations. One epoch + coresponds to processing certain number of training samples (`target_batch_size`) in total across all peers. Like in PyTorch LR Scheduler, **epoch does not necessarily correspond to a full pass over the training data.** At the end of epoch, peers perform synchronous actions such as averaging gradients for a global optimizer update, (or just averaging parameters if using local updates). 
This ensures that adding or removing peers does not affect @@ -70,8 +70,6 @@ class Optimizer(torch.optim.Optimizer): - - :Configuration guide: This guide will help you set up your first collaborative training run. It covers the most important basic options, but ignores features that require significant changes to the training code. From 807f88b2ab382fdf52fff91828ec768a64e7fcef Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Thu, 2 Dec 2021 16:00:23 +0300 Subject: [PATCH 15/19] review --- hivemind/optim/experimental/optimizer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index 6c9d6696c..a64ddac8e 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -40,9 +40,8 @@ class Optimizer(torch.optim.Optimizer): >>> model = transformers.AutoModel("albert-xxlarge-v2") >>> dht = hivemind.DHT(initial_peers=INITIAL_PEERS, start=True) - >>> opt = hivemind.Optimizer(dht=dht, run_id="run_42", params=model.parameters(), - >>> optimizer=lambda params: torch.optim.Adam(params, **other_options), - >>> target_batch_size=4096, batch_size_per_step=4) + >>> opt = hivemind.Optimizer(dht=dht, run_id="run_42", batch_size_per_step=4, target_batch_size=4096, + >>> params=model.parameters(), optimizer=lambda params: torch.optim.Adam(params)) >>> while True: >>> loss = compute_loss_on_batch(model, batch_size=4) >>> opt.zero_grad() @@ -68,8 +67,6 @@ class Optimizer(torch.optim.Optimizer): per-epoch convergence. For instance, if the number of peers doubles, they will run all-reduce more frequently to adjust for faster training. - - :Configuration guide: This guide will help you set up your first collaborative training run. It covers the most important basic options, but ignores features that require significant changes to the training code. From c5cf09a49f685b918da1835d0a18f186bbf75167 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Thu, 2 Dec 2021 16:02:44 +0300 Subject: [PATCH 16/19] review --- hivemind/optim/collaborative.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hivemind/optim/collaborative.py b/hivemind/optim/collaborative.py index be7f925dd..c071f07bf 100644 --- a/hivemind/optim/collaborative.py +++ b/hivemind/optim/collaborative.py @@ -58,8 +58,8 @@ class TrainingProgressSchema(BaseModel): class CollaborativeOptimizer(DecentralizedOptimizerBase): """ :note: **For new projects please use hivemind.Optimizer**. CollaborativeOptimizer is an older version of that. - Currently, hivemind.Optimizer supports all the features of CollaborativeOptimizer and then some. - CollaborativeOptimizer will still be supported for a while, but eventually it will be deprecated. + Currently, hivemind.Optimizer supports all the features of CollaborativeOptimizer and a many advanced ones. + CollaborativeOptimizer will still be supported for a while, but it will be deprecated eventually. 
An optimizer that performs model updates after collaboratively accumulating a target (large) batch size across peers From 8420e9b586103cb9a8ea05fc567c4fe07c723bd5 Mon Sep 17 00:00:00 2001 From: justheuristic Date: Thu, 2 Dec 2021 16:06:00 +0300 Subject: [PATCH 17/19] typo --- hivemind/optim/experimental/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py index a64ddac8e..a14b3e9c2 100644 --- a/hivemind/optim/experimental/optimizer.py +++ b/hivemind/optim/experimental/optimizer.py @@ -60,7 +60,7 @@ class Optimizer(torch.optim.Optimizer): may end up having different learning rates. To do so automatically, specify ``scheduler=...`` parameter below. :Note: hivemind.Optimizer uses the term ``epoch`` to describe intervals between synchronizations. One epoch - coresponds to processing certain number of training samples (`target_batch_size`) in total across all peers. + coresponds to processing certain number of training samples (``target_batch_size``) in total across all peers. Like in PyTorch LR Scheduler, **epoch does not necessarily correspond to a full pass over the training data.** At the end of epoch, peers perform synchronous actions such as averaging gradients for a global optimizer update, (or just averaging parameters if using local updates). This ensures that adding or removing peers does not affect From ba67c84775211eb49bf5fea48ce4fd4637dee086 Mon Sep 17 00:00:00 2001 From: Aleksandr Borzunov Date: Thu, 2 Dec 2021 16:09:53 +0300 Subject: [PATCH 18/19] review --- docs/user/quickstart.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/user/quickstart.md b/docs/user/quickstart.md index 5b6b927d1..876d26dcb 100644 --- a/docs/user/quickstart.md +++ b/docs/user/quickstart.md @@ -55,10 +55,10 @@ print("To join the training, use initial_peers =", [str(addr) for addr in dht.ge opt = hivemind.Optimizer( dht=dht, # use a DHT that is connected with other peers run_id='my_cifar_run', # unique identifier of this collaborative run + batch_size_per_step=32, # each call to opt.step adds this many samples towards the next epoch + target_batch_size=10000, # after peers collectively process this many samples, average weights and begin the next epoch optimizer=opt, # wrap the SGD optimizer defined above use_local_updates=True, # perform optimizer steps with local gradients, average parameters in background - batch_size_per_step=32, # each call to opt.step adds this many samples towards the next epoch - target_batch_size=10000, # move to next epoch after peers collectively process this many samples matchmaking_time=3.0, # when averaging parameters, gather peers in background for up to this many seconds averaging_timeout=10.0, # give up on averaging if not successful in this many seconds verbose=True # print logs incessently @@ -131,10 +131,10 @@ print("To join the training, use initial_peers =", [str(addr) for addr in dht.ge opt = hivemind.Optimizer( dht=dht, # use a DHT that is connected with other peers run_id='my_cifar_run', # unique identifier of this collaborative run + batch_size_per_step=32, # each call to opt.step adds this many samples towards the next epoch + target_batch_size=10000, # after peers collectively process this many samples, average weights and begin the next epoch optimizer=opt, # wrap the SGD optimizer defined above use_local_updates=True, # perform optimizer steps with local gradients, average parameters in background - 
batch_size_per_step=32, # each call to opt.step adds this many samples towards the next epoch
- target_batch_size=10000, # move to next epoch after all peers collectively process this many samples
 matchmaking_time=3.0, # when averaging parameters, gather peers in background for up to this many seconds
 averaging_timeout=10.0, # give up on averaging if not successful in this many seconds
 verbose=True # print logs incessantly
 )

From 7a202758ddd84819ea69d0a7ade9f26ee43c265c Mon Sep 17 00:00:00 2001
From: justheuristic 
Date: Thu, 2 Dec 2021 16:15:06 +0300
Subject: [PATCH 19/19] typo

---
 hivemind/optim/experimental/optimizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hivemind/optim/experimental/optimizer.py b/hivemind/optim/experimental/optimizer.py
index a14b3e9c2..1f937f595 100644
--- a/hivemind/optim/experimental/optimizer.py
+++ b/hivemind/optim/experimental/optimizer.py
@@ -59,13 +59,13 @@ class Optimizer(torch.optim.Optimizer):
 ``optimizer.local_epoch`` (and not the number of calls to opt.step). Otherwise, peers that joined training late
 may end up having different learning rates. To do so automatically, specify ``scheduler=...`` parameter below.

- :Note: hivemind.Optimizer uses the term ``epoch`` to describe intervals between synchronizations. One epoch
+ :What is an epoch?: Optimizer uses the term ``epoch`` to describe intervals between synchronizations. One epoch
 corresponds to processing a certain number of training samples (``target_batch_size``) in total across all peers.
 Like in PyTorch LR Scheduler, **epoch does not necessarily correspond to a full pass over the training data.**
 At the end of an epoch, peers perform synchronous actions such as averaging gradients for a global optimizer update,
- (or just averaging parameters if using local updates). This ensures that adding or removing peers does not affect
- per-epoch convergence. For instance, if the number of peers doubles, they will run all-reduce more frequently
- to adjust for faster training.
+ updating the learning rate scheduler or simply averaging parameters (if using local updates).
+ The purpose of this is to ensure that changing the number of peers does not require changing hyperparameters.
+ For instance, if the number of peers doubles, they will run all-reduce more frequently to adjust for faster training.
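Since the last few patches refine what an ``epoch`` means and repeat the advice to key any time-dependent logic on ``optimizer.local_epoch`` rather than on the number of local ``opt.step()`` calls, here is a brief sketch of that pattern. The curriculum thresholds and the ``sample_batch`` / ``compute_loss_on_batch`` helpers are illustrative placeholders (the latter borrowed from the docstring example), not part of these patches.

```python
def max_sequence_length(local_epoch: int) -> int:
    # Hypothetical curriculum: grow the sequence length as the swarm collectively makes progress.
    # Keyed on the synchronized epoch counter, so peers that join late follow the same schedule.
    if local_epoch < 10:
        return 128
    if local_epoch < 50:
        return 256
    return 512

while True:
    seq_len = max_sequence_length(opt.local_epoch)   # opt is a hivemind.Optimizer configured as above
    batch = sample_batch(max_length=seq_len)         # placeholder data helper
    loss = compute_loss_on_batch(model, batch)       # placeholder, as in the docstring example
    opt.zero_grad()
    loss.backward()
    opt.step()  # a new epoch begins only after peers jointly accumulate target_batch_size samples
```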