From 2fd7085889f15de7c98d1ea69434bddb91657abf Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Thu, 12 Jul 2018 15:05:42 -0700 Subject: [PATCH] documentation enhancement for optimizers (#11657) * update tutorial * add doc for clarification; * add docs for gluon blocks * Update row_sparse.md * Update row_sparse.md --- docs/tutorials/sparse/row_sparse.md | 11 +++++++---- docs/tutorials/sparse/train.md | 8 +++++--- example/sparse/linear_classification/train.py | 7 +++++-- python/mxnet/gluon/contrib/nn/basic_layers.py | 6 ++++++ python/mxnet/gluon/nn/basic_layers.py | 5 +++++ python/mxnet/optimizer.py | 2 +- src/operator/tensor/dot.cc | 8 ++++++++ src/operator/tensor/indexing_op.cc | 11 +++++++++-- 8 files changed, 46 insertions(+), 12 deletions(-) diff --git a/docs/tutorials/sparse/row_sparse.md b/docs/tutorials/sparse/row_sparse.md index 85d76f498d71..c4cab75df543 100644 --- a/docs/tutorials/sparse/row_sparse.md +++ b/docs/tutorials/sparse/row_sparse.md @@ -469,7 +469,7 @@ state = momentum * state + rescaled_grad weight = weight - state ``` -Meanwhile, the sparse update rule for SGD optimizer is: +However, with sparse gradient the SGD optimizer uses the following lazy update by default: ``` for row in grad.indices: @@ -478,6 +478,9 @@ for row in grad.indices: weight[row] = weight[row] - state[row] ``` +This means that the lazy update leads to different optimization results if `weight_decay` or `momentum` is non-zero. +To disable lazy update, please set `lazy_update` to be False when creating the optimizer. + ```python # Create weight @@ -531,8 +534,8 @@ sgd.update(0, weight, grad, momentum) -Note that both [mxnet.optimizer.SGD](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.SGD) -and [mxnet.optimizer.Adam](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.Adam) support sparse updates in MXNet. 
+Note that only [mxnet.optimizer.SGD](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.SGD), [mxnet.optimizer.Adam](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.Adam), and +[mxnet.optimizer.AdaGrad](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.AdaGrad) support sparse updates in MXNet. ## Advanced Topics @@ -541,7 +544,7 @@ and [mxnet.optimizer.Adam](https://mxnet.incubator.apache.org/api/python/optimiz By default, RowSparseNDArray operators are executed on CPU. In MXNet, GPU support for RowSparseNDArray is limited to a few sparse operators such as [sgd_update](https://mxnet.incubator.apache.org/api/python/ndarray/sparse.html#mxnet.ndarray.sparse.sgd_update), [dot](https://mxnet.incubator.apache.org/api/python/ndarray/sparse.html#mxnet.ndarray.sparse.dot) and -[SparseEmbedding](https://mxnet.incubator.apache.org/api/python/ndarray/contrib.html#mxnet.ndarray.contrib.SparseEmbedding). +[Embedding](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.Embedding). To create a RowSparseNDArray on gpu, we need to explicitly specify the context: diff --git a/docs/tutorials/sparse/train.md b/docs/tutorials/sparse/train.md index ce7020553c2d..7472fcd14ca3 100644 --- a/docs/tutorials/sparse/train.md +++ b/docs/tutorials/sparse/train.md @@ -190,7 +190,7 @@ fallback_log = fallback_exec.outputs[1] -### Inspecting Storage Types of the Symbol Graph (Work in Progress) +### Inspecting Storage Types of the Symbol Graph When the environment variable `MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING` is set to `1`, MXNet will log the storage type information of operators' inputs and outputs in the computation graph. 
For example, we can inspect the storage types of @@ -312,8 +312,10 @@ assert metric.get()[1] < 1, "Achieved MSE (%f) is larger than expected (1.0)" % -### Training the model with multiple machines +### Training the model with multiple machines or multiple devices -To train a sparse model with multiple machines, please refer to the example in [mxnet/example/sparse/](https://github.com/apache/incubator-mxnet/tree/master/example/sparse) +To train a sparse model with multiple machines, you need to call `prepare` before `forward` or `save_checkpoint`. +Please refer to the example in [mxnet/example/sparse/linear_classification](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/linear_classification) +for more details. diff --git a/example/sparse/linear_classification/train.py b/example/sparse/linear_classification/train.py index 4d60efbaf4f3..0a8acfd87bef 100644 --- a/example/sparse/linear_classification/train.py +++ b/example/sparse/linear_classification/train.py @@ -32,9 +32,9 @@ parser.add_argument('--kvstore', type=str, default=None, help='what kvstore to use', choices=["dist_sync", "dist_async", "local"]) -parser.add_argument('--optimizer', type=str, default='ftrl', +parser.add_argument('--optimizer', type=str, default='sgd', help='what optimizer to use', - choices=["ftrl", "sgd", "adam"]) + choices=["adagrad", "sgd", "adam"]) AVAZU = { 'train': 'avazu-app', @@ -129,6 +129,9 @@ def all_row_ids(data_batch): # evaluate metric on validation dataset score = mod.score(eval_data, ['nll_loss']) logging.info('epoch %d, eval nll = %s ' % (epoch, score[0][1])) + + # prepare the module weight with all row ids before making a checkpoint. 
+ mod.prepare(None, all_row_ids) mod.save_checkpoint("checkpoint", epoch) # reset the iterator for next pass of data train_data.reset() diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index 1edef1476ee3..843a02286594 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -118,6 +118,12 @@ class SparseEmbedding(Block): This SparseBlock is designed for distributed training with extremely large input dimension. Both weight and gradient w.r.t. weight are `RowSparseNDArray`. + Note: if `sparse_grad` is set to True, the gradient w.r.t. weight will be + sparse. Only a subset of optimizers support sparse gradients, including SGD, AdaGrad + and Adam. By default lazy update is turned on, which may perform differently + from standard updates. For more details, please check the Optimization API at: + https://mxnet.incubator.apache.org/api/python/optimization/optimization.html + Parameters ---------- input_dim : int diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index abde51b433af..ad69d4e9dd90 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -370,6 +370,11 @@ class Embedding(HybridBlock): r"""Turns non-negative integers (indexes/tokens) into dense vectors of fixed size. eg. [4, 20] -> [[0.25, 0.1], [0.6, -0.2]] + Note: if `sparse_grad` is set to True, the gradient w.r.t. weight will be + sparse. Only a subset of optimizers support sparse gradients, including SGD, AdaGrad + and Adam. By default lazy update is turned on, which may perform differently + from standard updates. 
For more details, please check the Optimization API at: + https://mxnet.incubator.apache.org/api/python/optimization/optimization.html Parameters ---------- diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index e73a45f74b04..f758af5f982c 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -550,7 +550,7 @@ def update_multi_precision(self, index, weight, grad, state): class Signum(Optimizer): """The Signum optimizer that takes the sign of gradient or momentum. - The optimizer updates the weight by: + The optimizer updates the weight by:: rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight state = momentum * state + (1-momentum)*rescaled_grad diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc index 556fd1fea56d..d45551d383b8 100644 --- a/src/operator/tensor/dot.cc +++ b/src/operator/tensor/dot.cc @@ -66,6 +66,14 @@ forward_stype option for output storage type. Implemented sparse operations incl If the combination of input storage types and forward_stype does not match any of the above patterns, ``dot`` will fallback and generate output with default storage. +.. Note:: + + If the storage type of the lhs is "csr", the storage type of gradient w.r.t. rhs will be + "row_sparse". Only a subset of optimizers support sparse gradients, including SGD, AdaGrad + and Adam. Note that by default lazy update is turned on, which may perform differently + from standard updates. 
For more details, please check the Optimization API at: + https://mxnet.incubator.apache.org/api/python/optimization/optimization.html + )doc" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(1) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index e5ba058fb25e..64c5d86cbd1c 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -226,8 +226,15 @@ Examples:: [ 10., 11., 12., 13., 14.]]] -The storage type of weight can be either row_sparse or default, while -the storage type of weight's grad depends on the value of "sparse_grad". +The storage type of weight can be either row_sparse or default. + +.. Note:: + + If "sparse_grad" is set to True, the storage type of gradient w.r.t. weights will be + "row_sparse". Only a subset of optimizers support sparse gradients, including SGD, AdaGrad + and Adam. Note that by default lazy update is turned on, which may perform differently + from standard updates. For more details, please check the Optimization API at: + https://mxnet.incubator.apache.org/api/python/optimization/optimization.html )code" ADD_FILELINE) .set_num_inputs(2)