From 92971b822dd0151aadba965c0c6b8b22cb82bf76 Mon Sep 17 00:00:00 2001
From: Neutron3529
Date: Thu, 18 Jun 2020 13:30:10 +0800
Subject: [PATCH] fix misbehavior of KLDivLoss (#18423)

* fix misbehavior of KLDivLoss

In the current version of KLDivLoss, the returned value does not match the
value computed by SoftmaxCrossEntropyLoss, and this discrepancy is not
documented. It appears to be caused by an incorrect reduction setting: the
loss takes the mean rather than the sum over the class axis. This patch fixes
the reduction so that `KLDivLoss` and `SoftmaxCrossEntropyLoss` return almost
the same value when `from_logits=False` and `sparse_label=False` are passed
to the respective losses. KLDivLoss now behaves exactly as its documentation
describes.

With the old mean reduction, the KLDivLoss result has to be rescaled by the
number of classes to match SoftmaxCrossEntropyLoss, as the snippet below
demonstrates:

```
import mxnet as mx

a = mx.nd.array([[-1, 1], [1, -1]])
b = mx.nd.array([1, 0]).one_hot(2)
TrueLoss = mx.gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
FalseLoss = mx.gluon.loss.KLDivLoss(from_logits=False)
c = TrueLoss(a, b)                 # cross entropy against one-hot labels
d = FalseLoss(a, b) * a.shape[-1]  # KL divergence rescaled by the number of classes
assert (c - d).abs().sum() == 0 and a.shape[-1] == 2
```

* update SDMLLoss

The comment in the current version of SDMLLoss says to `multiply for the
number of labels`, but the code actually multiplies by the batch size. After
this PR there is no need to multiply by either the batch size or the number
of labels.

* remove outdated comment

---
 python/mxnet/gluon/loss.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py
index 5dc91a584d2c..852a9a791d53 100644
--- a/python/mxnet/gluon/loss.py
+++ b/python/mxnet/gluon/loss.py
@@ -476,7 +476,7 @@ def hybrid_forward(self, F, pred, label, sample_weight=None):
             pred = F.log_softmax(pred, self._axis)
         loss = label * (F.log(label + 1e-12) - pred)
         loss = _apply_weighting(F, loss, self._weight, sample_weight)
-        return F.mean(loss, axis=self._batch_axis, exclude=True)
+        return F.sum(loss, axis=self._batch_axis, exclude=True)
 
 
 class CTCLoss(Loss):
@@ -1010,8 +1010,7 @@ def _compute_labels(self, F, batch_size):
         confident output distributions." arXiv preprint arXiv:1701.06548 (2017).
 
         """
-        # TODO: replace with mx.nd.eye(batch_size) with mxnet 1.2
-        gold = F.one_hot(F.arange(batch_size), batch_size)
+        gold = F.eye(batch_size)
         labels = gold * (1 - self.smoothing_parameter) + (1 - gold) * self.smoothing_parameter / (batch_size - 1)
         return labels
 
@@ -1039,7 +1038,9 @@ def _loss(self, F, x1, x2):
         distances = self._compute_distances(x1, x2)
         log_probabilities = F.log_softmax(-distances, axis=1)
         # multiply for the number of labels to obtain the correct loss (gluon kl_loss averages instead of sum)
-        return self.kl_loss(log_probabilities, labels.as_in_context(distances.context)) * batch_size
+        # PR#18423: "multiply for the number of labels" should multiply by x1.shape[1] rather than x1.shape[0].
+        # After PR#18423 there is no need to multiply at all.
+        return self.kl_loss(log_probabilities, labels.as_in_context(distances.context))
 
     def hybrid_forward(self, F, x1, x2):
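
A minimal post-patch sanity check, sketched here for reference and not part of
the patch above (the input tensors and the tolerance are illustrative): with
the sum reduction in place, `KLDivLoss(from_logits=False)` should match
`SoftmaxCrossEntropyLoss(sparse_label=False)` directly, without rescaling by
the number of classes, and SDMLLoss output no longer needs to be multiplied by
the batch size.

```
# Sanity check sketch, assuming the patch above is applied.
# The input tensors and the tolerance are illustrative, not part of the PR.
import mxnet as mx

pred = mx.nd.array([[-1, 1], [1, -1]])
label = mx.nd.array([1, 0]).one_hot(2)

sce = mx.gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
kld = mx.gluon.loss.KLDivLoss(from_logits=False)

# With the sum reduction, no rescaling by the number of classes is needed.
diff = (sce(pred, label) - kld(pred, label)).abs().sum().asscalar()
assert diff < 1e-6
```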