Filter grads #5669

Merged
merged 20 commits into from Apr 23, 2020
1 change: 1 addition & 0 deletions changelog/5669.improvement.rst
@@ -0,0 +1 @@
Remove regularization gradient for variables that don't have prediction gradient.
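
For intuition, here is a tiny standalone illustration of the filtering this entry describes. The tensor values are invented for the example and are not taken from Rasa; only the `tf.where` pattern mirrors the diff below.

import tensorflow as tf

pred_grad = tf.constant([0.5, 0.0, -0.2])
reg_grad = tf.constant([0.1, 0.1, 0.1])

# The regularization component is kept only where the prediction
# gradient component is positive; elsewhere it is zeroed out.
combined = pred_grad + tf.where(pred_grad > 0, reg_grad, tf.zeros_like(reg_grad))
# combined == [0.6, 0.0, -0.2]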
32 changes: 29 additions & 3 deletions rasa/utils/tensorflow/models.py
@@ -187,10 +187,36 @@ def train_on_batch(
    ) -> None:
        """Train on batch"""

-       with tf.GradientTape() as tape:
-           total_loss = self._total_batch_loss(batch_in)
+       # calculate supervision and regularization losses separately
+       with tf.GradientTape(persistent=True) as tape:
+           prediction_loss = self.batch_loss(batch_in)
+           regularization_loss = tf.math.add_n(self.losses)
+           total_loss = prediction_loss + regularization_loss

        self.total_loss.update_state(total_loss)

-       gradients = tape.gradient(total_loss, self.trainable_variables)
+       # calculate the gradients that come from supervision signal
+       prediction_gradients = tape.gradient(prediction_loss, self.trainable_variables)
+       # calculate the gradients that come from regularization
+       regularization_gradients = tape.gradient(
+           regularization_loss, self.trainable_variables
+       )
+       # delete gradient tape manually
+       # since it was created with `persistent=True` option
+       del tape
+
+       gradients = []
+       for pred_grad, reg_grad in zip(prediction_gradients, regularization_gradients):
+           if pred_grad is not None and reg_grad is not None:
+               # remove regularization gradient for variables
+               # that don't have prediction gradient
+               gradients.append(
+                   pred_grad
+                   + tf.where(pred_grad > 0, reg_grad, tf.zeros_like(reg_grad))
+               )
+           else:
+               gradients.append(pred_grad)
+
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

def build_for_predict(
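
To see the new train_on_batch logic in isolation, here is a minimal, self-contained sketch of the same pattern outside of Rasa. The Toy model, the data, the loss, and the Adam optimizer are all invented for illustration; only the persistent-tape and `tf.where` filtering logic mirrors the diff above.

import tensorflow as tf

class Toy(tf.keras.Model):
    def __init__(self):
        super().__init__()
        # L2 regularization on the kernel so that `self.losses` is non-empty
        self.dense = tf.keras.layers.Dense(
            2, kernel_regularizer=tf.keras.regularizers.l2(0.01)
        )

    def call(self, x):
        return self.dense(x)

model = Toy()
x = tf.random.normal((4, 3))
y = tf.zeros((4, 2))

# a persistent tape allows two separate `tape.gradient` calls
with tf.GradientTape(persistent=True) as tape:
    prediction_loss = tf.reduce_mean(tf.square(model(x) - y))
    regularization_loss = tf.math.add_n(model.losses)

prediction_gradients = tape.gradient(prediction_loss, model.trainable_variables)
regularization_gradients = tape.gradient(
    regularization_loss, model.trainable_variables
)
del tape  # a persistent tape holds resources until it is deleted

gradients = []
for pred_grad, reg_grad in zip(prediction_gradients, regularization_gradients):
    if pred_grad is not None and reg_grad is not None:
        # keep the regularization gradient only where a positive
        # prediction gradient is present, as in the PR
        gradients.append(
            pred_grad + tf.where(pred_grad > 0, reg_grad, tf.zeros_like(reg_grad))
        )
    else:
        # e.g. the bias has no regularizer here, so reg_grad is None
        gradients.append(pred_grad)

optimizer = tf.keras.optimizers.Adam()
optimizer.apply_gradients(zip(gradients, model.trainable_variables))

Note that the `pred_grad > 0` condition is elementwise, so the regularization gradient is dropped not only for components with no prediction gradient but also for components whose prediction gradient is zero or negative; this matches the code in the diff.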