From c30b75a9d8ed464c77d4a911c9da6132aacb59f0 Mon Sep 17 00:00:00 2001 From: smokrow Date: Tue, 15 Jan 2019 01:35:46 +0100 Subject: [PATCH 01/26] edited buildfile for normalizations. Implemented GroupNorm,InstanceNorm and LayerNorm and first testcase --- tensorflow_addons/layers/BUILD | 15 +- .../layers/python/normalizations.py | 284 ++++++++++++++++++ .../layers/python/normalizations_test.py | 69 +++++ 3 files changed, 367 insertions(+), 1 deletion(-) create mode 100644 tensorflow_addons/layers/python/normalizations.py create mode 100644 tensorflow_addons/layers/python/normalizations_test.py diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 1d5c07d687..208c1e1312 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -8,6 +8,7 @@ py_library( "__init__.py", "python/__init__.py", "python/wrappers.py", + "python/normalizations.py" ]), srcs_version = "PY2AND3", ) @@ -22,4 +23,16 @@ py_test( ":layers_py", ], srcs_version = "PY2AND3", -) \ No newline at end of file +) + +py_test( + name = "layers_normalizations_py_test", + srcs = [ + "python/normalizations_test.py", + ], + main = "python/normalizations_test.py", + deps = [ + ":layers_py", + ], + srcs_version = "PY2AND3", +) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py new file mode 100644 index 0000000000..7bc85054c3 --- /dev/null +++ b/tensorflow_addons/layers/python/normalizations.py @@ -0,0 +1,284 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# Orginal implementation from keras_contrib/layer/normalization + +from tensorflow.keras.layers import Layer, InputSpec +from tensorflow.keras import initializers, regularizers, constraints +from tensorflow.keras import backend as K +from tensorflow.keras.utils import get_custom_objects +from tensorflow.python.ops import nn + +class GroupNorm(Layer): + """Group normalization layer. + Group Normalization divides the channels into groups and computes + within each group + the mean and variance for normalization. + Group Normalization's computation is independent + of batch sizes, and its accuracy is stable in a wide range of batch sizes. + Relation to Layer Normalization: + If the number of groups is set to 1, then this operation becomes identical to + Layer Normalization. + Relation to Instance Normalization: + If the number of groups is set to the + input dimension (number of groups is equal + to number of channels), then this operation becomes + identical to Instance Normalization. + # Arguments + groups: Integer, the number of groups for Group Normalization. + Can be in the range [1, N] where N is the input dimension. + The input dimension must be divisible by the number of groups. + axis: Integer, the axis that should be normalized + (typically the features axis). 
+ For instance, after a `Conv2D` layer with + `data_format="channels_first"`, + set `axis=1` in `BatchNormalization`. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, add offset of `beta` to normalized tensor. + If False, `beta` is ignored. + scale: If True, multiply by `gamma`. + If False, `gamma` is not used. + When the next layer is linear (also e.g. `nn.relu`), + this can be disabled since the scaling + will be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + beta_constraint: Optional constraint for the beta weight. + gamma_constraint: Optional constraint for the gamma weight. + # Input shape + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + # Output shape + Same shape as input. + # References + - [Group Normalization](https://arxiv.org/abs/1803.08494) + """ + + def __init__(self, + layer, + groups=32, + axis=-1, + epsilon=1e-5, + center=True, + scale=True, + beta_initializer='zeros', + gamma_initializer='ones', + beta_regularizer=None, + gamma_regularizer=None, + beta_constraint=None, + gamma_constraint=None, + **kwargs): + super(GroupNorm, self).__init__(layer,**kwargs) + self.supports_masking = True + self.groups = groups + self.axis = axis + self.epsilon = epsilon + self.center = center + self.scale = scale + self.beta_initializer = initializers.get(beta_initializer) + self.gamma_initializer = initializers.get(gamma_initializer) + self.beta_regularizer = regularizers.get(beta_regularizer) + self.gamma_regularizer = regularizers.get(gamma_regularizer) + self.beta_constraint = constraints.get(beta_constraint) + self.gamma_constraint = constraints.get(gamma_constraint) + + def build(self, input_shape): + dim = input_shape[self.axis] + + if dim is None: + raise ValueError('Axis ' + str(self.axis) + ' of ' + 'input tensor should have a defined dimension ' + 'but the layer received an input with shape ' + + str(input_shape) + '.') + if self.groups==-1: + self.groups=dim + + if dim < self.groups: + raise ValueError('Number of groups (' + str(self.groups) + ') cannot be ' + 'more than the number of channels (' + + str(dim) + ').') + + if dim % self.groups != 0: + raise ValueError('Number of groups (' + str(self.groups) + ') must be a ' + 'multiple of the number of channels (' + + str(dim) + ').') + + self.input_spec = InputSpec(ndim=len(input_shape), + axes={self.axis: dim}) + shape = (dim,) + + if self.scale: + self.gamma = self.add_weight(shape=shape, + name='gamma', + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + constraint=self.gamma_constraint) + else: + self.gamma = None + if self.center: + self.beta = self.add_weight(shape=shape, + name='beta', + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + constraint=self.beta_constraint) + else: + self.beta = None + self.built = True + + def call(self, inputs, **kwargs): + input_shape = K.int_shape(inputs) + tensor_input_shape = K.shape(inputs) + + # Prepare broadcasting shape. 
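The grouping-and-moments logic in this method is easier to follow as a standalone sketch. Below is a minimal NumPy reference of the standard GroupNorm computation (illustrative only, not part of the patch; it assumes a channels-last 4D input and is not necessarily bit-identical to the reshaping this layer performs):

```python
import numpy as np

def group_norm_reference(x, groups, gamma, beta, epsilon=1e-5):
    """Standard group normalization for a (N, H, W, C) array, channels-last."""
    n, h, w, c = x.shape
    assert c % groups == 0, "channels must be divisible by groups"
    # Split the channel axis into (groups, C // groups).
    xg = x.reshape(n, h, w, groups, c // groups)
    # Moments are taken per sample and per group, over (H, W, C // groups).
    mean = xg.mean(axis=(1, 2, 4), keepdims=True)
    var = xg.var(axis=(1, 2, 4), keepdims=True)
    xg = (xg - mean) / np.sqrt(var + epsilon)
    # Merge the groups back and apply the per-channel affine parameters.
    return xg.reshape(n, h, w, c) * gamma + beta

x = np.random.randn(2, 4, 4, 8).astype("float32")
out = group_norm_reference(x, groups=4,
                           gamma=np.ones(8, "float32"),
                           beta=np.zeros(8, "float32"))
print(out.shape)  # (2, 4, 4, 8); groups=1 -> LayerNorm case, groups=8 -> InstanceNorm case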
+ reduction_axes = list(range(len(input_shape))) + del reduction_axes[self.axis] + broadcast_shape = [1] * len(input_shape) + broadcast_shape[self.axis] = input_shape[self.axis] // self.groups + broadcast_shape.insert(1, self.groups) + + reshape_group_shape = K.shape(inputs) + group_axes = [reshape_group_shape[i] for i in range(len(input_shape))] + group_axes[self.axis] = input_shape[self.axis] // self.groups + group_axes.insert(1, self.groups) + + # reshape inputs to new group shape + group_shape = [group_axes[0], self.groups] + group_axes[2:] + group_shape = K.stack(group_shape) + inputs = K.reshape(inputs, group_shape) + + group_reduction_axes = list(range(len(group_axes))) + mean, variance = nn.moments(inputs, group_reduction_axes[2:], + keep_dims=True) + inputs = (inputs - mean) / (K.sqrt(variance + self.epsilon)) + + # prepare broadcast shape + inputs = K.reshape(inputs, group_shape) + + outputs = inputs + + # In this case we must explicitly broadcast all parameters. + if self.scale: + broadcast_gamma = K.reshape(self.gamma, broadcast_shape) + outputs = outputs * broadcast_gamma + + if self.center: + broadcast_beta = K.reshape(self.beta, broadcast_shape) + outputs = outputs + broadcast_beta + + # finally we reshape the output back to the input shape + outputs = K.reshape(outputs, tensor_input_shape) + + return outputs + + def get_config(self): + config = { + 'groups': self.groups, + 'axis': self.axis, + 'epsilon': self.epsilon, + 'center': self.center, + 'scale': self.scale, + 'beta_initializer': initializers.serialize(self.beta_initializer), + 'gamma_initializer': initializers.serialize(self.gamma_initializer), + 'beta_regularizer': regularizers.serialize(self.beta_regularizer), + 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), + 'beta_constraint': constraints.serialize(self.beta_constraint), + 'gamma_constraint': constraints.serialize(self.gamma_constraint) + } + base_config = super(GroupNormalization, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def compute_output_shape(self, input_shape): + return input_shape + +class LayerNorm(GroupNorm): + """Layer normalization layer. + Layer Normalization is an specific case of ```GroupNormalization```since it + normalizes all features of a layer. The Groupsize is 1. + Layer Normalization's computation is independent + of batch sizes, and its accuracy is stable in a wide range of batch sizes. + # Arguments + axis: Integer, the axis that should be normalized + (typically the features axis). + For instance, after a `Conv2D` layer with + `data_format="channels_first"`, + set `axis=1` in `BatchNormalization`. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, add offset of `beta` to normalized tensor. + If False, `beta` is ignored. + scale: If True, multiply by `gamma`. + If False, `gamma` is not used. + When the next layer is linear (also e.g. `nn.relu`), + this can be disabled since the scaling + will be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + beta_constraint: Optional constraint for the beta weight. + gamma_constraint: Optional constraint for the gamma weight. + # Input shape + Arbitrary. 
Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + # Output shape + Same shape as input. + # References + - [Layer Normalization](https://arxiv.org/abs/1607.06450) + """ + def __init__(self,**kwargs): + kwargs["groups"]=1 + super(LayerNorm,self).__init__(**kwargs) + +class InstanceNorm(GroupNorm): + """Instance normalization layer. + Instance Normalization is an specific case of ```GroupNormalization```since it + normalizes all features of one channel. The Groupsize is equal to the channel size. + Instance Normalization's computation is independent + of batch sizes, and its accuracy is stable in a wide range of batch sizes. + # Arguments + axis: Integer, the axis that should be normalized + (typically the features axis). + For instance, after a `Conv2D` layer with + `data_format="channels_first"`, + set `axis=1` in `BatchNormalization`. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, add offset of `beta` to normalized tensor. + If False, `beta` is ignored. + scale: If True, multiply by `gamma`. + If False, `gamma` is not used. + When the next layer is linear (also e.g. `nn.relu`), + this can be disabled since the scaling + will be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + beta_constraint: Optional constraint for the beta weight. + gamma_constraint: Optional constraint for the gamma weight. + # Input shape + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + # Output shape + Same shape as input. + # References + - [Layer Normalization](https://arxiv.org/abs/1607.06450) + """ + def __init__(self,**kwargs): + kwargs["groups"]=-1 + super(InstanceNorm,self).__init__(**kwargs) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py new file mode 100644 index 0000000000..d7b8d913d1 --- /dev/null +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -0,0 +1,69 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +from tensorflow_addons.layers.python.normalizations import GroupNorm,LayerNorm,InstanceNorm +import numpy as np +import tensorflow as tf +from tensorflow.python import keras as keras +from tensorflow.python.training.rmsprop import RMSPropOptimizer + +from tensorflow.python.platform import test +from tensorflow.python.framework import test_util as tf_test_util + + +class NormTest(test.TestCase): + + @tf_test_util.run_all_in_graph_and_eager_modes + def test_groupnorm_flat(self): + # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm + groups=[-1,16,1] + for i in groups: + + model = keras.models.Sequential() + model.add(GroupNorm( + keras.layers.Dense(32), input_shape=(32,),groups=i)) + + model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') + model.fit( + np.random.random((10,32)), + np.random.random((10,32)), + epochs=1, + batch_size=10) + self.assertTrue(hasattr(model.layers[0], 'gamma')) + self.assertTrue(hasattr(model.layers[0], 'beta')) + + @tf_test_util.run_all_in_graph_and_eager_modes + def test_groupnorm_conv(self): + # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm + groups=[1,5,-1] + for i in groups: + + model = keras.models.Sequential() + model.add(GroupNorm( + keras.layers.Conv2D(5, (3, 10), padding='same'), + input_shape=(3,10),groups=i)) + + model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') + model.fit( + np.random.random((10, 3, 10)), + np.random.random((10, 3, 10)), + epochs=1, + batch_size=10) + self.assertTrue(hasattr(model.layers[0], 'gamma')) + self.assertTrue(hasattr(model.layers[0], 'beta')) + + +if __name__ == "__main__": + test.main() From 0e7674badbafa16e5653065f4e2597ef9d440d0f Mon Sep 17 00:00:00 2001 From: smokrow Date: Thu, 17 Jan 2019 16:51:46 +0100 Subject: [PATCH 02/26] Resolved Comments --- tensorflow_addons/layers/python/normalizations.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 7bc85054c3..14c37c4ff7 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -18,10 +18,9 @@ from tensorflow.keras.layers import Layer, InputSpec from tensorflow.keras import initializers, regularizers, constraints from tensorflow.keras import backend as K -from tensorflow.keras.utils import get_custom_objects from tensorflow.python.ops import nn -class GroupNorm(Layer): +class GroupNormalization(Layer): """Group normalization layer. Group Normalization divides the channels into groups and computes within each group @@ -83,7 +82,7 @@ def __init__(self, beta_constraint=None, gamma_constraint=None, **kwargs): - super(GroupNorm, self).__init__(layer,**kwargs) + super(GroupNormalization, self).__init__(layer,**kwargs) self.supports_masking = True self.groups = groups self.axis = axis @@ -140,7 +139,7 @@ def build(self, input_shape): self.beta = None self.built = True - def call(self, inputs, **kwargs): + def call(self, inputs): input_shape = K.int_shape(inputs) tensor_input_shape = K.shape(inputs) @@ -205,7 +204,7 @@ def get_config(self): def compute_output_shape(self, input_shape): return input_shape -class LayerNorm(GroupNorm): +class LayerNormalization(GroupNormalization): """Layer normalization layer. Layer Normalization is an specific case of ```GroupNormalization```since it normalizes all features of a layer. The Groupsize is 1. 
@@ -242,9 +241,9 @@ class LayerNorm(GroupNorm): """ def __init__(self,**kwargs): kwargs["groups"]=1 - super(LayerNorm,self).__init__(**kwargs) + super(LayerNormalization,self).__init__(**kwargs) -class InstanceNorm(GroupNorm): +class InstanceNormalization(GroupNormalization): """Instance normalization layer. Instance Normalization is an specific case of ```GroupNormalization```since it normalizes all features of one channel. The Groupsize is equal to the channel size. @@ -281,4 +280,4 @@ class InstanceNorm(GroupNorm): """ def __init__(self,**kwargs): kwargs["groups"]=-1 - super(InstanceNorm,self).__init__(**kwargs) + super(InstanceNormalization,self).__init__(**kwargs) From 65a5495c41b1467b5cc6cfc5347073ec080362fe Mon Sep 17 00:00:00 2001 From: smokrow Date: Sat, 9 Feb 2019 12:16:35 +0100 Subject: [PATCH 03/26] found bug in normalizations init --- .../layers/python/normalizations.py | 3 +- .../layers/python/normalizations_test.py | 225 ++++++++++++++++-- 2 files changed, 210 insertions(+), 18 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 14c37c4ff7..52251557e8 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -69,7 +69,6 @@ class GroupNormalization(Layer): """ def __init__(self, - layer, groups=32, axis=-1, epsilon=1e-5, @@ -82,7 +81,7 @@ def __init__(self, beta_constraint=None, gamma_constraint=None, **kwargs): - super(GroupNormalization, self).__init__(layer,**kwargs) + super(GroupNormalization, self).__init__(**kwargs) self.supports_masking = True self.groups = groups self.axis = axis diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index d7b8d913d1..57e9f6cfec 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================= -from tensorflow_addons.layers.python.normalizations import GroupNorm,LayerNorm,InstanceNorm +from tensorflow_addons.layers.python.normalizations import GroupNormalization,LayerNormalization,InstanceNormalization import numpy as np import tensorflow as tf from tensorflow.python import keras as keras @@ -23,17 +23,32 @@ from tensorflow.python.framework import test_util as tf_test_util -class NormTest(test.TestCase): +class normalization_test(test.TestCase): + + @tf_test_util.run_all_in_graph_and_eager_modes + def test_weights(self): + layer = GroupNormalization(groups=1,scale=False, center=False) + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 0) + self.assertEqual(len(layer.weights), 0) + + layer = keras.layers.LayerNormalization() + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights), 2) + self.assertEqual(len(layer.weights), 2) + + + @tf_test_util.run_all_in_graph_and_eager_modes def test_groupnorm_flat(self): # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm groups=[-1,16,1] for i in groups: - - model = keras.models.Sequential() - model.add(GroupNorm( - keras.layers.Dense(32), input_shape=(32,),groups=i)) + model=keras.models.Sequential() + model.add(GroupNormalization( + input_shape=(32,),groups=i)) + model.add(keras.layers.Dense(32)) model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') model.fit( @@ -44,26 +59,204 @@ def test_groupnorm_flat(self): self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_all_in_graph_and_eager_modes def test_groupnorm_conv(self): # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm - groups=[1,5,-1] + #groups=[1,5,-1] + groups=[1] for i in groups: model = keras.models.Sequential() - model.add(GroupNorm( - keras.layers.Conv2D(5, (3, 10), padding='same'), - input_shape=(3,10),groups=i)) + model.add(GroupNormalization( + input_shape=(20,20,3,),groups=i)) + + model.add(keras.layers.Conv2D(5, (1, 1), padding='same')) model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') - model.fit( - np.random.random((10, 3, 10)), - np.random.random((10, 3, 10)), - epochs=1, - batch_size=10) + model.fit(np.random.random((10,20, 20, 3))) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) + """def testUnknownShape(self): + inputs = array_ops.placeholder(dtypes.float32) + with self.assertRaisesRegexp(ValueError, 'undefined rank'): + GroupNormalization(inputs) + LayerNormaliztion(inputs) + InstanceNormalization(inputs)""" +""" +class LayerNormalizationTest(keras_parameterized.TestCase): + + + @tf_test_util.run_in_graph_and_eager_modes + def test_layernorm_regularization(self): + layer = keras.layers.LayerNormalization( + gamma_regularizer='l1', beta_regularizer='l1') + layer.build((None, 3, 4)) + self.assertEqual(len(layer.losses), 2) + max_norm = keras.constraints.max_norm + layer = keras.layers.LayerNormalization( + gamma_constraint=max_norm, beta_constraint=max_norm) + layer.build((None, 3, 4)) + self.assertEqual(layer.gamma.constraint, max_norm) + self.assertEqual(layer.beta.constraint, max_norm) + + @keras_parameterized.run_all_keras_modes + def test_layernorm_convnet(self): + if test.is_gpu_available(cuda_only=True): + with self.session(use_gpu=True): + model = keras.models.Sequential() + norm = keras.layers.LayerNormalization(input_shape=(3, 4, 4)) + model.add(norm) + 
model.compile(loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + run_eagerly=testing_utils.should_run_eagerly()) + + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4)) + model.fit(x, x, epochs=4, verbose=0) + out = model.predict(x) + out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1)) + out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1)) + + np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1) + np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1) + + @keras_parameterized.run_all_keras_modes + def test_layernorm_convnet_channel_last(self): + model = keras.models.Sequential() + norm = keras.layers.LayerNormalization(input_shape=(4, 4, 3)) + model.add(norm) + model.compile(loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + run_eagerly=testing_utils.should_run_eagerly()) + + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3)) + model.fit(x, x, epochs=4, verbose=0) + out = model.predict(x) + out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3)) + out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3)) + + np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1) + np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1) + + @keras_parameterized.run_all_keras_modes + def test_layernorm_correctness(self): + _run_layernorm_correctness_test( + normalization.LayerNormalization, dtype='float32') + + @keras_parameterized.run_all_keras_modes + def test_layernorm_mixed_precision(self): + _run_layernorm_correctness_test( + normalization.LayerNormalization, dtype='float16') + + def doOutputTest(self, + input_shape, + tol=1e-5, + norm_axis=None, + params_axis=-1, + dtype=None): + ndim = len(input_shape) + if norm_axis is None: + moments_axis = range(1, ndim) + elif isinstance(norm_axis, int): + if norm_axis < 0: + moments_axis = [norm_axis + ndim] + else: + moments_axis = [norm_axis] + else: + moments_axis = [] + for dim in norm_axis: + if dim < 0: + dim = dim + ndim + moments_axis.append(dim) + + moments_axis = tuple(moments_axis) + expected_shape = [] + for i in range(ndim): + if i not in moments_axis: + expected_shape.append(input_shape[i]) + + expected_mean = np.zeros(expected_shape) + expected_var = np.ones(expected_shape) + for mu in [0.0, 1e2]: + for sigma in [1.0, 0.1]: + inputs = np.random.randn(*input_shape) * sigma + mu + inputs_t = constant_op.constant(inputs, shape=input_shape) + layer = normalization.LayerNormalization( + norm_axis=norm_axis, params_axis=params_axis, dtype=dtype) + outputs = layer(inputs_t) + beta = layer.beta + gamma = layer.gamma + for weight in layer.weights: + self.evaluate(weight.initializer) + outputs = self.evaluate(outputs) + beta = self.evaluate(beta) + gamma = self.evaluate(gamma) + + # The mean and variance of the output should be close to 0 and 1 + # respectively. 
+ + # Make sure that there are no NaNs + self.assertFalse(np.isnan(outputs).any()) + mean = np.mean(outputs, axis=moments_axis) + var = np.var(outputs, axis=moments_axis) + # Layer-norm implemented in numpy + eps = 1e-12 + expected_out = ( + (gamma * (inputs - np.mean( + inputs, axis=moments_axis, keepdims=True)) / + np.sqrt(eps + np.var( + inputs, axis=moments_axis, keepdims=True))) + beta) + self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol) + self.assertAllClose(expected_var, var, atol=tol) + # The full computation gets a bigger tolerance + self.assertAllClose(expected_out, outputs, atol=5 * tol) + + @tf_test_util.run_in_graph_and_eager_modes + def testOutput2DInput(self): + self.doOutputTest((10, 300)) + self.doOutputTest((10, 300), norm_axis=[0]) + self.doOutputTest((10, 300), params_axis=[0, 1]) + + @tf_test_util.run_in_graph_and_eager_modes + def testOutput2DInputDegenerateNormAxis(self): + with self.assertRaisesRegexp(ValueError, r'Invalid axis: 2'): + self.doOutputTest((10, 300), norm_axis=2) + + @tf_test_util.run_in_graph_and_eager_modes + def testOutput4DInput(self): + self.doOutputTest((100, 10, 10, 3)) + + @tf_test_util.run_in_graph_and_eager_modes + def testOutput4DInputNormOnInnermostAxis(self): + # Equivalent tests + shape = (100, 10, 10, 3) + self.doOutputTest( + shape, norm_axis=list(range(3, len(shape))), tol=1e-4, dtype='float64') + self.doOutputTest(shape, norm_axis=-1, tol=1e-4, dtype='float64') + + @tf_test_util.run_in_graph_and_eager_modes + def testOutputSmallInput(self): + self.doOutputTest((10, 10, 10, 30)) + + @tf_test_util.run_in_graph_and_eager_modes + def testOutputSmallInputNormOnInnermostAxis(self): + self.doOutputTest((10, 10, 10, 30), norm_axis=3) + + @tf_test_util.run_in_graph_and_eager_modes + def testOutputSmallInputNormOnMixedAxes(self): + self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3]) + self.doOutputTest((10, 10, 10, 30), params_axis=[-2, -1]) + self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3], + params_axis=[-3, -2, -1]) + + @tf_test_util.run_in_graph_and_eager_modes + def testOutputBigInput(self): + self.doOutputTest((1, 100, 100, 1)) + self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2]) + self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2], + params_axis=[-2, -1]) +""" if __name__ == "__main__": test.main() From 892110cdbe7f06638c7dd14bce5b8edd48e35be7 Mon Sep 17 00:00:00 2001 From: Moritz Date: Sun, 10 Feb 2019 20:54:26 +0100 Subject: [PATCH 04/26] minor changes --- tensorflow_addons/layers/python/normalizations_test.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index 57e9f6cfec..ed5f831955 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -42,7 +42,7 @@ def test_weights(self): @tf_test_util.run_all_in_graph_and_eager_modes def test_groupnorm_flat(self): - # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm + # Testing for 1 == LayerNorm, 16 == GroupNorm, -1 == InstanceNorm groups=[-1,16,1] for i in groups: model=keras.models.Sequential() @@ -59,6 +59,7 @@ def test_groupnorm_flat(self): self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) + @tf_test_util.run_all_in_graph_and_eager_modes def test_groupnorm_conv(self): # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm #groups=[1,5,-1] @@ -74,14 +75,7 @@ def 
test_groupnorm_conv(self): model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') model.fit(np.random.random((10,20, 20, 3))) self.assertTrue(hasattr(model.layers[0], 'gamma')) - self.assertTrue(hasattr(model.layers[0], 'beta')) - """def testUnknownShape(self): - inputs = array_ops.placeholder(dtypes.float32) - with self.assertRaisesRegexp(ValueError, 'undefined rank'): - GroupNormalization(inputs) - LayerNormaliztion(inputs) - InstanceNormalization(inputs)""" """ class LayerNormalizationTest(keras_parameterized.TestCase): From 57c60a7c9c3c3471a21827be1749c9b91ac8db90 Mon Sep 17 00:00:00 2001 From: Moritz Date: Sun, 10 Feb 2019 20:56:17 +0100 Subject: [PATCH 05/26] Merge remote-tracking branch 'upstream/master' into dev/tests --- BUILD | 2 +- tensorflow_addons/layers/BUILD | 31 +- tensorflow_addons/layers/__init__.py | 4 +- tensorflow_addons/layers/python/maxout.py | 98 +++++ .../layers/python/maxout_test.py | 71 ++++ tensorflow_addons/layers/python/poincare.py | 77 ++++ .../layers/python/poincare_test.py | 87 +++++ tensorflow_addons/layers/python/wrappers.py | 20 +- .../layers/python/wrappers_test.py | 12 +- tensorflow_addons/optimizers/BUILD | 24 ++ .../optimizers/python/lazy_adam_optimizer.py | 81 ++++ .../python/lazy_adam_optimizer_test.py | 348 ++++++++++++++++++ .../text/cc/kernels/skip_gram_kernels.cc | 10 +- .../text/python/skip_gram_ops_test.py | 16 +- 14 files changed, 847 insertions(+), 34 deletions(-) create mode 100644 tensorflow_addons/layers/python/maxout.py create mode 100644 tensorflow_addons/layers/python/maxout_test.py create mode 100644 tensorflow_addons/layers/python/poincare.py create mode 100644 tensorflow_addons/layers/python/poincare_test.py create mode 100644 tensorflow_addons/optimizers/python/lazy_adam_optimizer.py create mode 100644 tensorflow_addons/optimizers/python/lazy_adam_optimizer_test.py diff --git a/BUILD b/BUILD index 09d2a8a26a..01d63a8ac8 100644 --- a/BUILD +++ b/BUILD @@ -6,7 +6,7 @@ sh_binary( "MANIFEST.in", "setup.py", "tensorflow_addons/__init__.py", - "//tensorflow_addons/layers:layers_py", + "//tensorflow_addons/layers:layers_py", "//tensorflow_addons/text:text_py", ], ) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 208c1e1312..1b1ae10b9d 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -4,9 +4,11 @@ package(default_visibility = ["//visibility:public"]) py_library( name = "layers_py", - srcs = ([ + srcs = [ "__init__.py", "python/__init__.py", + "python/maxout.py", + "python/poincare.py", "python/wrappers.py", "python/normalizations.py" ]), @@ -19,9 +21,32 @@ py_test( "python/wrappers_test.py", ], main = "python/wrappers_test.py", + srcs_version = "PY2AND3", deps = [ - ":layers_py", - ], + ":layers_py", + ], +) + +py_test( + name = "maxout_py_test", + size = "small", + srcs = [ + "python/maxout_test.py", + ], + main = "python/maxout_test.py", + srcs_version = "PY2AND3", + deps = [ + ":layers_py", + ], +) + +py_test( + name = "poincare_py_test", + size = "small", + srcs = [ + "python/poincare_test.py", + ], + main = "python/poincare_test.py", srcs_version = "PY2AND3", ) diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index de8f5c2d2c..09a236c8c9 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -19,5 +19,5 @@ from __future__ import division from __future__ import print_function -# Weight Normalization Wrapper -from tensorflow_addons.layers.python.wrappers import WeightNorm +from 
tensorflow_addons.layers.python.maxout import Maxout +from tensorflow_addons.layers.python.wrappers import WeightNormalization diff --git a/tensorflow_addons/layers/python/maxout.py b/tensorflow_addons/layers/python/maxout.py new file mode 100644 index 0000000000..0beb2ae24d --- /dev/null +++ b/tensorflow_addons/layers/python/maxout.py @@ -0,0 +1,98 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementing Maxout layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.utils import generic_utils +from tensorflow.python.keras.engine.base_layer import Layer +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + + +class Maxout(Layer): + """Applies Maxout to the input. + + "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron + Courville, Yoshua Bengio. https://arxiv.org/abs/1302.4389 + + Usually the operation is performed in the filter/channel dimension. This can + also be used after Dense layers to reduce number of features. + + Arguments: + num_units: Specifies how many features will remain after maxout + in the `axis` dimension (usually channel). + This must be a factor of number of features. + axis: The dimension where max pooling will be performed. Default is the + last dimension. + + Input shape: + nD tensor with shape: `(batch_size, ..., axis_dim, ...)`. + + Output shape: + nD tensor with shape: `(batch_size, ..., num_units, ...)`. 
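As a concrete illustration of the shapes above (a toy sketch, not part of the patch), the op splits the `axis` dimension into `num_units` retained features and keeps one maximum per retained feature, mirroring the reshape performed in `call` below:

```python
import numpy as np

x = np.array([[1., 5., 2., 7., 0., 3.]])     # shape (1, 6)
num_units = 2
k = x.shape[-1] // num_units                 # 3 candidate features per kept unit
candidates = x.reshape(1, k, num_units)      # (1, 3, 2), same expand_shape as the layer builds
print(candidates.max(axis=1))                # [[2. 7.]] -> shape (1, num_units)
```

So a 6-feature input is reduced to 2 features, e.g. after a `Dense(6)` layer.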
+ """ + + def __init__(self, num_units, axis=-1, **kwargs): + super(Maxout, self).__init__(**kwargs) + self.num_units = num_units + self.axis = axis + + def call(self, inputs): + inputs = ops.convert_to_tensor(inputs) + shape = inputs.get_shape().as_list() + # Dealing with batches with arbitrary sizes + for i in range(len(shape)): + if shape[i] is None: + shape[i] = array_ops.shape(inputs)[i] + + num_channels = shape[self.axis] + if (not isinstance(num_channels, ops.Tensor) + and num_channels % self.num_units): + raise ValueError('number of features({}) is not ' + 'a multiple of num_units({})'.format( + num_channels, self.num_units)) + + if self.axis < 0: + axis = self.axis + len(shape) + else: + axis = self.axis + assert axis >= 0, 'Find invalid axis: {}'.format(self.axis) + + expand_shape = shape[:] + expand_shape[axis] = self.num_units + k = num_channels // self.num_units + expand_shape.insert(axis, k) + + outputs = math_ops.reduce_max( + array_ops.reshape(inputs, expand_shape), axis, keepdims=False) + return outputs + + def compute_output_shape(self, input_shape): + input_shape = tensor_shape.TensorShape(input_shape).as_list() + input_shape[self.axis] = self.num_units + return tensor_shape.TensorShape(input_shape) + + def get_config(self): + config = {'num_units': self.num_units, 'axis': self.axis} + base_config = super(Maxout, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +generic_utils._GLOBAL_CUSTOM_OBJECTS['Maxout'] = Maxout diff --git a/tensorflow_addons/layers/python/maxout_test.py b/tensorflow_addons/layers/python/maxout_test.py new file mode 100644 index 0000000000..22e381f8c2 --- /dev/null +++ b/tensorflow_addons/layers/python/maxout_test.py @@ -0,0 +1,71 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Maxout layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.keras import testing_utils +from tensorflow.python.platform import test +from tensorflow_addons.layers.python.maxout import Maxout + + +class MaxOutTest(test.TestCase): + def test_simple(self): + testing_utils.layer_test( + Maxout, kwargs={'num_units': 3}, input_shape=(5, 4, 2, 18)) + + def test_nchw(self): + testing_utils.layer_test( + Maxout, + kwargs={ + 'num_units': 4, + 'axis': 1 + }, + input_shape=(2, 20, 3, 6)) + + testing_utils.layer_test( + Maxout, + kwargs={ + 'num_units': 4, + 'axis': -3 + }, + input_shape=(2, 20, 3, 6)) + + def test_unknown(self): + inputs = np.random.random((5, 4, 2, 18)).astype('float32') + testing_utils.layer_test( + Maxout, + kwargs={'num_units': 3}, + input_shape=(5, 4, 2, None), + input_data=inputs) + + testing_utils.layer_test( + Maxout, + kwargs={'num_units': 3}, + input_shape=(None, None, None, None), + input_data=inputs) + + def test_invalid_shape(self): + with self.assertRaisesRegexp(ValueError, r'number of features'): + testing_utils.layer_test( + Maxout, kwargs={'num_units': 3}, input_shape=(5, 4, 2, 7)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow_addons/layers/python/poincare.py b/tensorflow_addons/layers/python/poincare.py new file mode 100644 index 0000000000..037e7b0c82 --- /dev/null +++ b/tensorflow_addons/layers/python/poincare.py @@ -0,0 +1,77 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementing PoincareNormalize layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.keras.utils import generic_utils +from tensorflow.python.keras.engine.base_layer import Layer +from tensorflow.python.ops import math_ops + + +class PoincareNormalize(Layer): + """Project into the Poincare ball with norm <= 1.0 - epsilon. + + https://en.wikipedia.org/wiki/Poincare_ball_model + + Used in + Poincare Embeddings for Learning Hierarchical Representations + Maximilian Nickel, Douwe Kiela + https://arxiv.org/pdf/1705.08039.pdf + + For a 1-D tensor with `axis = 0`, computes + + (x * (1 - epsilon)) / ||x|| if ||x|| > 1 - epsilon + output = + x otherwise + + For `x` with more dimensions, independently normalizes each 1-D slice along + dimension `axis`. + + Arguments: + axis: Axis along which to normalize. A scalar or a vector of + integers. + epsilon: A small deviation from the edge of the unit sphere for numerical + stability. 
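A small NumPy sketch of the projection (illustrative only; uses `axis=1`, the layer's default): rows with norm above `1 - epsilon` are rescaled onto the ball boundary, while shorter rows pass through unchanged.

```python
import numpy as np

epsilon = 1e-5
x = np.array([[3.0, 4.0],    # norm 5.0 -> projected onto the boundary
              [0.3, 0.4]])   # norm 0.5 -> already inside, left unchanged
norm = np.linalg.norm(x, axis=1, keepdims=True)
out = np.where(norm > 1.0 - epsilon, (1.0 - epsilon) * x / norm, x)
print(np.linalg.norm(out, axis=1))  # approximately [0.99999, 0.5]
```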
+ """ + + def __init__(self, axis=1, epsilon=1e-5, **kwargs): + super(PoincareNormalize, self).__init__(**kwargs) + self.axis = axis + self.epsilon = epsilon + + def call(self, inputs): + x = ops.convert_to_tensor(inputs) + square_sum = math_ops.reduce_sum( + math_ops.square(x), self.axis, keepdims=True) + x_inv_norm = math_ops.rsqrt(square_sum) + x_inv_norm = math_ops.minimum((1. - self.epsilon) * x_inv_norm, 1.) + outputs = math_ops.multiply(x, x_inv_norm) + return outputs + + def compute_output_shape(self, input_shape): + return input_shape + + def get_config(self): + config = {'axis': self.axis, 'epsilon': self.epsilon} + base_config = super(PoincareNormalize, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +generic_utils._GLOBAL_CUSTOM_OBJECTS['PoincareNormalize'] = PoincareNormalize + diff --git a/tensorflow_addons/layers/python/poincare_test.py b/tensorflow_addons/layers/python/poincare_test.py new file mode 100644 index 0000000000..81be19c249 --- /dev/null +++ b/tensorflow_addons/layers/python/poincare_test.py @@ -0,0 +1,87 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for PoincareNormalize layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.keras import testing_utils +from tensorflow.python.platform import test +from tensorflow_addons.layers.python.poincare import PoincareNormalize + + +class PoincareNormalizeTest(test.TestCase): + def _PoincareNormalize(self, x, dim, epsilon=1e-5): + if isinstance(dim, list): + norm = np.linalg.norm(x, axis=tuple(dim)) + for d in dim: + norm = np.expand_dims(norm, d) + norm_x = ((1. - epsilon) * x) / norm + else: + norm = np.expand_dims( + np.apply_along_axis(np.linalg.norm, dim, x), dim) + norm_x = ((1. - epsilon) * x) / norm + return np.where(norm > 1.0 - epsilon, norm_x, x) + + def testPoincareNormalize(self): + x_shape = [20, 7, 3] + epsilon = 1e-5 + tol = 1e-6 + np.random.seed(1) + inputs = np.random.random_sample(x_shape).astype(np.float32) + + for dim in range(len(x_shape)): + outputs_expected = self._PoincareNormalize(inputs, dim, epsilon) + + outputs = testing_utils.layer_test( + PoincareNormalize, + kwargs={ + 'axis': dim, + 'epsilon': epsilon + }, + input_data=inputs, + expected_output=outputs_expected) + for y in outputs_expected, outputs: + norm = np.linalg.norm(y, axis=dim) + self.assertLessEqual(norm.max(), 1. 
- epsilon + tol) + + def testPoincareNormalizeDimArray(self): + x_shape = [20, 7, 3] + epsilon = 1e-5 + tol = 1e-6 + np.random.seed(1) + inputs = np.random.random_sample(x_shape).astype(np.float32) + dim = [1, 2] + + outputs_expected = self._PoincareNormalize(inputs, dim, epsilon) + + outputs = testing_utils.layer_test( + PoincareNormalize, + kwargs={ + 'axis': dim, + 'epsilon': epsilon + }, + input_data=inputs, + expected_output=outputs_expected) + for y in outputs_expected, outputs: + norm = np.linalg.norm(y, axis=tuple(dim)) + self.assertLessEqual(norm.max(), 1. - epsilon + tol) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow_addons/layers/python/wrappers.py b/tensorflow_addons/layers/python/wrappers.py index e9e5df37c9..95bfd7eb68 100644 --- a/tensorflow_addons/layers/python/wrappers.py +++ b/tensorflow_addons/layers/python/wrappers.py @@ -26,22 +26,22 @@ from tensorflow.python.ops import variables as tf_variables -class WeightNorm(Wrapper): +class WeightNormalization(Wrapper): """ This wrapper reparameterizes a layer by decoupling the weight's magnitude and direction. This speeds up convergence by improving the conditioning of the optimization problem. Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks: https://arxiv.org/abs/1602.07868 Tim Salimans, Diederik P. Kingma (2016) - WeightNorm wrapper works for keras and tf layers. + WeightNormalization wrapper works for keras and tf layers. ```python - net = WeightNorm(tf.keras.layers.Conv2D(2, 2, activation='relu'), + net = WeightNormalization(tf.keras.layers.Conv2D(2, 2, activation='relu'), input_shape=(32, 32, 3), data_init=True)(x) - net = WeightNorm(tf.keras.layers.Conv2D(16, 5, activation='relu'), + net = WeightNormalization(tf.keras.layers.Conv2D(16, 5, activation='relu'), data_init=True)(net) - net = WeightNorm(tf.keras.layers.Dense(120, activation='relu'), + net = WeightNormalization(tf.keras.layers.Dense(120, activation='relu'), data_init=True)(net) - net = WeightNorm(tf.keras.layers.Dense(n_classes), + net = WeightNormalization(tf.keras.layers.Dense(n_classes), data_init=True)(net) ``` Arguments: @@ -55,7 +55,7 @@ class WeightNorm(Wrapper): def __init__(self, layer, data_init=False, **kwargs): if not isinstance(layer, Layer): raise ValueError( - 'Please initialize `WeightNorm` layer with a ' + 'Please initialize `WeightNormalization` layer with a ' '`Layer` instance. 
You passed: {input}'.format(input=layer)) if not context.executing_eagerly() and data_init: @@ -67,7 +67,7 @@ def __init__(self, layer, data_init=False, **kwargs): if data_init: self.initialized = False - super(WeightNorm, self).__init__(layer, **kwargs) + super(WeightNormalization, self).__init__(layer, **kwargs) self._track_checkpointable(layer, name='layer') def _compute_weights(self): @@ -114,7 +114,7 @@ def build(self, input_shape): if not hasattr(self.layer, 'kernel'): raise ValueError( - '`WeightNorm` must wrap a layer that' + '`WeightNormalization` must wrap a layer that' ' contains a `kernel` for weights' ) @@ -137,7 +137,7 @@ def build(self, input_shape): self.layer.built = True - super(WeightNorm, self).build() + super(WeightNormalization, self).build() self.built = True def call(self, inputs): diff --git a/tensorflow_addons/layers/python/wrappers_test.py b/tensorflow_addons/layers/python/wrappers_test.py index da418fcb3e..faa5e5b90f 100644 --- a/tensorflow_addons/layers/python/wrappers_test.py +++ b/tensorflow_addons/layers/python/wrappers_test.py @@ -29,12 +29,12 @@ from tensorflow.python import keras -class WeightNormTest(test.TestCase): +class WeightNormalizationTest(test.TestCase): @tf_test_util.run_all_in_graph_and_eager_modes def test_weightnorm_dense_train(self): model = keras.models.Sequential() - model.add(wrappers.WeightNorm( + model.add(wrappers.WeightNormalization( keras.layers.Dense(2), input_shape=(3, 4))) model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') @@ -48,7 +48,7 @@ def test_weightnorm_dense_train(self): @tf_test_util.run_all_in_graph_and_eager_modes def test_weightnorm_conv2d(self): model = keras.models.Sequential() - model.add(wrappers.WeightNorm( + model.add(wrappers.WeightNormalization( keras.layers.Conv2D(5, (2, 2), padding='same'), input_shape=(4, 4, 3))) @@ -63,7 +63,7 @@ def test_weightnorm_conv2d(self): @tf_test_util.run_all_in_graph_and_eager_modes def test_weight_norm_tflayers(self): images = random_ops.random_uniform((2, 4, 4, 3)) - wn_wrapper = wrappers.WeightNorm(layers.Conv2D(32, [2, 2]), + wn_wrapper = wrappers.WeightNormalization(layers.Conv2D(32, [2, 2]), input_shape=(4, 4, 3)) wn_wrapper.apply(images) self.assertTrue(hasattr(wn_wrapper.layer, 'g')) @@ -72,12 +72,12 @@ def test_weight_norm_tflayers(self): def test_weight_norm_nonlayer(self): images = random_ops.random_uniform((2, 4, 43)) with self.assertRaises(ValueError): - wrappers.WeightNorm(images) + wrappers.WeightNormalization(images) @tf_test_util.run_all_in_graph_and_eager_modes def test_weight_norm_nokernel(self): with self.assertRaises(ValueError): - wrappers.WeightNorm(layers.MaxPooling2D(2, 2)).build((2, 2)) + wrappers.WeightNormalization(layers.MaxPooling2D(2, 2)).build((2, 2)) if __name__ == "__main__": diff --git a/tensorflow_addons/optimizers/BUILD b/tensorflow_addons/optimizers/BUILD index 3ad427fd87..dff0f34c88 100644 --- a/tensorflow_addons/optimizers/BUILD +++ b/tensorflow_addons/optimizers/BUILD @@ -1,3 +1,27 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) + +py_library( + name = "optimizers_py", + srcs = [ + "__init__.py", + "python/__init__.py", + "python/lazy_adam_optimizer.py", + ], + srcs_version = "PY2AND3", +) + + +py_test( + name = "lazy_adam_optimizer_test", + size = "small", + srcs = [ + "python/lazy_adam_optimizer_test.py" + ], + main = "python/lazy_adam_optimizer_test.py", + deps = [ + ":optimizers_py", + ], + srcs_version = "PY2AND3", +) diff --git 
a/tensorflow_addons/optimizers/python/lazy_adam_optimizer.py b/tensorflow_addons/optimizers/python/lazy_adam_optimizer.py new file mode 100644 index 0000000000..91e48085f3 --- /dev/null +++ b/tensorflow_addons/optimizers/python/lazy_adam_optimizer.py @@ -0,0 +1,81 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Variant of the Adam optimizer that handles sparse updates more efficiently. + +Compared with the original Adam optimizer, the one in this file can +provide a large improvement in model training throughput for some +applications. However, it provides slightly different semantics than the +original Adam algorithm, and may lead to different empirical results. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.keras.optimizer_v2 import adam +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops + + +class LazyAdamOptimizer(adam.Adam): + """Variant of the Adam optimizer that handles sparse updates more efficiently. + + The original Adam algorithm maintains two moving-average accumulators for + each trainable variable; the accumulators are updated at every step. + This class provides lazier handling of gradient updates for sparse variables. + It only updates moving-average accumulators for sparse variable indices that + appear in the current batch, rather than updating the accumulators for all + indices. Compared with the original Adam optimizer, it can provide large + improvements in model training throughput for some applications. However, it + provides slightly different semantics than the original Adam algorithm, and + may lead to different empirical results. + + Note, amsgrad is currently not supported and the argument can only be False. 
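A minimal usage sketch (assumptions: the module path added by this patch, plus a toy embedding model whose shapes and hyperparameters are purely illustrative). Sparse gradients arise because each batch looks up only a few embedding rows, so the optimizer touches only those rows of its `m`/`v` slots:

```python
import numpy as np
import tensorflow as tf
from tensorflow_addons.optimizers.python.lazy_adam_optimizer import LazyAdamOptimizer

# Toy embedding model: each batch looks up only a handful of the 10000 rows,
# so the embedding gradient is an IndexedSlices and LazyAdam updates the
# moving-average accumulators only for those rows.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=16, input_length=8),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1),
])
model.compile(optimizer=LazyAdamOptimizer(learning_rate=0.001), loss='mse')
model.fit(np.random.randint(0, 10000, size=(32, 8)),
          np.random.rand(32, 1).astype("float32"),
          epochs=1, verbose=0)
```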
+ """ + + def _resource_apply_sparse(self, grad, var, indices): + var_dtype = var.dtype.base_dtype + lr_t = self._decayed_lr(var_dtype) + beta_1_t = self._get_hyper('beta_1', var_dtype) + beta_2_t = self._get_hyper('beta_2', var_dtype) + local_step = math_ops.cast(self.iterations + 1, var_dtype) + beta_1_power = math_ops.pow(beta_1_t, local_step) + beta_2_power = math_ops.pow(beta_2_t, local_step) + epsilon_t = self._get_hyper('epsilon', var_dtype) + lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + + # \\(m := beta1 * m + (1 - beta1) * g_t\\) + m = self.get_slot(var, "m") + m_t_slice = beta_1_t * array_ops.gather( + m, indices) + (1 - beta_1_t) * grad + m_update_op = resource_variable_ops.resource_scatter_update( + m.handle, indices, m_t_slice) + + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) + v = self.get_slot(var, "v") + v_t_slice = (beta_2_t * array_ops.gather(v, indices) + + (1 - beta_2_t) * math_ops.square(grad)) + v_update_op = resource_variable_ops.resource_scatter_update( + v.handle, indices, v_t_slice) + + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) + var_slice = lr * m_t_slice / (math_ops.sqrt(v_t_slice) + epsilon_t) + var_update_op = resource_variable_ops.resource_scatter_sub( + var.handle, indices, var_slice) + + return control_flow_ops.group( + *[var_update_op, m_update_op, v_update_op]) diff --git a/tensorflow_addons/optimizers/python/lazy_adam_optimizer_test.py b/tensorflow_addons/optimizers/python/lazy_adam_optimizer_test.py new file mode 100644 index 0000000000..6b7e034045 --- /dev/null +++ b/tensorflow_addons/optimizers/python/lazy_adam_optimizer_test.py @@ -0,0 +1,348 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for LazyAdamOptimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow_addons.optimizers.python import lazy_adam_optimizer + + +def adam_update_numpy(param, + g_t, + t, + m, + v, + lr=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-7): + lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1)) + + m_t = beta1 * m + (1 - beta1) * g_t + v_t = beta2 * v + (1 - beta2) * g_t * g_t + + param_t = param - lr_t * m_t / (np.sqrt(v_t) + epsilon) + return param_t, m_t, v_t + + +def get_beta_accumulators(opt, dtype): + local_step = math_ops.cast(opt.iterations + 1, dtype) + beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype) + beta_1_power = math_ops.pow(beta_1_t, local_step) + beta_2_t = math_ops.cast(opt._get_hyper("beta_2"), dtype) + beta_2_power = math_ops.pow(beta_2_t, local_step) + return (beta_1_power, beta_2_power) + + +class LazyAdamOptimizerTest(test.TestCase): + + # TODO: remove v1 tests (keep pace with adam_test.py in keras). + @test_util.run_deprecated_v1 + def testSparse(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.cached_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.0, 0.1], + dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.0, 0.01], + dtype=dtype.as_numpy_dtype) + + var0 = resource_variable_ops.ResourceVariable(var0_np) + var1 = resource_variable_ops.ResourceVariable(var1_np) + grads0_np_indices = np.array([0, 2], dtype=np.int32) + grads0 = ops.IndexedSlices( + constant_op.constant(grads0_np[grads0_np_indices]), + constant_op.constant(grads0_np_indices), + constant_op.constant([3])) + grads1_np_indices = np.array([0, 2], dtype=np.int32) + grads1 = ops.IndexedSlices( + constant_op.constant(grads1_np[grads1_np_indices]), + constant_op.constant(grads1_np_indices), + constant_op.constant([3])) + opt = lazy_adam_optimizer.LazyAdamOptimizer() + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.evaluate(variables.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1)) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + # Run 3 steps of Adam + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9**(t + 1), self.evaluate(beta_1_power)) + self.assertAllCloseAccordingToType( + 0.999**(t + 1), self.evaluate(beta_2_power)) + self.evaluate(update) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, + self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, + self.evaluate(var1)) + + @test_util.run_deprecated_v1 + def testSparseDevicePlacement(self): + for index_dtype in [dtypes.int32, dtypes.int64]: + with self.cached_session(force_gpu=test.is_gpu_available()): + # If a GPU is available, tests that all optimizer ops can be placed on + # it (i.e. they have GPU kernels). 
+ var = variables.Variable([[1.0], [2.0]]) + indices = constant_op.constant([0, 1], dtype=index_dtype) + g_sum = lambda: math_ops.reduce_sum(array_ops.gather(var, indices)) # pylint: disable=cell-var-from-loop + optimizer = lazy_adam_optimizer.LazyAdamOptimizer(3.0) + minimize_op = optimizer.minimize(g_sum, var_list=[var]) + self.evaluate(variables.global_variables_initializer()) + self.evaluate(minimize_op) + + @test_util.run_deprecated_v1 + def testSparseRepeatedIndices(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.cached_session(): + repeated_index_update_var = variables.Variable([[1.0], [2.0]], + dtype=dtype) + aggregated_update_var = variables.Variable([[1.0], [2.0]], + dtype=dtype) + grad_repeated_index = ops.IndexedSlices( + constant_op.constant([0.1, 0.1], + shape=[2, 1], + dtype=dtype), + constant_op.constant([1, 1]), + constant_op.constant([2, 1])) + grad_aggregated = ops.IndexedSlices( + constant_op.constant([0.2], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + repeated_update_opt = lazy_adam_optimizer.LazyAdamOptimizer() + repeated_update = repeated_update_opt.apply_gradients( + [(grad_repeated_index, repeated_index_update_var)]) + aggregated_update_opt = lazy_adam_optimizer.LazyAdamOptimizer() + aggregated_update = aggregated_update_opt.apply_gradients( + [(grad_aggregated, aggregated_update_var)]) + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + + def doTestBasic(self, use_callable_params=False): + for i, dtype in enumerate( + [dtypes.half, dtypes.float32, dtypes.float64]): + with self.session(graph=ops.Graph()): + # Initialize variables for numpy implementation. 
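
Further down, this helper also covers callable hyperparameters: with
use_callable_params set, the learning rate is handed to the optimizer as a
zero-argument callable instead of a float, essentially

    opt = lazy_adam_optimizer.LazyAdamOptimizer(learning_rate=lambda: 0.001)

and both forms are expected to match the same NumPy reference.
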
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = resource_variable_ops.ResourceVariable( + var0_np, name="var0_%d" % i) + var1 = resource_variable_ops.ResourceVariable( + var1_np, name="var1_%d" % i) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + + learning_rate = lambda: 0.001 + beta1 = lambda: 0.9 + beta2 = lambda: 0.999 + epsilon = lambda: 1e-8 + if not use_callable_params: + learning_rate = learning_rate() + beta1 = beta1() + beta2 = beta2() + epsilon = epsilon() + + opt = lazy_adam_optimizer.LazyAdamOptimizer( + learning_rate=learning_rate) + if not context.executing_eagerly(): + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.evaluate(variables.global_variables_initializer()) + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of Adam + for t in range(3): + beta_1_power, beta_2_power = get_beta_accumulators( + opt, dtype) + self.assertAllCloseAccordingToType( + 0.9**(t + 1), self.evaluate(beta_1_power)) + self.assertAllCloseAccordingToType( + 0.999**(t + 1), self.evaluate(beta_2_power)) + if not context.executing_eagerly(): + self.evaluate(update) + else: + opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, + self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, + self.evaluate(var1)) + self.assertEqual("var0_%d/m:0" % (i, ), + opt.get_slot(var0, "m").name) + + @test_util.run_in_graph_and_eager_modes(reset_test=True) + def testResourceBasic(self): + self.doTestBasic() + + def testBasicCallableParams(self): + with context.eager_mode(): + self.doTestBasic(use_callable_params=True) + + @test_util.run_deprecated_v1 + def testTensorLearningRate(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.cached_session(): + # Initialize variables for numpy implementation. 
+ m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + opt = lazy_adam_optimizer.LazyAdamOptimizer( + constant_op.constant(0.001)) + update = opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.evaluate(variables.global_variables_initializer()) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + # Run 3 steps of Adam + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9**(t + 1), self.evaluate(beta_1_power)) + self.assertAllCloseAccordingToType( + 0.999**(t + 1), self.evaluate(beta_2_power)) + self.evaluate(update) + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, + self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, + self.evaluate(var1)) + + @test_util.run_deprecated_v1 + def testSharing(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.cached_session(): + # Initialize variables for numpy implementation. + m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 + var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + + var0 = variables.Variable(var0_np) + var1 = variables.Variable(var1_np) + grads0 = constant_op.constant(grads0_np) + grads1 = constant_op.constant(grads1_np) + opt = lazy_adam_optimizer.LazyAdamOptimizer() + update1 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + update2 = opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.evaluate(variables.global_variables_initializer()) + + beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype) + + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) + + # Run 3 steps of intertwined Adam1 and Adam2. + for t in range(3): + self.assertAllCloseAccordingToType( + 0.9**(t + 1), self.evaluate(beta_1_power)) + self.assertAllCloseAccordingToType( + 0.999**(t + 1), self.evaluate(beta_2_power)) + if t % 2 == 0: + update1.run() + else: + update2.run() + + var0_np, m0, v0 = adam_update_numpy( + var0_np, grads0_np, t, m0, v0) + var1_np, m1, v1 = adam_update_numpy( + var1_np, grads1_np, t, m1, v1) + + # Validate updated params + self.assertAllCloseAccordingToType(var0_np, + self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, + self.evaluate(var1)) + + def testSlotsUniqueEager(self): + with context.eager_mode(): + v1 = resource_variable_ops.ResourceVariable(1.) + v2 = resource_variable_ops.ResourceVariable(1.) + opt = lazy_adam_optimizer.LazyAdamOptimizer(1.) + opt.minimize(lambda: v1 + v2, var_list=[v1, v2]) + # There should be iteration, and two unique slot variables for v1 and v2. 
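
The count asserted just below follows from the optimizer's bookkeeping: one
iterations counter plus an m and a v slot for each of the two variables,

    expected = 1 + 2 * 2   # iterations + (m, v) per variable = 5

so five distinct optimizer variables in total.
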
+ self.assertEqual(5, len(set(opt.variables()))) + self.assertEqual( + self.evaluate(opt.variables()[0]), + self.evaluate(opt.iterations)) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow_addons/text/cc/kernels/skip_gram_kernels.cc b/tensorflow_addons/text/cc/kernels/skip_gram_kernels.cc index c75b98a924..7480177985 100644 --- a/tensorflow_addons/text/cc/kernels/skip_gram_kernels.cc +++ b/tensorflow_addons/text/cc/kernels/skip_gram_kernels.cc @@ -47,11 +47,17 @@ class SkipGramGenerateCandidatesOp : public OpKernel { OP_REQUIRES_OK(context, context->input("max_skips", &max_skips_tensor)); const int max_skips = *(max_skips_tensor->scalar().data()); + const Tensor& input_check = context->input(0); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_check.shape()), + errors::InvalidArgument("input_tensor must be of rank 1")); + OP_REQUIRES( context, min_skips >= 0 && max_skips >= 0, errors::InvalidArgument("Both min_skips and max_skips must be >= 0.")); - OP_REQUIRES(context, min_skips <= max_skips, - errors::InvalidArgument("min_skips must be <= max_skips.")); + OP_REQUIRES( + context, min_skips <= max_skips, + errors::InvalidArgument("min_skips must be <= max_skips.")); const Tensor* start_tensor; OP_REQUIRES_OK(context, context->input("start", &start_tensor)); diff --git a/tensorflow_addons/text/python/skip_gram_ops_test.py b/tensorflow_addons/text/python/skip_gram_ops_test.py index 8f3a578c55..01bf5da7de 100644 --- a/tensorflow_addons/text/python/skip_gram_ops_test.py +++ b/tensorflow_addons/text/python/skip_gram_ops_test.py @@ -265,15 +265,11 @@ def test_skip_gram_sample_errors(self): text.skip_gram_sample(input_tensor, min_skips=min_skips, max_skips=max_skips) - ######################################### - - # FIXME: Why is this not failing? - # with self.assertRaises(ValueError): - # invalid_tensor = constant_op.constant([[b"the"], [b"quick"], - # [b"brown"]]) - # text.skip_gram_sample(invalid_tensor) - - ######################################### + # Eager tensor must be rank 1 + with self.assertRaises(errors.InvalidArgumentError): + invalid_tensor = constant_op.constant([[b"the"], [b"quick"], + [b"brown"]]) + text.skip_gram_sample(invalid_tensor) # vocab_freq_table must be provided if vocab_min_count, # vocab_subsampling, or corpus_size is specified. @@ -479,7 +475,7 @@ def _text_vocab_subsample_vocab_helper(self, vocab_freq_file, vocab_freq_file=vocab_freq_file, vocab_token_index=0, vocab_freq_index=1, - vocab_freq_dtype=vocab_freq_dtype, + vocab_freq_dtype=dtypes.float64, vocab_min_count=vocab_min_count, vocab_subsampling=0.05, corpus_size=corpus_size, From d5beb61aeb9e7a06385a556dd0c33f9900856fb2 Mon Sep 17 00:00:00 2001 From: Moritz Date: Sun, 10 Feb 2019 23:27:54 +0100 Subject: [PATCH 06/26] added function for easy testing. 
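
The helper added in this commit wraps the layer under test in a small
Sequential model and fits it on random data, so the individual checks
reduce to roughly the following (an illustrative condensation of the diff
below, not extra test code):

    # groups=1 behaves like LayerNormalization, groups=-1 like
    # InstanceNormalization, and anything in between is a true group norm.
    model = create_and_fit_Sequential_model(
        GroupNormalization(input_shape=(64,), groups=16))
    assert hasattr(model.layers[0], 'gamma')
    assert hasattr(model.layers[0], 'beta')
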
--- .../layers/python/normalizations_test.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index ed5f831955..2b29fc97f7 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -23,6 +23,19 @@ from tensorflow.python.framework import test_util as tf_test_util +def create_and_fit_Sequential_model(layer): + model=keras.models.Sequential() + model.add(layer) + model.add(keras.layers.Dense(32)) + + model.compile(optimizer=RMSPropOptimizer(0.01),loss="mse") + layer_shape=(10,)+layer.input_shape[1:] + print(type(layer_shape)) + input_batch=np.random.random_sample(size=layer_shape) + model.fit(input_batch, + epochs=1, + batch_size=5) + return model class normalization_test(test.TestCase): @tf_test_util.run_all_in_graph_and_eager_modes @@ -32,30 +45,23 @@ def test_weights(self): self.assertEqual(len(layer.trainable_weights), 0) self.assertEqual(len(layer.weights), 0) - layer = keras.layers.LayerNormalization() + layer = LayerNormalization() layer.build((None, 3, 4)) self.assertEqual(len(layer.trainable_weights), 2) self.assertEqual(len(layer.weights), 2) - - + layer = InstanceNormalization() + layer.build((None, 3, 4)) + self.assertEqual(len(layer.trainable_weights),2) + self.assertEqual(len(layer.weights),2) @tf_test_util.run_all_in_graph_and_eager_modes def test_groupnorm_flat(self): # Testing for 1 == LayerNorm, 16 == GroupNorm, -1 == InstanceNorm groups=[-1,16,1] for i in groups: - model=keras.models.Sequential() - model.add(GroupNormalization( - input_shape=(32,),groups=i)) - model.add(keras.layers.Dense(32)) - model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') - model.fit( - np.random.random((10,32)), - np.random.random((10,32)), - epochs=1, - batch_size=10) + model=create_and_fit_Sequential_model(GroupNormalization(input_shape=(64,),groups=i)) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) @@ -76,6 +82,7 @@ def test_groupnorm_conv(self): model.fit(np.random.random((10,20, 20, 3))) self.assertTrue(hasattr(model.layers[0], 'gamma')) + """ class LayerNormalizationTest(keras_parameterized.TestCase): From 7a361ce5ccd50ef543827a143883d2827a7113b7 Mon Sep 17 00:00:00 2001 From: Moritz Date: Sun, 10 Feb 2019 23:30:17 +0100 Subject: [PATCH 07/26] clean up --- .../layers/python/normalizations_test.py | 176 ------------------ 1 file changed, 176 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index 2b29fc97f7..1b42340fd4 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -83,181 +83,5 @@ def test_groupnorm_conv(self): self.assertTrue(hasattr(model.layers[0], 'gamma')) -""" -class LayerNormalizationTest(keras_parameterized.TestCase): - - - @tf_test_util.run_in_graph_and_eager_modes - def test_layernorm_regularization(self): - layer = keras.layers.LayerNormalization( - gamma_regularizer='l1', beta_regularizer='l1') - layer.build((None, 3, 4)) - self.assertEqual(len(layer.losses), 2) - max_norm = keras.constraints.max_norm - layer = keras.layers.LayerNormalization( - gamma_constraint=max_norm, beta_constraint=max_norm) - layer.build((None, 3, 4)) - self.assertEqual(layer.gamma.constraint, max_norm) - self.assertEqual(layer.beta.constraint, max_norm) - 
- @keras_parameterized.run_all_keras_modes - def test_layernorm_convnet(self): - if test.is_gpu_available(cuda_only=True): - with self.session(use_gpu=True): - model = keras.models.Sequential() - norm = keras.layers.LayerNormalization(input_shape=(3, 4, 4)) - model.add(norm) - model.compile(loss='mse', - optimizer=gradient_descent.GradientDescentOptimizer(0.01), - run_eagerly=testing_utils.should_run_eagerly()) - - # centered on 5.0, variance 10.0 - x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4)) - model.fit(x, x, epochs=4, verbose=0) - out = model.predict(x) - out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1)) - out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1)) - - np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1) - np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1) - - @keras_parameterized.run_all_keras_modes - def test_layernorm_convnet_channel_last(self): - model = keras.models.Sequential() - norm = keras.layers.LayerNormalization(input_shape=(4, 4, 3)) - model.add(norm) - model.compile(loss='mse', - optimizer=gradient_descent.GradientDescentOptimizer(0.01), - run_eagerly=testing_utils.should_run_eagerly()) - - # centered on 5.0, variance 10.0 - x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3)) - model.fit(x, x, epochs=4, verbose=0) - out = model.predict(x) - out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3)) - out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3)) - - np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1) - np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1) - - @keras_parameterized.run_all_keras_modes - def test_layernorm_correctness(self): - _run_layernorm_correctness_test( - normalization.LayerNormalization, dtype='float32') - - @keras_parameterized.run_all_keras_modes - def test_layernorm_mixed_precision(self): - _run_layernorm_correctness_test( - normalization.LayerNormalization, dtype='float16') - - def doOutputTest(self, - input_shape, - tol=1e-5, - norm_axis=None, - params_axis=-1, - dtype=None): - ndim = len(input_shape) - if norm_axis is None: - moments_axis = range(1, ndim) - elif isinstance(norm_axis, int): - if norm_axis < 0: - moments_axis = [norm_axis + ndim] - else: - moments_axis = [norm_axis] - else: - moments_axis = [] - for dim in norm_axis: - if dim < 0: - dim = dim + ndim - moments_axis.append(dim) - - moments_axis = tuple(moments_axis) - expected_shape = [] - for i in range(ndim): - if i not in moments_axis: - expected_shape.append(input_shape[i]) - - expected_mean = np.zeros(expected_shape) - expected_var = np.ones(expected_shape) - for mu in [0.0, 1e2]: - for sigma in [1.0, 0.1]: - inputs = np.random.randn(*input_shape) * sigma + mu - inputs_t = constant_op.constant(inputs, shape=input_shape) - layer = normalization.LayerNormalization( - norm_axis=norm_axis, params_axis=params_axis, dtype=dtype) - outputs = layer(inputs_t) - beta = layer.beta - gamma = layer.gamma - for weight in layer.weights: - self.evaluate(weight.initializer) - outputs = self.evaluate(outputs) - beta = self.evaluate(beta) - gamma = self.evaluate(gamma) - - # The mean and variance of the output should be close to 0 and 1 - # respectively. 
- - # Make sure that there are no NaNs - self.assertFalse(np.isnan(outputs).any()) - mean = np.mean(outputs, axis=moments_axis) - var = np.var(outputs, axis=moments_axis) - # Layer-norm implemented in numpy - eps = 1e-12 - expected_out = ( - (gamma * (inputs - np.mean( - inputs, axis=moments_axis, keepdims=True)) / - np.sqrt(eps + np.var( - inputs, axis=moments_axis, keepdims=True))) + beta) - self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol) - self.assertAllClose(expected_var, var, atol=tol) - # The full computation gets a bigger tolerance - self.assertAllClose(expected_out, outputs, atol=5 * tol) - - @tf_test_util.run_in_graph_and_eager_modes - def testOutput2DInput(self): - self.doOutputTest((10, 300)) - self.doOutputTest((10, 300), norm_axis=[0]) - self.doOutputTest((10, 300), params_axis=[0, 1]) - - @tf_test_util.run_in_graph_and_eager_modes - def testOutput2DInputDegenerateNormAxis(self): - with self.assertRaisesRegexp(ValueError, r'Invalid axis: 2'): - self.doOutputTest((10, 300), norm_axis=2) - - @tf_test_util.run_in_graph_and_eager_modes - def testOutput4DInput(self): - self.doOutputTest((100, 10, 10, 3)) - - @tf_test_util.run_in_graph_and_eager_modes - def testOutput4DInputNormOnInnermostAxis(self): - # Equivalent tests - shape = (100, 10, 10, 3) - self.doOutputTest( - shape, norm_axis=list(range(3, len(shape))), tol=1e-4, dtype='float64') - self.doOutputTest(shape, norm_axis=-1, tol=1e-4, dtype='float64') - - @tf_test_util.run_in_graph_and_eager_modes - def testOutputSmallInput(self): - self.doOutputTest((10, 10, 10, 30)) - - @tf_test_util.run_in_graph_and_eager_modes - def testOutputSmallInputNormOnInnermostAxis(self): - self.doOutputTest((10, 10, 10, 30), norm_axis=3) - - @tf_test_util.run_in_graph_and_eager_modes - def testOutputSmallInputNormOnMixedAxes(self): - self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3]) - self.doOutputTest((10, 10, 10, 30), params_axis=[-2, -1]) - self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3], - params_axis=[-3, -2, -1]) - - @tf_test_util.run_in_graph_and_eager_modes - def testOutputBigInput(self): - self.doOutputTest((1, 100, 100, 1)) - self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2]) - self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2], - params_axis=[-2, -1]) - -""" if __name__ == "__main__": test.main() From 2c174fc7f037871a7022a612f8e0e769008fe2e6 Mon Sep 17 00:00:00 2001 From: smokrow Date: Wed, 13 Feb 2019 22:09:39 +0100 Subject: [PATCH 08/26] found bug in BUILD File --- tensorflow_addons/layers/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 1b1ae10b9d..cad1dc0905 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -4,7 +4,7 @@ package(default_visibility = ["//visibility:public"]) py_library( name = "layers_py", - srcs = [ + srcs = ([ "__init__.py", "python/__init__.py", "python/maxout.py", From 7c32461cc16abbc6189d1db757c75f9e6dcf5591 Mon Sep 17 00:00:00 2001 From: Smokrow Date: Sun, 17 Feb 2019 23:55:23 +0100 Subject: [PATCH 09/26] fixed signature bug and added tests --- .../layers/python/normalizations.py | 1 + .../layers/python/normalizations_test.py | 79 +++++++++++++------ 2 files changed, 57 insertions(+), 23 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 52251557e8..01206b5dcf 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ 
-137,6 +137,7 @@ def build(self, input_shape): else: self.beta = None self.built = True + super(GroupNormalization, self).build(input_shape) def call(self, inputs): input_shape = K.int_shape(inputs) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index 1b42340fd4..2e1035b34f 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -15,31 +15,33 @@ from tensorflow_addons.layers.python.normalizations import GroupNormalization,LayerNormalization,InstanceNormalization import numpy as np +import scipy as scipy import tensorflow as tf -from tensorflow.python import keras as keras +from tensorflow import keras as keras from tensorflow.python.training.rmsprop import RMSPropOptimizer - from tensorflow.python.platform import test from tensorflow.python.framework import test_util as tf_test_util -def create_and_fit_Sequential_model(layer): +def create_and_fit_Sequential_model(layer,shape): + #Helperfunction for quick evaluation model=keras.models.Sequential() model.add(layer) model.add(keras.layers.Dense(32)) + model.add(keras.layers.Dense(1)) - model.compile(optimizer=RMSPropOptimizer(0.01),loss="mse") - layer_shape=(10,)+layer.input_shape[1:] - print(type(layer_shape)) - input_batch=np.random.random_sample(size=layer_shape) - model.fit(input_batch, - epochs=1, - batch_size=5) + model.compile(optimizer=RMSPropOptimizer(0.01),loss="categorical_crossentropy") + layer_shape=(10,)+shape + input_batch=np.random.rand(*layer_shape) + output_batch=np.random.rand(*(10,1)) + model.fit(x=input_batch,y=output_batch, epochs=1, batch_size=1) return model + + class normalization_test(test.TestCase): - @tf_test_util.run_all_in_graph_and_eager_modes def test_weights(self): + #Check if weights get initialized layer = GroupNormalization(groups=1,scale=False, center=False) layer.build((None, 3, 4)) self.assertEqual(len(layer.trainable_weights), 0) @@ -55,31 +57,62 @@ def test_weights(self): self.assertEqual(len(layer.trainable_weights),2) self.assertEqual(len(layer.weights),2) - @tf_test_util.run_all_in_graph_and_eager_modes + def test_groupnorm_flat(self): + #Check basic usage of groupnorm_flat # Testing for 1 == LayerNorm, 16 == GroupNorm, -1 == InstanceNorm groups=[-1,16,1] + shape=(64,) for i in groups: - - model=create_and_fit_Sequential_model(GroupNormalization(input_shape=(64,),groups=i)) + model=create_and_fit_Sequential_model(GroupNormalization(groups=i),shape) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_all_in_graph_and_eager_modes + + def test_layernorm_flat(self): + # Check basic usage of layernorm + model=create_and_fit_Sequential_model(LayerNormalization(),(64,)) + self.assertTrue(hasattr(model.layers[0],'gamma')) + self.assertTrue(hasattr(model.layers[0],'beta')) + + + def test_instancenorm_flat(self): + # Check basic usage of instancenorm + model=create_and_fit_Sequential_model(InstanceNormalization(),(64,)) + self.assertTrue(hasattr(model.layers[0],'gamma')) + self.assertTrue(hasattr(model.layers[0],'beta')) + + + def test_initializer(self): + # Check if the initializer for gamma and beta is working correctly + + model=create_and_fit_Sequential_model(GroupNormalization(groups=32, + beta_initializer='random_normal', + beta_constraint='NonNeg', + gamma_initializer='random_normal', + gamma_constraint='NonNeg'), + (64,)) + + weights=np.array(model.layers[0].get_weights()) + 
negativ=weights[weights<0.0] + + self.assertTrue(len(weights)==0) + + def test_groupnorm_conv(self): + # Check if Axis is working for CONV nets # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm - #groups=[1,5,-1] - groups=[1] + groups=[-1,5,1] for i in groups: - model = keras.models.Sequential() - model.add(GroupNormalization( - input_shape=(20,20,3,),groups=i)) - + model.add(GroupNormalization(axis=1,groups=i,input_shape=(20,20,3))) model.add(keras.layers.Conv2D(5, (1, 1), padding='same')) - + model.add(keras.layers.Flatten()) + model.add(keras.layers.Dense(1,activation='softmax')) model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') - model.fit(np.random.random((10,20, 20, 3))) + x=np.random.randint(1000,size=(10,20, 20, 3)) + y=np.random.randint(1000,size=(10,1)) + a=model.fit(x=x,y=y,epochs=1) self.assertTrue(hasattr(model.layers[0], 'gamma')) From 0b041622829386ceef0d2e3842b53a12b2973960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Kr=C3=B6ger?= Date: Mon, 18 Feb 2019 00:19:31 +0100 Subject: [PATCH 10/26] Update maxout.py --- tensorflow_addons/layers/python/maxout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_addons/layers/python/maxout.py b/tensorflow_addons/layers/python/maxout.py index 55ffe57ebc..de7c697010 100644 --- a/tensorflow_addons/layers/python/maxout.py +++ b/tensorflow_addons/layers/python/maxout.py @@ -27,6 +27,7 @@ @keras_utils.register_keras_custom_object +class Maxout(Layer): """Applies Maxout to the input. "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron From 095d91ee4247f84f53cf069a910ce5edba25d500 Mon Sep 17 00:00:00 2001 From: Smokrow Date: Mon, 18 Feb 2019 00:30:00 +0100 Subject: [PATCH 11/26] small change to variable name --- tensorflow_addons/layers/python/normalizations_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index 2e1035b34f..a68f4c8ef1 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -95,8 +95,9 @@ def test_initializer(self): weights=np.array(model.layers[0].get_weights()) negativ=weights[weights<0.0] - - self.assertTrue(len(weights)==0) + print("------------------------------------------------------") + print(negativ) + self.assertTrue(len(negativ)==0) def test_groupnorm_conv(self): From 3b6d4e66c5694660a7d2d0d4cadcef1fa666dd4f Mon Sep 17 00:00:00 2001 From: smokrow Date: Sun, 24 Feb 2019 12:58:57 +0100 Subject: [PATCH 12/26] cleaned BUILD file --- tensorflow_addons/layers/BUILD | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index cad1dc0905..caad2f5a22 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -8,15 +8,16 @@ py_library( "__init__.py", "python/__init__.py", "python/maxout.py", + "python/normalizations.py", "python/poincare.py", - "python/wrappers.py", - "python/normalizations.py" + "python/wrappers.py" ]), srcs_version = "PY2AND3", ) py_test( name = "layers_wrappers_py_test", + size= "small", srcs = [ "python/wrappers_test.py", ], @@ -40,24 +41,15 @@ py_test( ], ) -py_test( - name = "poincare_py_test", - size = "small", - srcs = [ - "python/poincare_test.py", - ], - main = "python/poincare_test.py", - srcs_version = "PY2AND3", -) - py_test( name = "layers_normalizations_py_test", + size= "small", srcs = [ 
"python/normalizations_test.py", ], main = "python/normalizations_test.py", + srcs_version = "PY2AND3", deps = [ ":layers_py", ], - srcs_version = "PY2AND3", ) From b288cca37e78447ab543448e2b5beeff6efcd791 Mon Sep 17 00:00:00 2001 From: smokrow Date: Sun, 24 Feb 2019 13:09:52 +0100 Subject: [PATCH 13/26] cleaned docstring --- .../layers/python/normalizations.py | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 52251557e8..32a69d6528 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -20,22 +20,26 @@ from tensorflow.keras import backend as K from tensorflow.python.ops import nn + class GroupNormalization(Layer): """Group normalization layer. + Group Normalization divides the channels into groups and computes - within each group - the mean and variance for normalization. + within each group the mean and variance for normalization. Group Normalization's computation is independent - of batch sizes, and its accuracy is stable in a wide range of batch sizes. + of batch sizes, and its accuracy is stable in a wide range of batch sizes. + Relation to Layer Normalization: If the number of groups is set to 1, then this operation becomes identical to Layer Normalization. + Relation to Instance Normalization: If the number of groups is set to the input dimension (number of groups is equal to number of channels), then this operation becomes identical to Instance Normalization. - # Arguments + + Arguments groups: Integer, the number of groups for Group Normalization. Can be in the range [1, N] where N is the input dimension. The input dimension must be divisible by the number of groups. @@ -58,13 +62,15 @@ class GroupNormalization(Layer): gamma_regularizer: Optional regularizer for the gamma weight. beta_constraint: Optional constraint for the beta weight. gamma_constraint: Optional constraint for the gamma weight. - # Input shape + + Input shape Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. - # Output shape + + Output shape Same shape as input. - # References + References - [Group Normalization](https://arxiv.org/abs/1803.08494) """ @@ -205,11 +211,13 @@ def compute_output_shape(self, input_shape): class LayerNormalization(GroupNormalization): """Layer normalization layer. + Layer Normalization is an specific case of ```GroupNormalization```since it normalizes all features of a layer. The Groupsize is 1. Layer Normalization's computation is independent of batch sizes, and its accuracy is stable in a wide range of batch sizes. - # Arguments + + Arguments axis: Integer, the axis that should be normalized (typically the features axis). For instance, after a `Conv2D` layer with @@ -229,13 +237,16 @@ class LayerNormalization(GroupNormalization): gamma_regularizer: Optional regularizer for the gamma weight. beta_constraint: Optional constraint for the beta weight. gamma_constraint: Optional constraint for the gamma weight. - # Input shape + + Input shape Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. - # Output shape + + Output shape Same shape as input. 
- # References + + References - [Layer Normalization](https://arxiv.org/abs/1607.06450) """ def __init__(self,**kwargs): @@ -244,11 +255,13 @@ def __init__(self,**kwargs): class InstanceNormalization(GroupNormalization): """Instance normalization layer. + Instance Normalization is an specific case of ```GroupNormalization```since it normalizes all features of one channel. The Groupsize is equal to the channel size. Instance Normalization's computation is independent of batch sizes, and its accuracy is stable in a wide range of batch sizes. - # Arguments + + Arguments axis: Integer, the axis that should be normalized (typically the features axis). For instance, after a `Conv2D` layer with @@ -268,13 +281,16 @@ class InstanceNormalization(GroupNormalization): gamma_regularizer: Optional regularizer for the gamma weight. beta_constraint: Optional constraint for the beta weight. gamma_constraint: Optional constraint for the gamma weight. - # Input shape + + Input shape Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. - # Output shape + + Output shape Same shape as input. - # References + + References - [Layer Normalization](https://arxiv.org/abs/1607.06450) """ def __init__(self,**kwargs): From b7e3d779bb117b43afc908a5a573d80515b9bc8d Mon Sep 17 00:00:00 2001 From: smokrow Date: Sun, 24 Feb 2019 18:10:42 +0100 Subject: [PATCH 14/26] did some refactoring --- .../layers/python/normalizations.py | 94 +++++++++++++------ 1 file changed, 67 insertions(+), 27 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 32a69d6528..914e56b29b 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -15,9 +15,12 @@ # Orginal implementation from keras_contrib/layer/normalization -from tensorflow.keras.layers import Layer, InputSpec -from tensorflow.keras import initializers, regularizers, constraints from tensorflow.keras import backend as K +from tensorflow.keras import constraints +from tensorflow.keras import initializers +from tensorflow.keras import regularizers +from tensorflow.keras.layers import InputSpec +from tensorflow.keras.layers import Layer from tensorflow.python.ops import nn @@ -102,16 +105,35 @@ def __init__(self, self.gamma_constraint = constraints.get(gamma_constraint) def build(self, input_shape): - dim = input_shape[self.axis] + self._check_if_input_shape_is_None(input_shape) + self._set_number_of_groups_for_instance_norm(input_shape) + self._check_size_of_dimensions(input_shape) + self._create_input_spec(input_shape) + + self._add_gamma_weight(input_shape) + self._add_beta_weight(input_shape) + self.built = True + + def _check_if_input_shape_is_None(self, input_shape): + dim = input_shape[self.axis] if dim is None: raise ValueError('Axis ' + str(self.axis) + ' of ' 'input tensor should have a defined dimension ' 'but the layer received an input with shape ' + str(input_shape) + '.') + + + def _set_number_of_groups_for_instance_norm(self, input_shape): + dim=input_shape[self.axis] + if self.groups==-1: self.groups=dim + + def _check_size_of_dimensions(self,input_shape): + + dim=input_shape[self.axis] if dim < self.groups: raise ValueError('Number of groups (' + str(self.groups) + ') cannot be ' 'more than the number of channels (' + @@ -122,9 +144,18 @@ def build(self, input_shape): 'multiple of the number of channels (' + str(dim) + 
').') + + def _create_input_spec(self,input_shape): + + dim=input_shape[self.axis] self.input_spec = InputSpec(ndim=len(input_shape), axes={self.axis: dim}) - shape = (dim,) + + + def _add_gamma_weight(self,input_shape): + + dim=input_shape[self.axis] + shape=(dim,) if self.scale: self.gamma = self.add_weight(shape=shape, @@ -134,6 +165,12 @@ def build(self, input_shape): constraint=self.gamma_constraint) else: self.gamma = None + + def _add_beta_weight(self,input_shape): + + dim=input_shape[self.axis] + shape=(dim,) + if self.center: self.beta = self.add_weight(shape=shape, name='beta', @@ -142,47 +179,50 @@ def build(self, input_shape): constraint=self.beta_constraint) else: self.beta = None - self.built = True - def call(self, inputs): - input_shape = K.int_shape(inputs) - tensor_input_shape = K.shape(inputs) - # Prepare broadcasting shape. - reduction_axes = list(range(len(input_shape))) - del reduction_axes[self.axis] + def _create_broadcast_shape(self,input_shape): broadcast_shape = [1] * len(input_shape) broadcast_shape[self.axis] = input_shape[self.axis] // self.groups broadcast_shape.insert(1, self.groups) + return broadcast_shape + - reshape_group_shape = K.shape(inputs) - group_axes = [reshape_group_shape[i] for i in range(len(input_shape))] + def _create_group_shape(self,input_shape): + + group_axes = [tensor_input_shape[i] for i in range(len(input_shape))] group_axes[self.axis] = input_shape[self.axis] // self.groups group_axes.insert(1, self.groups) # reshape inputs to new group shape group_shape = [group_axes[0], self.groups] + group_axes[2:] group_shape = K.stack(group_shape) - inputs = K.reshape(inputs, group_shape) + return group_shape + + + def call(self, inputs): + input_shape = K.int_shape(inputs) + tensor_input_shape = K.shape(inputs) + + reshaped_inputs = K.reshape(inputs, group_shape) group_reduction_axes = list(range(len(group_axes))) mean, variance = nn.moments(inputs, group_reduction_axes[2:], keep_dims=True) inputs = (inputs - mean) / (K.sqrt(variance + self.epsilon)) - # prepare broadcast shape - inputs = K.reshape(inputs, group_shape) - - outputs = inputs - - # In this case we must explicitly broadcast all parameters. - if self.scale: - broadcast_gamma = K.reshape(self.gamma, broadcast_shape) - outputs = outputs * broadcast_gamma - - if self.center: - broadcast_beta = K.reshape(self.beta, broadcast_shape) - outputs = outputs + broadcast_beta + outputs = K.reshape(inputs, group_shape) + + if self.scale or self.center: + broadcast_shape=self._create_broadcast_shape(input_shape) + # In this case we must explicitly broadcast all parameters. 
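
For the default axis=-1 case, the shapes produced by _create_broadcast_shape
and the group reshape above line up as follows; the concrete numbers here
are made up for illustration:

    import numpy as np
    N, H, W, C, G = 2, 4, 4, 6, 3
    grouped = np.zeros((N, G, H, W, C // G))   # inputs after the group reshape
    gamma = np.ones((1, G, 1, 1, C // G))      # gamma reshaped to broadcast_shape
    print((grouped * gamma).shape)             # (2, 3, 4, 4, 2)

so the per-channel weights broadcast over the batch and spatial axes while
following the (groups, channels-per-group) split.
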
+ if self.scale: + broadcast_gamma = K.reshape(self.gamma, broadcast_shape) + outputs = outputs * broadcast_gamma + + if self.center: + broadcast_beta = K.reshape(self.beta, broadcast_shape) + outputs = outputs + broadcast_beta # finally we reshape the output back to the input shape outputs = K.reshape(outputs, tensor_input_shape) From 55cb1580129f3da22f5ccf4f36dfd9a280b3fd14 Mon Sep 17 00:00:00 2001 From: smokrow Date: Tue, 26 Feb 2019 10:44:03 +0100 Subject: [PATCH 15/26] refactored call function --- .../layers/python/normalizations.py | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 914e56b29b..ea363aeffb 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -188,41 +188,42 @@ def _create_broadcast_shape(self,input_shape): return broadcast_shape - def _create_group_shape(self,input_shape): + def _reshape_into_groups(self,input_shape): - group_axes = [tensor_input_shape[i] for i in range(len(input_shape))] - group_axes[self.axis] = input_shape[self.axis] // self.groups - group_axes.insert(1, self.groups) - - # reshape inputs to new group shape - group_shape = [group_axes[0], self.groups] + group_axes[2:] + group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] + group_shape[self.axis] = input_shape[self.axis] // self.groups + group_shape.insert(1, self.groups) group_shape = K.stack(group_shape) - return group_shape + reshaped_inputs = K.reshape(inputs, group_shape) + return reshaped_inputs, group_shape + def _apply_scale_or_center(self,inputs, input_shape): + broadcast_shape=self._create_broadcast_shape(input_shape) + if self.scale: + broadcast_gamma = K.reshape(self.gamma, broadcast_shape) + outputs = outputs * broadcast_gamma + + if self.center: + broadcast_beta = K.reshape(self.beta, broadcast_shape) + outputs = outputs + broadcast_beta + return outputs def call(self, inputs): + input_shape = K.int_shape(inputs) tensor_input_shape = K.shape(inputs) - reshaped_inputs = K.reshape(inputs, group_shape) + reshaped_inputs, group_shape=self._reshape_into_groups(input_shape) - group_reduction_axes = list(range(len(group_axes))) - mean, variance = nn.moments(inputs, group_reduction_axes[2:], + group_reduction_axes = list(range(len(group_shape))) + mean, variance = nn.moments(reshaped_inputs, group_reduction_axes[2:], keep_dims=True) - inputs = (inputs - mean) / (K.sqrt(variance + self.epsilon)) + inputs = (reshaped_inputs - mean) / (K.sqrt(variance + self.epsilon)) outputs = K.reshape(inputs, group_shape) if self.scale or self.center: - broadcast_shape=self._create_broadcast_shape(input_shape) - # In this case we must explicitly broadcast all parameters. 
- if self.scale: - broadcast_gamma = K.reshape(self.gamma, broadcast_shape) - outputs = outputs * broadcast_gamma - - if self.center: - broadcast_beta = K.reshape(self.beta, broadcast_shape) - outputs = outputs + broadcast_beta + outputs = self._apply_scale_or_center(outputs,input_shape) # finally we reshape the output back to the input shape outputs = K.reshape(outputs, tensor_input_shape) From 540492ecac0d59efcf518fdc691b291b178b8177 Mon Sep 17 00:00:00 2001 From: smokrow Date: Fri, 1 Mar 2019 20:05:09 +0100 Subject: [PATCH 16/26] fixed BUILD file --- tensorflow_addons/layers/BUILD | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index e2b6b4220e..8d3fea362b 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -11,11 +11,11 @@ py_library( "python/normalizations.py", "python/poincare.py", "python/wrappers.py" - ]), + ], srcs_version = "PY2AND3", deps = [ "//tensorflow_addons/utils:utils_py", - ], + ] ) py_test( @@ -28,7 +28,7 @@ py_test( srcs_version = "PY2AND3", deps = [ ":layers_py", - ], + ] ) py_test( @@ -41,7 +41,7 @@ py_test( srcs_version = "PY2AND3", deps = [ ":layers_py", - ], + ] ) py_test( @@ -54,5 +54,5 @@ py_test( srcs_version = "PY2AND3", deps = [ ":layers_py", - ], + ] ) From f980aa5ed58325233c81ba911afaf6a32ec2ddff Mon Sep 17 00:00:00 2001 From: smokrow Date: Fri, 1 Mar 2019 20:06:07 +0100 Subject: [PATCH 17/26] implemented batch_normalization from tf nn --- .../layers/python/normalizations.py | 147 ++++++++++-------- 1 file changed, 85 insertions(+), 62 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 0ac9dbfa37..4c90352eb7 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -104,6 +104,7 @@ def __init__(self, self.beta_constraint = constraints.get(beta_constraint) self.gamma_constraint = constraints.get(gamma_constraint) + def build(self, input_shape): self._check_if_input_shape_is_None(input_shape) @@ -116,6 +117,83 @@ def build(self, input_shape): self.built = True super(GroupNormalization, self).build(input_shape) + + def call(self, inputs): + + input_shape = K.int_shape(inputs) + tensor_input_shape = K.shape(inputs) + + reshaped_inputs, group_shape=self._reshape_into_groups(inputs,input_shape,tensor_input_shape) + + normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape) + + outputs = K.reshape(normalized_inputs, tensor_input_shape) + + return outputs + + + def get_config(self): + config = { + 'groups': self.groups, + 'axis': self.axis, + 'epsilon': self.epsilon, + 'center': self.center, + 'scale': self.scale, + 'beta_initializer': initializers.serialize(self.beta_initializer), + 'gamma_initializer': initializers.serialize(self.gamma_initializer), + 'beta_regularizer': regularizers.serialize(self.beta_regularizer), + 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), + 'beta_constraint': constraints.serialize(self.beta_constraint), + 'gamma_constraint': constraints.serialize(self.gamma_constraint) + } + base_config = super(GroupNormalization, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + + def compute_output_shape(self, input_shape): + return input_shape + + + def _reshape_into_groups(self,inputs,input_shape,tensor_input_shape): + + group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] + 
group_shape[self.axis] = input_shape[self.axis] // self.groups + group_shape.insert(1, self.groups) + group_shape = K.stack(group_shape) + reshaped_inputs = K.reshape(inputs, group_shape) + return reshaped_inputs, group_shape + + + def _apply_normalization(self, reshaped_inputs , input_shape): + + group_shape = K.int_shape(reshaped_inputs) + group_reduction_axes = list(range(len(group_shape))) + # Remember the ordering of the tensor is [batch, group , steps]. Jump the first 2 to calculate the variance and the mean + mean, variance = nn.moments(reshaped_inputs, group_reduction_axes[2:], + keep_dims=True) + + gamma,beta= self._get_reshaped_weights(input_shape) + normalized_inputs= nn.batch_normalization(reshaped_inputs, + mean = mean, + variance = variance, + scale = gamma, + offset = beta, + variance_epsilon = self.epsilon) + return normalized_inputs + + + def _get_reshaped_weights(self, input_shape): + broadcast_shape=self._create_broadcast_shape(input_shape) + gamma=None + beta=None + if self.scale: + gamma = K.reshape(self.gamma, broadcast_shape) + + if self.center: + beta = K.reshape(self.beta, broadcast_shape) + return gamma, beta + + def _check_if_input_shape_is_None(self, input_shape): dim = input_shape[self.axis] if dim is None: @@ -189,68 +267,6 @@ def _create_broadcast_shape(self,input_shape): return broadcast_shape - def _reshape_into_groups(self,input_shape): - - group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] - group_shape[self.axis] = input_shape[self.axis] // self.groups - group_shape.insert(1, self.groups) - group_shape = K.stack(group_shape) - reshaped_inputs = K.reshape(inputs, group_shape) - return reshaped_inputs, group_shape - - def _apply_scale_or_center(self,inputs, input_shape): - broadcast_shape=self._create_broadcast_shape(input_shape) - if self.scale: - broadcast_gamma = K.reshape(self.gamma, broadcast_shape) - outputs = outputs * broadcast_gamma - - if self.center: - broadcast_beta = K.reshape(self.beta, broadcast_shape) - outputs = outputs + broadcast_beta - return outputs - - def call(self, inputs): - - input_shape = K.int_shape(inputs) - tensor_input_shape = K.shape(inputs) - - reshaped_inputs, group_shape=self._reshape_into_groups(input_shape) - - group_reduction_axes = list(range(len(group_shape))) - mean, variance = nn.moments(reshaped_inputs, group_reduction_axes[2:], - keep_dims=True) - inputs = (reshaped_inputs - mean) / (K.sqrt(variance + self.epsilon)) - - outputs = K.reshape(inputs, group_shape) - - if self.scale or self.center: - outputs = self._apply_scale_or_center(outputs,input_shape) - - # finally we reshape the output back to the input shape - outputs = K.reshape(outputs, tensor_input_shape) - - return outputs - - def get_config(self): - config = { - 'groups': self.groups, - 'axis': self.axis, - 'epsilon': self.epsilon, - 'center': self.center, - 'scale': self.scale, - 'beta_initializer': initializers.serialize(self.beta_initializer), - 'gamma_initializer': initializers.serialize(self.gamma_initializer), - 'beta_regularizer': regularizers.serialize(self.beta_regularizer), - 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), - 'beta_constraint': constraints.serialize(self.beta_constraint), - 'gamma_constraint': constraints.serialize(self.gamma_constraint) - } - base_config = super(GroupNormalization, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def compute_output_shape(self, input_shape): - return input_shape - class LayerNormalization(GroupNormalization): """Layer 
normalization layer. @@ -292,6 +308,8 @@ class LayerNormalization(GroupNormalization): - [Layer Normalization](https://arxiv.org/abs/1607.06450) """ def __init__(self,**kwargs): + if "groups" in kwargs: + tf.logging.warning("The given value for groups will be overwritten.") kwargs["groups"]=1 super(LayerNormalization,self).__init__(**kwargs) @@ -336,5 +354,10 @@ class InstanceNormalization(GroupNormalization): - [Layer Normalization](https://arxiv.org/abs/1607.06450) """ def __init__(self,**kwargs): + + if "groups" in kwargs: + tf.logging.warning("The given value for groups will be overwritten.") + kwargs["groups"]=-1 super(InstanceNormalization,self).__init__(**kwargs) + From 576961d2027d980a32a06fac72a117e7951f0084 Mon Sep 17 00:00:00 2001 From: smokrow Date: Mon, 4 Mar 2019 11:44:22 +0100 Subject: [PATCH 18/26] added normalization and reshape test --- .../layers/python/normalizations.py | 6 +- .../layers/python/normalizations_test.py | 179 +++++++++++++----- 2 files changed, 139 insertions(+), 46 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 4c90352eb7..72b45f9f25 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -15,6 +15,7 @@ # Orginal implementation from keras_contrib/layer/normalization +import tensorflow as tf from tensorflow.keras import backend as K from tensorflow.keras import constraints from tensorflow.keras import initializers @@ -78,7 +79,7 @@ class GroupNormalization(Layer): """ def __init__(self, - groups=32, + groups=2, axis=-1, epsilon=1e-5, center=True, @@ -165,7 +166,7 @@ def _reshape_into_groups(self,inputs,input_shape,tensor_input_shape): def _apply_normalization(self, reshaped_inputs , input_shape): - + group_shape = K.int_shape(reshaped_inputs) group_reduction_axes = list(range(len(group_shape))) # Remember the ordering of the tensor is [batch, group , steps]. Jump the first 2 to calculate the variance and the mean @@ -360,4 +361,3 @@ def __init__(self,**kwargs): kwargs["groups"]=-1 super(InstanceNormalization,self).__init__(**kwargs) - diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index a68f4c8ef1..3bc472ae91 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -13,36 +13,89 @@ # limitations under the License. 
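
One property the new test_apply_normalization case below relies on: a group
whose entries are all equal is mapped to zeros, because x - mean(x) vanishes
before the 1 / sqrt(var + eps) scaling is applied. A quick NumPy check,
illustrative only:

    import numpy as np
    group = np.array([2.0, 2.0])
    print((group - group.mean()) / np.sqrt(group.var() + 1e-5))   # [0. 0.]
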
# ============================================================================= -from tensorflow_addons.layers.python.normalizations import GroupNormalization,LayerNormalization,InstanceNormalization import numpy as np import scipy as scipy import tensorflow as tf from tensorflow import keras as keras +from tensorflow_addons.layers.python.normalizations import GroupNormalization +from tensorflow_addons.layers.python.normalizations import InstanceNormalization +from tensorflow_addons.layers.python.normalizations import LayerNormalization +from tensorflow.python.framework import test_util as tf_test_util +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers import normalization from tensorflow.python.training.rmsprop import RMSPropOptimizer from tensorflow.python.platform import test -from tensorflow.python.framework import test_util as tf_test_util +from tensorflow.python.training import gradient_descent -def create_and_fit_Sequential_model(layer,shape): - #Helperfunction for quick evaluation - model=keras.models.Sequential() +def create_and_fit_Sequential_model(layer, shape): + # Helperfunction for quick evaluation + model = keras.models.Sequential() model.add(layer) model.add(keras.layers.Dense(32)) model.add(keras.layers.Dense(1)) - model.compile(optimizer=RMSPropOptimizer(0.01),loss="categorical_crossentropy") - layer_shape=(10,)+shape - input_batch=np.random.rand(*layer_shape) - output_batch=np.random.rand(*(10,1)) - model.fit(x=input_batch,y=output_batch, epochs=1, batch_size=1) + model.compile(optimizer=RMSPropOptimizer(0.01), + loss="categorical_crossentropy") + layer_shape = (10,) + shape + input_batch = np.random.rand(*layer_shape) + output_batch = np.random.rand(*(10, 1)) + model.fit(x=input_batch, y=output_batch, epochs=1, batch_size=1) return model class normalization_test(test.TestCase): +# ------------Tests to ensure proper inheritance. 
If these suceed you can test for Instance norm and Layernorm by setting Groupnorm groups = -1 or 1 + def test_inheritance(self): + self.assertTrue(issubclass(LayerNormalization, GroupNormalization)) + self.assertTrue(issubclass(InstanceNormalization, GroupNormalization)) + self.assertTrue(LayerNormalization.build==GroupNormalization.build) + self.assertTrue(InstanceNormalization.build==GroupNormalization.build) + self.assertTrue(LayerNormalization.call==GroupNormalization.call) + self.assertTrue(InstanceNormalization.call==GroupNormalization.call) + + + def test_groups_after_init(self): + layers=InstanceNormalization() + self.assertTrue(layers.groups==-1) + layers=LayerNormalization() + self.assertTrue(layers.groups==1) +# ----------------------------------------------------------------------------------------------------------------------------------------- + + def test_reshape(self): + def run_reshape_test(axis, group, input_shape, expected_shape): + + group_layer=GroupNormalization(groups=group,axis=axis) + group_layer._set_number_of_groups_for_instance_norm(input_shape) + + inputs=np.ones(input_shape) + tensor_input_shape=tf.convert_to_tensor(input_shape) + reshaped_inputs, group_shape=group_layer._reshape_into_groups(inputs,(10,10,10),tensor_input_shape) + for i in range(len(expected_shape)): + self.assertEqual(int(group_shape[i]),expected_shape[i]) + + input_shape=(10,10,10) + expected_shape=[10,5,10,2] + run_reshape_test(2,5,input_shape,expected_shape) + + input_shape=(10,10,10) + expected_shape=[10,2,5,10] + run_reshape_test(1,2,input_shape,expected_shape) + + input_shape=(10,10,10) + expected_shape=[10,10,1,10] + run_reshape_test(1,-1,input_shape,expected_shape) + + input_shape=(10,10,10) + expected_shape=[10,1,10,10] + run_reshape_test(1,1,input_shape,expected_shape) + + @tf_test_util.run_in_graph_and_eager_modes def test_weights(self): - #Check if weights get initialized - layer = GroupNormalization(groups=1,scale=False, center=False) + # Check if weights get initialized + layer = GroupNormalization(groups=1, scale=False, center=False) layer.build((None, 3, 4)) self.assertEqual(len(layer.trainable_weights), 0) self.assertEqual(len(layer.weights), 0) @@ -54,66 +107,106 @@ def test_weights(self): layer = InstanceNormalization() layer.build((None, 3, 4)) - self.assertEqual(len(layer.trainable_weights),2) - self.assertEqual(len(layer.weights),2) + self.assertEqual(len(layer.trainable_weights), 2) + self.assertEqual(len(layer.weights), 2) + + def test_apply_normalization(self): + + input_shape = (1,4) + expected_shape= (1,2,2) + reshaped_inputs= tf.constant([[[2.0,2.0],[3.0,3.0]]]) + layer=GroupNormalization(groups=2,axis=1,scale=False, center= False) + normalized_input=layer._apply_normalization(reshaped_inputs, input_shape) + self.assertTrue(tf.reduce_all(tf.equal(normalized_input,tf.constant([[[0.0,0.0],[0.0,0.0]]])))) + + + @tf_test_util.run_in_graph_and_eager_modes def test_groupnorm_flat(self): - #Check basic usage of groupnorm_flat + # Check basic usage of groupnorm_flat # Testing for 1 == LayerNorm, 16 == GroupNorm, -1 == InstanceNorm - groups=[-1,16,1] - shape=(64,) + + groups = [-1, 16, 1] + shape = (64,) for i in groups: - model=create_and_fit_Sequential_model(GroupNormalization(groups=i),shape) + model = create_and_fit_Sequential_model( + GroupNormalization(groups=i), shape) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) + @tf_test_util.run_in_graph_and_eager_modes def test_layernorm_flat(self): # Check basic usage of 
layernorm - model=create_and_fit_Sequential_model(LayerNormalization(),(64,)) - self.assertTrue(hasattr(model.layers[0],'gamma')) - self.assertTrue(hasattr(model.layers[0],'beta')) + model = create_and_fit_Sequential_model(LayerNormalization(), (64,)) + self.assertTrue(hasattr(model.layers[0], 'gamma')) + self.assertTrue(hasattr(model.layers[0], 'beta')) + + @tf_test_util.run_in_graph_and_eager_modes def test_instancenorm_flat(self): # Check basic usage of instancenorm - model=create_and_fit_Sequential_model(InstanceNormalization(),(64,)) - self.assertTrue(hasattr(model.layers[0],'gamma')) - self.assertTrue(hasattr(model.layers[0],'beta')) + + model = create_and_fit_Sequential_model(InstanceNormalization(), (64,)) + self.assertTrue(hasattr(model.layers[0], 'gamma')) + self.assertTrue(hasattr(model.layers[0], 'beta')) + @tf_test_util.run_in_graph_and_eager_modes def test_initializer(self): # Check if the initializer for gamma and beta is working correctly - model=create_and_fit_Sequential_model(GroupNormalization(groups=32, - beta_initializer='random_normal', - beta_constraint='NonNeg', - gamma_initializer='random_normal', - gamma_constraint='NonNeg'), - (64,)) - - weights=np.array(model.layers[0].get_weights()) - negativ=weights[weights<0.0] - print("------------------------------------------------------") - print(negativ) - self.assertTrue(len(negativ)==0) - + layer=GroupNormalization(groups=32, + beta_initializer='random_normal', + beta_constraint='NonNeg', + gamma_initializer='random_normal', + gamma_constraint='NonNeg') + + model = create_and_fit_Sequential_model(layer,(64,)) + + weights = np.array(model.layers[0].get_weights()) + negativ = weights[weights < 0.0] + self.assertTrue(len(negativ) == 0) + + + @tf_test_util.run_in_graph_and_eager_modes + def test_regularizations(self): + + layer = GroupNormalization( + gamma_regularizer='l1', + beta_regularizer='l1', + groups=4, + axis=2) + layer.build((None, 4, 4)) + self.assertEqual(len(layer.losses), 2) + max_norm = keras.constraints.max_norm + layer = GroupNormalization( + gamma_constraint=max_norm, + beta_constraint=max_norm) + layer.build((None, 3, 4)) + self.assertEqual(layer.gamma.constraint, max_norm) + self.assertEqual(layer.beta.constraint, max_norm) + + @tf_test_util.run_in_graph_and_eager_modes def test_groupnorm_conv(self): # Check if Axis is working for CONV nets # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm - groups=[-1,5,1] + + groups = [-1, 5, 1] for i in groups: model = keras.models.Sequential() - model.add(GroupNormalization(axis=1,groups=i,input_shape=(20,20,3))) + model.add(GroupNormalization( + axis=1, groups=i, input_shape=(20, 20, 3))) model.add(keras.layers.Conv2D(5, (1, 1), padding='same')) model.add(keras.layers.Flatten()) - model.add(keras.layers.Dense(1,activation='softmax')) + model.add(keras.layers.Dense(1, activation='softmax')) model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') - x=np.random.randint(1000,size=(10,20, 20, 3)) - y=np.random.randint(1000,size=(10,1)) - a=model.fit(x=x,y=y,epochs=1) + x = np.random.randint(1000, size=(10, 20, 20, 3)) + y = np.random.randint(1000, size=(10, 1)) + a = model.fit(x=x, y=y, epochs=1) self.assertTrue(hasattr(model.layers[0], 'gamma')) From 918eeb7a1af75b35ce0957db74a3439d7c0444ea Mon Sep 17 00:00:00 2001 From: smokrow Date: Mon, 4 Mar 2019 21:16:15 +0100 Subject: [PATCH 19/26] added axis check --- tensorflow_addons/layers/python/normalizations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 72b45f9f25..e49df0ac16 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -104,7 +104,7 @@ def __init__(self, self.gamma_regularizer = regularizers.get(gamma_regularizer) self.beta_constraint = constraints.get(beta_constraint) self.gamma_constraint = constraints.get(gamma_constraint) - + self._check_axis() def build(self, input_shape): @@ -224,7 +224,10 @@ def _check_size_of_dimensions(self,input_shape): 'multiple of the number of channels (' + str(dim) + ').') + def _check_axis(self): + if self.axis==0: + raise ValueError("You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead") def _create_input_spec(self,input_shape): dim=input_shape[self.axis] From d2c1afdf51f17e554c6911a2553988294292c9c9 Mon Sep 17 00:00:00 2001 From: smokrow Date: Mon, 4 Mar 2019 21:16:40 +0100 Subject: [PATCH 20/26] added manual layer test --- .../layers/python/normalizations_test.py | 101 ++++++++++++++---- 1 file changed, 83 insertions(+), 18 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index 3bc472ae91..841c99595f 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -29,20 +29,7 @@ from tensorflow.python.training import gradient_descent -def create_and_fit_Sequential_model(layer, shape): - # Helperfunction for quick evaluation - model = keras.models.Sequential() - model.add(layer) - model.add(keras.layers.Dense(32)) - model.add(keras.layers.Dense(1)) - - model.compile(optimizer=RMSPropOptimizer(0.01), - loss="categorical_crossentropy") - layer_shape = (10,) + shape - input_batch = np.random.rand(*layer_shape) - output_batch = np.random.rand(*(10, 1)) - model.fit(x=input_batch, y=output_batch, epochs=1, batch_size=1) - return model + class normalization_test(test.TestCase): @@ -92,6 +79,79 @@ def run_reshape_test(axis, group, input_shape, expected_shape): expected_shape=[10,1,10,10] run_reshape_test(1,1,input_shape,expected_shape) + def test_call_function(self): + + self._test_specific_layer(tf.random.normal((10,10,10)),1,1,False,True) + + def _test_specific_layer(self,inputs, axis, groups, center, scale): + + input_shape=inputs.shape + + layer=GroupNormalization(axis=axis,groups=groups,center=center,scale=scale) + + model= keras.models.Sequential() + model.add(layer) + + outputs=model.predict(inputs) + self.assertFalse(np.isnan(outputs).any()) + + if groups is -1: + groups=input_shape[axis] + np_inputs=inputs.numpy() + reshaped_dims=list(np_inputs.shape) + reshaped_dims[axis]=reshaped_dims[axis]//groups + reshaped_dims.insert(1,groups) + #reshaped_dims=np.array([reshaped_dims[0],groups,i for i in reshaped_dims[1:]]) + reshaped_inputs=np.reshape(np_inputs,tuple(reshaped_dims)) + mean = np.mean(reshaped_inputs, axis=tuple(range(2,len(reshaped_dims))),keepdims=True) + variance = np.var(reshaped_inputs,axis=tuple(range(2,len(reshaped_dims))),keepdims=True) + + gamma,beta=layer._get_reshaped_weights(input_shape) + print("GAMMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") + print(gamma.shape) + print(reshaped_dims) + print(np_inputs.shape) + print(gamma) + print(beta) + gamma=np.repeat(gamma, input_shape[0],axis=0) + 
print("GAMMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") + print(gamma.shape) + if gamma is None: + gamma=1.0 + if beta is None: + beta=0.0 + output_test=[] + + a=np_inputs-mean + output_test=((gamma*a)*(1/np.sqrt(variance+1e-5))+beta) + output_test=np.array(output_test) + print("OOOOOOUTPUUUUUUT") + print(output_test.shape) + output_test=np.reshape(output_test,input_shape.as_list()) + output_test=output_test.flatten() + + + outputs_tf= outputs.flatten() + for i in range(len(output_test)): + + self.assertAlmostEqual(output_test[i],outputs_tf[i],places=5) + return outputs + + def _create_and_fit_Sequential_model(self,layer, shape): + # Helperfunction for quick evaluation + model = keras.models.Sequential() + model.add(layer) + model.add(keras.layers.Dense(32)) + model.add(keras.layers.Dense(1)) + + model.compile(optimizer=RMSPropOptimizer(0.01), + loss="categorical_crossentropy") + layer_shape = (10,) + shape + input_batch = np.random.rand(*layer_shape) + output_batch = np.random.rand(*(10, 1)) + model.fit(x=input_batch, y=output_batch, epochs=1, batch_size=1) + return model + @tf_test_util.run_in_graph_and_eager_modes def test_weights(self): # Check if weights get initialized @@ -119,9 +179,14 @@ def test_apply_normalization(self): normalized_input=layer._apply_normalization(reshaped_inputs, input_shape) self.assertTrue(tf.reduce_all(tf.equal(normalized_input,tf.constant([[[0.0,0.0],[0.0,0.0]]])))) + def test_axis_error(self): + + with self.assertRaises(ValueError): + GroupNormalization(axis=0) + @tf_test_util.run_in_graph_and_eager_modes def test_groupnorm_flat(self): # Check basic usage of groupnorm_flat @@ -130,7 +195,7 @@ def test_groupnorm_flat(self): groups = [-1, 16, 1] shape = (64,) for i in groups: - model = create_and_fit_Sequential_model( + model = self._create_and_fit_Sequential_model( GroupNormalization(groups=i), shape) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) @@ -140,7 +205,7 @@ def test_groupnorm_flat(self): def test_layernorm_flat(self): # Check basic usage of layernorm - model = create_and_fit_Sequential_model(LayerNormalization(), (64,)) + model = self._create_and_fit_Sequential_model(LayerNormalization(), (64,)) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) @@ -149,7 +214,7 @@ def test_layernorm_flat(self): def test_instancenorm_flat(self): # Check basic usage of instancenorm - model = create_and_fit_Sequential_model(InstanceNormalization(), (64,)) + model = self._create_and_fit_Sequential_model(InstanceNormalization(), (64,)) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) @@ -164,7 +229,7 @@ def test_initializer(self): gamma_initializer='random_normal', gamma_constraint='NonNeg') - model = create_and_fit_Sequential_model(layer,(64,)) + model = self._create_and_fit_Sequential_model(layer,(64,)) weights = np.array(model.layers[0].get_weights()) negativ = weights[weights < 0.0] From 4ebd90757e848c6d8020129e21cfe978d3ac0bd9 Mon Sep 17 00:00:00 2001 From: smokrow Date: Tue, 5 Mar 2019 22:29:16 +0100 Subject: [PATCH 21/26] added tests to check normalization with numpy --- .../layers/python/normalizations_test.py | 238 +++++++++--------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index 841c99595f..3b83d70fb6 100644 
--- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -29,115 +29,119 @@ from tensorflow.python.training import gradient_descent - - - class normalization_test(test.TestCase): -# ------------Tests to ensure proper inheritance. If these suceed you can test for Instance norm and Layernorm by setting Groupnorm groups = -1 or 1 + # ------------Tests to ensure proper inheritance. If these suceed you can test for Instance norm and Layernorm by setting Groupnorm groups = -1 or 1 def test_inheritance(self): self.assertTrue(issubclass(LayerNormalization, GroupNormalization)) self.assertTrue(issubclass(InstanceNormalization, GroupNormalization)) - self.assertTrue(LayerNormalization.build==GroupNormalization.build) - self.assertTrue(InstanceNormalization.build==GroupNormalization.build) - self.assertTrue(LayerNormalization.call==GroupNormalization.call) - self.assertTrue(InstanceNormalization.call==GroupNormalization.call) - + self.assertTrue(LayerNormalization.build == GroupNormalization.build) + self.assertTrue(InstanceNormalization.build == + GroupNormalization.build) + self.assertTrue(LayerNormalization.call == GroupNormalization.call) + self.assertTrue(InstanceNormalization.call == GroupNormalization.call) def test_groups_after_init(self): - layers=InstanceNormalization() - self.assertTrue(layers.groups==-1) - layers=LayerNormalization() - self.assertTrue(layers.groups==1) + layers = InstanceNormalization() + self.assertTrue(layers.groups == -1) + layers = LayerNormalization() + self.assertTrue(layers.groups == 1) # ----------------------------------------------------------------------------------------------------------------------------------------- def test_reshape(self): def run_reshape_test(axis, group, input_shape, expected_shape): - group_layer=GroupNormalization(groups=group,axis=axis) + group_layer = GroupNormalization(groups=group, axis=axis) group_layer._set_number_of_groups_for_instance_norm(input_shape) - inputs=np.ones(input_shape) - tensor_input_shape=tf.convert_to_tensor(input_shape) - reshaped_inputs, group_shape=group_layer._reshape_into_groups(inputs,(10,10,10),tensor_input_shape) + inputs = np.ones(input_shape) + tensor_input_shape = tf.convert_to_tensor(input_shape) + reshaped_inputs, group_shape = group_layer._reshape_into_groups( + inputs, (10, 10, 10), tensor_input_shape) for i in range(len(expected_shape)): - self.assertEqual(int(group_shape[i]),expected_shape[i]) - - input_shape=(10,10,10) - expected_shape=[10,5,10,2] - run_reshape_test(2,5,input_shape,expected_shape) - - input_shape=(10,10,10) - expected_shape=[10,2,5,10] - run_reshape_test(1,2,input_shape,expected_shape) - - input_shape=(10,10,10) - expected_shape=[10,10,1,10] - run_reshape_test(1,-1,input_shape,expected_shape) - - input_shape=(10,10,10) - expected_shape=[10,1,10,10] - run_reshape_test(1,1,input_shape,expected_shape) - - def test_call_function(self): - - self._test_specific_layer(tf.random.normal((10,10,10)),1,1,False,True) - - def _test_specific_layer(self,inputs, axis, groups, center, scale): - - input_shape=inputs.shape - - layer=GroupNormalization(axis=axis,groups=groups,center=center,scale=scale) - - model= keras.models.Sequential() + self.assertEqual(int(group_shape[i]), expected_shape[i]) + + input_shape = (10, 10, 10) + expected_shape = [10, 5, 10, 2] + run_reshape_test(2, 5, input_shape, expected_shape) + + input_shape = (10, 10, 10) + expected_shape = [10, 2, 5, 10] + run_reshape_test(1, 2, input_shape, expected_shape) + 
+ input_shape = (10, 10, 10) + expected_shape = [10, 10, 1, 10] + run_reshape_test(1, -1, input_shape, expected_shape) + + input_shape = (10, 10, 10) + expected_shape = [10, 1, 10, 10] + run_reshape_test(1, 1, input_shape, expected_shape) + + def test_feature_input(self): + shape = (10, 100) + for center in [True, False]: + for scale in [True, False]: + for groups in [-1, 1, 2, 5]: + self._test_random_shape_on_all_axis_except_batch( + shape, groups, center, scale) + + def test_picture_input(self): + shape = (10, 30, 30, 3) + for center in [True, False]: + for scale in [True, False]: + for groups in [-1, 1, 3]: + self._test_random_shape_on_all_axis_except_batch( + shape, groups, center, scale) + + def _test_random_shape_on_all_axis_except_batch(self, shape, groups, center, scale): + inputs = tf.random.normal((shape)) + for axis in range(1, len(shape)): + self._test_specific_layer(inputs, axis, groups, center, scale) + + def _test_specific_layer(self, inputs, axis, groups, center, scale): + + input_shape = inputs.shape + + # Get Output from Keras model + layer = GroupNormalization( + axis=axis, groups=groups, center=center, scale=scale) + model = keras.models.Sequential() model.add(layer) - - outputs=model.predict(inputs) + outputs = model.predict(inputs) self.assertFalse(np.isnan(outputs).any()) + # Create shapes if groups is -1: - groups=input_shape[axis] - np_inputs=inputs.numpy() - reshaped_dims=list(np_inputs.shape) - reshaped_dims[axis]=reshaped_dims[axis]//groups - reshaped_dims.insert(1,groups) - #reshaped_dims=np.array([reshaped_dims[0],groups,i for i in reshaped_dims[1:]]) - reshaped_inputs=np.reshape(np_inputs,tuple(reshaped_dims)) - mean = np.mean(reshaped_inputs, axis=tuple(range(2,len(reshaped_dims))),keepdims=True) - variance = np.var(reshaped_inputs,axis=tuple(range(2,len(reshaped_dims))),keepdims=True) - - gamma,beta=layer._get_reshaped_weights(input_shape) - print("GAMMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") - print(gamma.shape) - print(reshaped_dims) - print(np_inputs.shape) - print(gamma) - print(beta) - gamma=np.repeat(gamma, input_shape[0],axis=0) - print("GAMMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") - print(gamma.shape) + groups = input_shape[axis] + np_inputs = inputs.numpy() + reshaped_dims = list(np_inputs.shape) + reshaped_dims[axis] = reshaped_dims[axis] // groups + reshaped_dims.insert(1, groups) + reshaped_inputs = np.reshape(np_inputs, tuple(reshaped_dims)) + + # Calculate mean and variance + mean = np.mean(reshaped_inputs, axis=tuple( + range(2, len(reshaped_dims))), keepdims=True) + variance = np.var(reshaped_inputs, axis=tuple( + range(2, len(reshaped_dims))), keepdims=True) + + # Get gamma and beta initalized by layer + gamma, beta = layer._get_reshaped_weights(input_shape) if gamma is None: - gamma=1.0 + gamma = 1.0 if beta is None: - beta=0.0 - output_test=[] - - a=np_inputs-mean - output_test=((gamma*a)*(1/np.sqrt(variance+1e-5))+beta) - output_test=np.array(output_test) - print("OOOOOOUTPUUUUUUT") - print(output_test.shape) - output_test=np.reshape(output_test,input_shape.as_list()) - output_test=output_test.flatten() - - - outputs_tf= outputs.flatten() - for i in range(len(output_test)): - - self.assertAlmostEqual(output_test[i],outputs_tf[i],places=5) - return outputs - - def _create_and_fit_Sequential_model(self,layer, shape): + beta = 0.0 + + # Get ouput from Numpy + zeroed = reshaped_inputs - mean + rsqrt = 1 / 
np.sqrt(variance + 1e-5) + output_test = gamma * zeroed * rsqrt + beta + + # compare outputs + output_test = np.reshape(output_test, input_shape.as_list()) + self.assertAlmostEqual(np.mean(output_test - outputs), 0, places=7) + + def _create_and_fit_Sequential_model(self, layer, shape): # Helperfunction for quick evaluation model = keras.models.Sequential() model.add(layer) @@ -154,7 +158,7 @@ def _create_and_fit_Sequential_model(self,layer, shape): @tf_test_util.run_in_graph_and_eager_modes def test_weights(self): - # Check if weights get initialized + # Check if weights get initialized correctly layer = GroupNormalization(groups=1, scale=False, center=False) layer.build((None, 3, 4)) self.assertEqual(len(layer.trainable_weights), 0) @@ -169,24 +173,23 @@ def test_weights(self): layer.build((None, 3, 4)) self.assertEqual(len(layer.trainable_weights), 2) self.assertEqual(len(layer.weights), 2) - + def test_apply_normalization(self): - - input_shape = (1,4) - expected_shape= (1,2,2) - reshaped_inputs= tf.constant([[[2.0,2.0],[3.0,3.0]]]) - layer=GroupNormalization(groups=2,axis=1,scale=False, center= False) - normalized_input=layer._apply_normalization(reshaped_inputs, input_shape) - self.assertTrue(tf.reduce_all(tf.equal(normalized_input,tf.constant([[[0.0,0.0],[0.0,0.0]]])))) + + input_shape = (1, 4) + expected_shape = (1, 2, 2) + reshaped_inputs = tf.constant([[[2.0, 2.0], [3.0, 3.0]]]) + layer = GroupNormalization(groups=2, axis=1, scale=False, center=False) + normalized_input = layer._apply_normalization( + reshaped_inputs, input_shape) + self.assertTrue(tf.reduce_all( + tf.equal(normalized_input, tf.constant([[[0.0, 0.0], [0.0, 0.0]]])))) def test_axis_error(self): with self.assertRaises(ValueError): GroupNormalization(axis=0) - - - @tf_test_util.run_in_graph_and_eager_modes def test_groupnorm_flat(self): # Check basic usage of groupnorm_flat @@ -200,60 +203,57 @@ def test_groupnorm_flat(self): self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_in_graph_and_eager_modes def test_layernorm_flat(self): # Check basic usage of layernorm - model = self._create_and_fit_Sequential_model(LayerNormalization(), (64,)) + model = self._create_and_fit_Sequential_model( + LayerNormalization(), (64,)) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_in_graph_and_eager_modes def test_instancenorm_flat(self): # Check basic usage of instancenorm - model = self._create_and_fit_Sequential_model(InstanceNormalization(), (64,)) + model = self._create_and_fit_Sequential_model( + InstanceNormalization(), (64,)) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_in_graph_and_eager_modes def test_initializer(self): # Check if the initializer for gamma and beta is working correctly - layer=GroupNormalization(groups=32, - beta_initializer='random_normal', - beta_constraint='NonNeg', - gamma_initializer='random_normal', - gamma_constraint='NonNeg') + layer = GroupNormalization(groups=32, + beta_initializer='random_normal', + beta_constraint='NonNeg', + gamma_initializer='random_normal', + gamma_constraint='NonNeg') - model = self._create_and_fit_Sequential_model(layer,(64,)) + model = self._create_and_fit_Sequential_model(layer, (64,)) weights = np.array(model.layers[0].get_weights()) negativ = weights[weights < 0.0] self.assertTrue(len(negativ) == 0) - @tf_test_util.run_in_graph_and_eager_modes def 
test_regularizations(self): layer = GroupNormalization( - gamma_regularizer='l1', - beta_regularizer='l1', - groups=4, - axis=2) + gamma_regularizer='l1', + beta_regularizer='l1', + groups=4, + axis=2) layer.build((None, 4, 4)) self.assertEqual(len(layer.losses), 2) max_norm = keras.constraints.max_norm layer = GroupNormalization( - gamma_constraint=max_norm, - beta_constraint=max_norm) + gamma_constraint=max_norm, + beta_constraint=max_norm) layer.build((None, 3, 4)) self.assertEqual(layer.gamma.constraint, max_norm) self.assertEqual(layer.beta.constraint, max_norm) - @tf_test_util.run_in_graph_and_eager_modes def test_groupnorm_conv(self): From 37244c4d8ed862659a40625f4ab24f66fb9d9a1d Mon Sep 17 00:00:00 2001 From: smokrow Date: Tue, 5 Mar 2019 23:10:42 +0100 Subject: [PATCH 22/26] Included some comments from @ppwwyyxx --- tensorflow_addons/layers/python/normalizations.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index e49df0ac16..57325960fd 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -30,8 +30,9 @@ class GroupNormalization(Layer): Group Normalization divides the channels into groups and computes within each group the mean and variance for normalization. - Group Normalization's computation is independent - of batch sizes, and its accuracy is stable in a wide range of batch sizes. + Empirically, its accuracy is more stable than batch norm in a wide + range of small batch sizes, if learning rate is adjusted linearly + with batch sizes. Relation to Layer Normalization: If the number of groups is set to 1, then this operation becomes identical to @@ -276,8 +277,9 @@ class LayerNormalization(GroupNormalization): Layer Normalization is an specific case of ```GroupNormalization```since it normalizes all features of a layer. The Groupsize is 1. - Layer Normalization's computation is independent - of batch sizes, and its accuracy is stable in a wide range of batch sizes. + Empirically, its accuracy is more stable than batch norm in a wide + range of small batch sizes, if learning rate is adjusted linearly + with batch sizes. Arguments axis: Integer, the axis that should be normalized @@ -322,8 +324,9 @@ class InstanceNormalization(GroupNormalization): Instance Normalization is an specific case of ```GroupNormalization```since it normalizes all features of one channel. The Groupsize is equal to the channel size. - Instance Normalization's computation is independent - of batch sizes, and its accuracy is stable in a wide range of batch sizes. + Empirically, its accuracy is more stable than batch norm in a wide + range of small batch sizes, if learning rate is adjusted linearly + with batch sizes. 
Arguments axis: Integer, the axis that should be normalized From 06694664b691b09e78106542d9b2ef1400e76950 Mon Sep 17 00:00:00 2001 From: smokrow Date: Tue, 5 Mar 2019 23:11:41 +0100 Subject: [PATCH 23/26] beautified --- .../layers/python/normalizations.py | 108 +++++++++--------- 1 file changed, 53 insertions(+), 55 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 57325960fd..579fb026ad 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -30,8 +30,8 @@ class GroupNormalization(Layer): Group Normalization divides the channels into groups and computes within each group the mean and variance for normalization. - Empirically, its accuracy is more stable than batch norm in a wide - range of small batch sizes, if learning rate is adjusted linearly + Empirically, its accuracy is more stable than batch norm in a wide + range of small batch sizes, if learning rate is adjusted linearly with batch sizes. Relation to Layer Normalization: @@ -119,21 +119,21 @@ def build(self, input_shape): self.built = True super(GroupNormalization, self).build(input_shape) - def call(self, inputs): input_shape = K.int_shape(inputs) tensor_input_shape = K.shape(inputs) - reshaped_inputs, group_shape=self._reshape_into_groups(inputs,input_shape,tensor_input_shape) + reshaped_inputs, group_shape = self._reshape_into_groups( + inputs, input_shape, tensor_input_shape) - normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape) + normalized_inputs = self._apply_normalization( + reshaped_inputs, input_shape) outputs = K.reshape(normalized_inputs, tensor_input_shape) return outputs - def get_config(self): config = { 'groups': self.groups, @@ -151,12 +151,10 @@ def get_config(self): base_config = super(GroupNormalization, self).get_config() return dict(list(base_config.items()) + list(config.items())) - def compute_output_shape(self, input_shape): return input_shape - - def _reshape_into_groups(self,inputs,input_shape,tensor_input_shape): + def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] group_shape[self.axis] = input_shape[self.axis] // self.groups @@ -165,8 +163,7 @@ def _reshape_into_groups(self,inputs,input_shape,tensor_input_shape): reshaped_inputs = K.reshape(inputs, group_shape) return reshaped_inputs, group_shape - - def _apply_normalization(self, reshaped_inputs , input_shape): + def _apply_normalization(self, reshaped_inputs, input_shape): group_shape = K.int_shape(reshaped_inputs) group_reduction_axes = list(range(len(group_shape))) @@ -174,20 +171,19 @@ def _apply_normalization(self, reshaped_inputs , input_shape): mean, variance = nn.moments(reshaped_inputs, group_reduction_axes[2:], keep_dims=True) - gamma,beta= self._get_reshaped_weights(input_shape) - normalized_inputs= nn.batch_normalization(reshaped_inputs, - mean = mean, - variance = variance, - scale = gamma, - offset = beta, - variance_epsilon = self.epsilon) + gamma, beta = self._get_reshaped_weights(input_shape) + normalized_inputs = nn.batch_normalization(reshaped_inputs, + mean=mean, + variance=variance, + scale=gamma, + offset=beta, + variance_epsilon=self.epsilon) return normalized_inputs - def _get_reshaped_weights(self, input_shape): - broadcast_shape=self._create_broadcast_shape(input_shape) - gamma=None - beta=None + broadcast_shape = self._create_broadcast_shape(input_shape) + 
gamma = None + beta = None if self.scale: gamma = K.reshape(self.gamma, broadcast_shape) @@ -195,7 +191,6 @@ def _get_reshaped_weights(self, input_shape): beta = K.reshape(self.beta, broadcast_shape) return gamma, beta - def _check_if_input_shape_is_None(self, input_shape): dim = input_shape[self.axis] if dim is None: @@ -204,17 +199,15 @@ def _check_if_input_shape_is_None(self, input_shape): 'but the layer received an input with shape ' + str(input_shape) + '.') - def _set_number_of_groups_for_instance_norm(self, input_shape): - dim=input_shape[self.axis] - - if self.groups==-1: - self.groups=dim + dim = input_shape[self.axis] + if self.groups == -1: + self.groups = dim - def _check_size_of_dimensions(self,input_shape): + def _check_size_of_dimensions(self, input_shape): - dim=input_shape[self.axis] + dim = input_shape[self.axis] if dim < self.groups: raise ValueError('Number of groups (' + str(self.groups) + ') cannot be ' 'more than the number of channels (' + @@ -227,19 +220,20 @@ def _check_size_of_dimensions(self,input_shape): def _check_axis(self): - if self.axis==0: - raise ValueError("You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead") - def _create_input_spec(self,input_shape): + if self.axis == 0: + raise ValueError( + "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead") + + def _create_input_spec(self, input_shape): - dim=input_shape[self.axis] + dim = input_shape[self.axis] self.input_spec = InputSpec(ndim=len(input_shape), axes={self.axis: dim}) + def _add_gamma_weight(self, input_shape): - def _add_gamma_weight(self,input_shape): - - dim=input_shape[self.axis] - shape=(dim,) + dim = input_shape[self.axis] + shape = (dim,) if self.scale: self.gamma = self.add_weight(shape=shape, @@ -250,10 +244,10 @@ def _add_gamma_weight(self,input_shape): else: self.gamma = None - def _add_beta_weight(self,input_shape): + def _add_beta_weight(self, input_shape): - dim=input_shape[self.axis] - shape=(dim,) + dim = input_shape[self.axis] + shape = (dim,) if self.center: self.beta = self.add_weight(shape=shape, @@ -264,8 +258,7 @@ def _add_beta_weight(self,input_shape): else: self.beta = None - - def _create_broadcast_shape(self,input_shape): + def _create_broadcast_shape(self, input_shape): broadcast_shape = [1] * len(input_shape) broadcast_shape[self.axis] = input_shape[self.axis] // self.groups broadcast_shape.insert(1, self.groups) @@ -277,8 +270,8 @@ class LayerNormalization(GroupNormalization): Layer Normalization is an specific case of ```GroupNormalization```since it normalizes all features of a layer. The Groupsize is 1. - Empirically, its accuracy is more stable than batch norm in a wide - range of small batch sizes, if learning rate is adjusted linearly + Empirically, its accuracy is more stable than batch norm in a wide + range of small batch sizes, if learning rate is adjusted linearly with batch sizes. 
Arguments @@ -313,19 +306,22 @@ class LayerNormalization(GroupNormalization): References - [Layer Normalization](https://arxiv.org/abs/1607.06450) """ - def __init__(self,**kwargs): + + def __init__(self, **kwargs): if "groups" in kwargs: - tf.logging.warning("The given value for groups will be overwritten.") - kwargs["groups"]=1 - super(LayerNormalization,self).__init__(**kwargs) + tf.logging.warning( + "The given value for groups will be overwritten.") + kwargs["groups"] = 1 + super(LayerNormalization, self).__init__(**kwargs) + class InstanceNormalization(GroupNormalization): """Instance normalization layer. Instance Normalization is an specific case of ```GroupNormalization```since it normalizes all features of one channel. The Groupsize is equal to the channel size. - Empirically, its accuracy is more stable than batch norm in a wide - range of small batch sizes, if learning rate is adjusted linearly + Empirically, its accuracy is more stable than batch norm in a wide + range of small batch sizes, if learning rate is adjusted linearly with batch sizes. Arguments @@ -360,10 +356,12 @@ class InstanceNormalization(GroupNormalization): References - [Layer Normalization](https://arxiv.org/abs/1607.06450) """ - def __init__(self,**kwargs): + + def __init__(self, **kwargs): if "groups" in kwargs: - tf.logging.warning("The given value for groups will be overwritten.") + tf.logging.warning( + "The given value for groups will be overwritten.") - kwargs["groups"]=-1 - super(InstanceNormalization,self).__init__(**kwargs) + kwargs["groups"] = -1 + super(InstanceNormalization, self).__init__(**kwargs) From b4613aecd05feb74f7b4464b2812fd01e9605eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Kr=C3=B6ger?= Date: Thu, 7 Mar 2019 20:54:02 +0100 Subject: [PATCH 24/26] Update normalizations.py removed wrong documentation --- tensorflow_addons/layers/python/normalizations.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 579fb026ad..239e77245d 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -58,9 +58,6 @@ class GroupNormalization(Layer): If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. - When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling - will be done by the next layer. beta_initializer: Initializer for the beta weight. gamma_initializer: Initializer for the gamma weight. beta_regularizer: Optional regularizer for the beta weight. @@ -285,9 +282,6 @@ class LayerNormalization(GroupNormalization): If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. - When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling - will be done by the next layer. beta_initializer: Initializer for the beta weight. gamma_initializer: Initializer for the gamma weight. beta_regularizer: Optional regularizer for the beta weight. @@ -335,9 +329,6 @@ class InstanceNormalization(GroupNormalization): If False, `beta` is ignored. scale: If True, multiply by `gamma`. If False, `gamma` is not used. - When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling - will be done by the next layer. beta_initializer: Initializer for the beta weight. gamma_initializer: Initializer for the gamma weight. 
beta_regularizer: Optional regularizer for the beta weight. From fcd163935468ff182cd95246335211828c466b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20Kr=C3=B6ger?= Date: Thu, 7 Mar 2019 21:00:32 +0100 Subject: [PATCH 25/26] Update normalizations.py Removed explanation of layers. Will be added to colab --- .../layers/python/normalizations.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index 239e77245d..adc3929dcd 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -48,11 +48,7 @@ class GroupNormalization(Layer): groups: Integer, the number of groups for Group Normalization. Can be in the range [1, N] where N is the input dimension. The input dimension must be divisible by the number of groups. - axis: Integer, the axis that should be normalized - (typically the features axis). - For instance, after a `Conv2D` layer with - `data_format="channels_first"`, - set `axis=1` in `BatchNormalization`. + axis: Integer, the axis that should be normalized. epsilon: Small float added to variance to avoid dividing by zero. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. @@ -272,11 +268,7 @@ class LayerNormalization(GroupNormalization): with batch sizes. Arguments - axis: Integer, the axis that should be normalized - (typically the features axis). - For instance, after a `Conv2D` layer with - `data_format="channels_first"`, - set `axis=1` in `BatchNormalization`. + axis: Integer, the axis that should be normalized. epsilon: Small float added to variance to avoid dividing by zero. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. @@ -319,11 +311,7 @@ class InstanceNormalization(GroupNormalization): with batch sizes. Arguments - axis: Integer, the axis that should be normalized - (typically the features axis). - For instance, after a `Conv2D` layer with - `data_format="channels_first"`, - set `axis=1` in `BatchNormalization`. + axis: Integer, the axis that should be normalized. epsilon: Small float added to variance to avoid dividing by zero. center: If True, add offset of `beta` to normalized tensor. If False, `beta` is ignored. 
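Taken together, the three layers added in this series are related purely by the choice of `groups`: with the exports added to `tensorflow_addons.layers` in the final patch, `GroupNormalization(groups=1)` behaves like `LayerNormalization()` and `GroupNormalization(groups=-1)` behaves like `InstanceNormalization()`. The snippet below is a minimal usage sketch of that relation, not part of the patches themselves; the input shape and the group count of 4 are illustrative only, and the comparisons rely on the default ones/zeros initializers for gamma and beta.

import numpy as np
import tensorflow as tf
from tensorflow_addons.layers import GroupNormalization
from tensorflow_addons.layers import InstanceNormalization
from tensorflow_addons.layers import LayerNormalization

# Illustrative 4D input: (batch, height, width, channels).
x = tf.random.normal((8, 32, 32, 16))

# groups must evenly divide the channel dimension (16 % 4 == 0).
group_norm = GroupNormalization(groups=4, axis=-1)
print(group_norm(x).shape)  # (8, 32, 32, 16), same shape as the input

# groups=1 normalizes over all non-batch axes -> LayerNormalization.
np.testing.assert_allclose(
    GroupNormalization(groups=1)(x).numpy(),
    LayerNormalization()(x).numpy(),
    rtol=1e-5, atol=1e-6)

# groups=-1 uses one group per channel -> InstanceNormalization.
np.testing.assert_allclose(
    GroupNormalization(groups=-1)(x).numpy(),
    InstanceNormalization()(x).numpy(),
    rtol=1e-5, atol=1e-6)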
From 429ded25e67b1c8abe4a5c26c6204cf437fbe7de Mon Sep 17 00:00:00 2001 From: Sean Morgan Date: Sat, 9 Mar 2019 14:55:27 -0500 Subject: [PATCH 26/26] * Standardize formatting with project * Remove tf.logging as part of TF2 * Add normaliztion layers to init * Update READMEs --- README.md | 3 + tensorflow_addons/layers/README.md | 3 + tensorflow_addons/layers/__init__.py | 3 + .../layers/python/normalizations.py | 171 ++++++++++-------- .../layers/python/normalizations_test.py | 125 ++++++------- 5 files changed, 166 insertions(+), 139 deletions(-) diff --git a/README.md b/README.md index c165bf06e9..4a1e2fc174 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,9 @@ developments that cannot be integrated into core TensorFlow |:----------------------- |:----------- |:---------------------------- | | tfa.activations | Sparsemax | https://arxiv.org/abs/1602.02068 | | tfa.image | transform | | +| tfa.layers | GroupNormalization | https://arxiv.org/abs/1803.08494 | +| tfa.layers | InstanceNormalization | https://arxiv.org/abs/1607.08022 | +| tfa.layers | LayerNormalization | https://arxiv.org/abs/1607.06450 | | tfa.layers | Maxout | https://arxiv.org/abs/1302.4389 | | tfa.layers | PoinareNormalize | https://arxiv.org/abs/1705.08039 | | tfa.layers | WeightNormalization | https://arxiv.org/abs/1602.07868 | diff --git a/tensorflow_addons/layers/README.md b/tensorflow_addons/layers/README.md index c9832c87c1..9e34f0ac5d 100644 --- a/tensorflow_addons/layers/README.md +++ b/tensorflow_addons/layers/README.md @@ -3,6 +3,9 @@ ## Contents | Layer | Reference | |:----------------------- |:-----------------------------| +| GroupNormalization | https://arxiv.org/abs/1803.08494 | +| InstanceNormalization | https://arxiv.org/abs/1607.08022 | +| LayerNormalization | https://arxiv.org/abs/1607.06450 | | Maxout | https://arxiv.org/abs/1302.4389 | | PoinareNormalize | https://arxiv.org/abs/1705.08039 | | WeightNormalization | https://arxiv.org/abs/1602.07868 | diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index 0e06709ac7..c5e0497726 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -19,6 +19,9 @@ from __future__ import print_function from tensorflow_addons.layers.python.maxout import Maxout +from tensorflow_addons.layers.python.normalizations import GroupNormalization +from tensorflow_addons.layers.python.normalizations import InstanceNormalization +from tensorflow_addons.layers.python.normalizations import LayerNormalization from tensorflow_addons.layers.python.poincare import PoincareNormalize from tensorflow_addons.layers.python.sparsemax import Sparsemax from tensorflow_addons.layers.python.wrappers import WeightNormalization diff --git a/tensorflow_addons/layers/python/normalizations.py b/tensorflow_addons/layers/python/normalizations.py index adc3929dcd..2a07a3d802 100644 --- a/tensorflow_addons/layers/python/normalizations.py +++ b/tensorflow_addons/layers/python/normalizations.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# ============================================================================= # Orginal implementation from keras_contrib/layer/normalization +# ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import logging import tensorflow as tf -from tensorflow.keras import backend as K -from tensorflow.keras import constraints -from tensorflow.keras import initializers -from tensorflow.keras import regularizers -from tensorflow.keras.layers import InputSpec -from tensorflow.keras.layers import Layer -from tensorflow.python.ops import nn +from tensorflow_addons.utils.python import keras_utils -class GroupNormalization(Layer): +@keras_utils.register_keras_custom_object +class GroupNormalization(tf.keras.layers.Layer): """Group normalization layer. Group Normalization divides the channels into groups and computes @@ -35,8 +34,8 @@ class GroupNormalization(Layer): with batch sizes. Relation to Layer Normalization: - If the number of groups is set to 1, then this operation becomes identical to - Layer Normalization. + If the number of groups is set to 1, then this operation becomes identical + to Layer Normalization. Relation to Instance Normalization: If the number of groups is set to the @@ -92,17 +91,17 @@ def __init__(self, self.epsilon = epsilon self.center = center self.scale = scale - self.beta_initializer = initializers.get(beta_initializer) - self.gamma_initializer = initializers.get(gamma_initializer) - self.beta_regularizer = regularizers.get(beta_regularizer) - self.gamma_regularizer = regularizers.get(gamma_regularizer) - self.beta_constraint = constraints.get(beta_constraint) - self.gamma_constraint = constraints.get(gamma_constraint) + self.beta_initializer = tf.keras.initializers.get(beta_initializer) + self.gamma_initializer = tf.keras.initializers.get(gamma_initializer) + self.beta_regularizer = tf.keras.regularizers.get(beta_regularizer) + self.gamma_regularizer = tf.keras.regularizers.get(gamma_regularizer) + self.beta_constraint = tf.keras.constraints.get(beta_constraint) + self.gamma_constraint = tf.keras.constraints.get(gamma_constraint) self._check_axis() def build(self, input_shape): - self._check_if_input_shape_is_None(input_shape) + self._check_if_input_shape_is_none(input_shape) self._set_number_of_groups_for_instance_norm(input_shape) self._check_size_of_dimensions(input_shape) self._create_input_spec(input_shape) @@ -114,32 +113,43 @@ def build(self, input_shape): def call(self, inputs): - input_shape = K.int_shape(inputs) - tensor_input_shape = K.shape(inputs) + input_shape = tf.keras.backend.int_shape(inputs) + tensor_input_shape = tf.shape(inputs) reshaped_inputs, group_shape = self._reshape_into_groups( inputs, input_shape, tensor_input_shape) - normalized_inputs = self._apply_normalization( - reshaped_inputs, input_shape) + normalized_inputs = self._apply_normalization(reshaped_inputs, + input_shape) - outputs = K.reshape(normalized_inputs, tensor_input_shape) + outputs = tf.reshape(normalized_inputs, tensor_input_shape) return outputs def get_config(self): config = { - 'groups': self.groups, - 'axis': self.axis, - 'epsilon': self.epsilon, - 'center': self.center, - 'scale': self.scale, - 'beta_initializer': initializers.serialize(self.beta_initializer), - 'gamma_initializer': initializers.serialize(self.gamma_initializer), - 'beta_regularizer': regularizers.serialize(self.beta_regularizer), - 'gamma_regularizer': 
regularizers.serialize(self.gamma_regularizer), - 'beta_constraint': constraints.serialize(self.beta_constraint), - 'gamma_constraint': constraints.serialize(self.gamma_constraint) + 'groups': + self.groups, + 'axis': + self.axis, + 'epsilon': + self.epsilon, + 'center': + self.center, + 'scale': + self.scale, + 'beta_initializer': + tf.keras.initializers.serialize(self.beta_initializer), + 'gamma_initializer': + tf.keras.initializers.serialize(self.gamma_initializer), + 'beta_regularizer': + tf.keras.regularizers.serialize(self.beta_regularizer), + 'gamma_regularizer': + tf.keras.regularizers.serialize(self.gamma_regularizer), + 'beta_constraint': + tf.keras.constraints.serialize(self.beta_constraint), + 'gamma_constraint': + tf.keras.constraints.serialize(self.gamma_constraint) } base_config = super(GroupNormalization, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -152,25 +162,27 @@ def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] group_shape[self.axis] = input_shape[self.axis] // self.groups group_shape.insert(1, self.groups) - group_shape = K.stack(group_shape) - reshaped_inputs = K.reshape(inputs, group_shape) + group_shape = tf.stack(group_shape) + reshaped_inputs = tf.reshape(inputs, group_shape) return reshaped_inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = K.int_shape(reshaped_inputs) + group_shape = tf.keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(len(group_shape))) - # Remember the ordering of the tensor is [batch, group , steps]. Jump the first 2 to calculate the variance and the mean - mean, variance = nn.moments(reshaped_inputs, group_reduction_axes[2:], - keep_dims=True) + # Remember the ordering of the tensor is [batch, group , steps]. 
Jump + # the first 2 to calculate the variance and the mean + mean, variance = tf.nn.moments( + reshaped_inputs, group_reduction_axes[2:], keepdims=True) gamma, beta = self._get_reshaped_weights(input_shape) - normalized_inputs = nn.batch_normalization(reshaped_inputs, - mean=mean, - variance=variance, - scale=gamma, - offset=beta, - variance_epsilon=self.epsilon) + normalized_inputs = tf.nn.batch_normalization( + reshaped_inputs, + mean=mean, + variance=variance, + scale=gamma, + offset=beta, + variance_epsilon=self.epsilon) return normalized_inputs def _get_reshaped_weights(self, input_shape): @@ -178,13 +190,13 @@ def _get_reshaped_weights(self, input_shape): gamma = None beta = None if self.scale: - gamma = K.reshape(self.gamma, broadcast_shape) + gamma = tf.reshape(self.gamma, broadcast_shape) if self.center: - beta = K.reshape(self.beta, broadcast_shape) + beta = tf.reshape(self.beta, broadcast_shape) return gamma, beta - def _check_if_input_shape_is_None(self, input_shape): + def _check_if_input_shape_is_none(self, input_shape): dim = input_shape[self.axis] if dim is None: raise ValueError('Axis ' + str(self.axis) + ' of ' @@ -202,26 +214,27 @@ def _check_size_of_dimensions(self, input_shape): dim = input_shape[self.axis] if dim < self.groups: - raise ValueError('Number of groups (' + str(self.groups) + ') cannot be ' - 'more than the number of channels (' + - str(dim) + ').') + raise ValueError( + 'Number of groups (' + str(self.groups) + ') cannot be ' + 'more than the number of channels (' + str(dim) + ').') if dim % self.groups != 0: - raise ValueError('Number of groups (' + str(self.groups) + ') must be a ' - 'multiple of the number of channels (' + - str(dim) + ').') + raise ValueError( + 'Number of groups (' + str(self.groups) + ') must be a ' + 'multiple of the number of channels (' + str(dim) + ').') def _check_axis(self): if self.axis == 0: raise ValueError( - "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead") + "You are trying to normalize your batch axis. Do you want to " + "use tf.layer.batch_normalization instead") def _create_input_spec(self, input_shape): dim = input_shape[self.axis] - self.input_spec = InputSpec(ndim=len(input_shape), - axes={self.axis: dim}) + self.input_spec = tf.keras.layers.InputSpec( + ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): @@ -229,11 +242,12 @@ def _add_gamma_weight(self, input_shape): shape = (dim,) if self.scale: - self.gamma = self.add_weight(shape=shape, - name='gamma', - initializer=self.gamma_initializer, - regularizer=self.gamma_regularizer, - constraint=self.gamma_constraint) + self.gamma = self.add_weight( + shape=shape, + name='gamma', + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + constraint=self.gamma_constraint) else: self.gamma = None @@ -243,11 +257,12 @@ def _add_beta_weight(self, input_shape): shape = (dim,) if self.center: - self.beta = self.add_weight(shape=shape, - name='beta', - initializer=self.beta_initializer, - regularizer=self.beta_regularizer, - constraint=self.beta_constraint) + self.beta = self.add_weight( + shape=shape, + name='beta', + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + constraint=self.beta_constraint) else: self.beta = None @@ -258,6 +273,7 @@ def _create_broadcast_shape(self, input_shape): return broadcast_shape +@keras_utils.register_keras_custom_object class LayerNormalization(GroupNormalization): """Layer normalization layer. 
@@ -295,19 +311,19 @@ class LayerNormalization(GroupNormalization): def __init__(self, **kwargs): if "groups" in kwargs: - tf.logging.warning( - "The given value for groups will be overwritten.") + logging.warning("The given value for groups will be overwritten.") kwargs["groups"] = 1 super(LayerNormalization, self).__init__(**kwargs) +@keras_utils.register_keras_custom_object class InstanceNormalization(GroupNormalization): """Instance normalization layer. - Instance Normalization is an specific case of ```GroupNormalization```since it - normalizes all features of one channel. The Groupsize is equal to the channel size. - Empirically, its accuracy is more stable than batch norm in a wide - range of small batch sizes, if learning rate is adjusted linearly + Instance Normalization is an specific case of ```GroupNormalization```since + it normalizes all features of one channel. The Groupsize is equal to the + channel size. Empirically, its accuracy is more stable than batch norm in a + wide range of small batch sizes, if learning rate is adjusted linearly with batch sizes. Arguments @@ -333,14 +349,13 @@ class InstanceNormalization(GroupNormalization): Same shape as input. References - - [Layer Normalization](https://arxiv.org/abs/1607.06450) + - [Instance Normalization: The Missing Ingredient for Fast Stylization] + (https://arxiv.org/abs/1607.08022) """ def __init__(self, **kwargs): - if "groups" in kwargs: - tf.logging.warning( - "The given value for groups will be overwritten.") + logging.warning("The given value for groups will be overwritten.") kwargs["groups"] = -1 super(InstanceNormalization, self).__init__(**kwargs) diff --git a/tensorflow_addons/layers/python/normalizations_test.py b/tensorflow_addons/layers/python/normalizations_test.py index 3b83d70fb6..f3bf95afae 100644 --- a/tensorflow_addons/layers/python/normalizations_test.py +++ b/tensorflow_addons/layers/python/normalizations_test.py @@ -12,32 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import numpy as np -import scipy as scipy import tensorflow as tf -from tensorflow import keras as keras + from tensorflow_addons.layers.python.normalizations import GroupNormalization from tensorflow_addons.layers.python.normalizations import InstanceNormalization from tensorflow_addons.layers.python.normalizations import LayerNormalization -from tensorflow.python.framework import test_util as tf_test_util -from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras import testing_utils -from tensorflow.python.keras.layers import normalization -from tensorflow.python.training.rmsprop import RMSPropOptimizer -from tensorflow.python.platform import test -from tensorflow.python.training import gradient_descent +from tensorflow_addons.utils.python import test_utils -class normalization_test(test.TestCase): +class NormalizationTest(tf.test.TestCase): - # ------------Tests to ensure proper inheritance. If these suceed you can test for Instance norm and Layernorm by setting Groupnorm groups = -1 or 1 + # ------------Tests to ensure proper inheritance. 
If these suceed you can + # test for Instance norm and Layernorm by setting Groupnorm groups = -1 or 1 def test_inheritance(self): self.assertTrue(issubclass(LayerNormalization, GroupNormalization)) self.assertTrue(issubclass(InstanceNormalization, GroupNormalization)) self.assertTrue(LayerNormalization.build == GroupNormalization.build) - self.assertTrue(InstanceNormalization.build == - GroupNormalization.build) + self.assertTrue( + InstanceNormalization.build == GroupNormalization.build) self.assertTrue(LayerNormalization.call == GroupNormalization.call) self.assertTrue(InstanceNormalization.call == GroupNormalization.call) @@ -46,11 +43,11 @@ def test_groups_after_init(self): self.assertTrue(layers.groups == -1) layers = LayerNormalization() self.assertTrue(layers.groups == 1) -# ----------------------------------------------------------------------------------------------------------------------------------------- + + # ------------------------------------------------------------------------------ def test_reshape(self): def run_reshape_test(axis, group, input_shape, expected_shape): - group_layer = GroupNormalization(groups=group, axis=axis) group_layer._set_number_of_groups_for_instance_norm(input_shape) @@ -93,7 +90,8 @@ def test_picture_input(self): self._test_random_shape_on_all_axis_except_batch( shape, groups, center, scale) - def _test_random_shape_on_all_axis_except_batch(self, shape, groups, center, scale): + def _test_random_shape_on_all_axis_except_batch(self, shape, groups, + center, scale): inputs = tf.random.normal((shape)) for axis in range(1, len(shape)): self._test_specific_layer(inputs, axis, groups, center, scale) @@ -105,7 +103,7 @@ def _test_specific_layer(self, inputs, axis, groups, center, scale): # Get Output from Keras model layer = GroupNormalization( axis=axis, groups=groups, center=center, scale=scale) - model = keras.models.Sequential() + model = tf.keras.models.Sequential() model.add(layer) outputs = model.predict(inputs) self.assertFalse(np.isnan(outputs).any()) @@ -120,10 +118,14 @@ def _test_specific_layer(self, inputs, axis, groups, center, scale): reshaped_inputs = np.reshape(np_inputs, tuple(reshaped_dims)) # Calculate mean and variance - mean = np.mean(reshaped_inputs, axis=tuple( - range(2, len(reshaped_dims))), keepdims=True) - variance = np.var(reshaped_inputs, axis=tuple( - range(2, len(reshaped_dims))), keepdims=True) + mean = np.mean( + reshaped_inputs, + axis=tuple(range(2, len(reshaped_dims))), + keepdims=True) + variance = np.var( + reshaped_inputs, + axis=tuple(range(2, len(reshaped_dims))), + keepdims=True) # Get gamma and beta initalized by layer gamma, beta = layer._get_reshaped_weights(input_shape) @@ -143,20 +145,21 @@ def _test_specific_layer(self, inputs, axis, groups, center, scale): def _create_and_fit_Sequential_model(self, layer, shape): # Helperfunction for quick evaluation - model = keras.models.Sequential() + model = tf.keras.models.Sequential() model.add(layer) - model.add(keras.layers.Dense(32)) - model.add(keras.layers.Dense(1)) + model.add(tf.keras.layers.Dense(32)) + model.add(tf.keras.layers.Dense(1)) - model.compile(optimizer=RMSPropOptimizer(0.01), - loss="categorical_crossentropy") + model.compile( + optimizer=tf.keras.optimizers.RMSprop(0.01), + loss="categorical_crossentropy") layer_shape = (10,) + shape input_batch = np.random.rand(*layer_shape) output_batch = np.random.rand(*(10, 1)) model.fit(x=input_batch, y=output_batch, epochs=1, batch_size=1) return model - @tf_test_util.run_in_graph_and_eager_modes + 
@test_utils.run_in_graph_and_eager_modes def test_weights(self): # Check if weights get initialized correctly layer = GroupNormalization(groups=1, scale=False, center=False) @@ -180,17 +183,19 @@ def test_apply_normalization(self): expected_shape = (1, 2, 2) reshaped_inputs = tf.constant([[[2.0, 2.0], [3.0, 3.0]]]) layer = GroupNormalization(groups=2, axis=1, scale=False, center=False) - normalized_input = layer._apply_normalization( - reshaped_inputs, input_shape) - self.assertTrue(tf.reduce_all( - tf.equal(normalized_input, tf.constant([[[0.0, 0.0], [0.0, 0.0]]])))) + normalized_input = layer._apply_normalization(reshaped_inputs, + input_shape) + self.assertTrue( + tf.reduce_all( + tf.equal(normalized_input, + tf.constant([[[0.0, 0.0], [0.0, 0.0]]])))) def test_axis_error(self): with self.assertRaises(ValueError): GroupNormalization(axis=0) - @tf_test_util.run_in_graph_and_eager_modes + @test_utils.run_in_graph_and_eager_modes def test_groupnorm_flat(self): # Check basic usage of groupnorm_flat # Testing for 1 == LayerNorm, 16 == GroupNorm, -1 == InstanceNorm @@ -203,33 +208,34 @@ def test_groupnorm_flat(self): self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_in_graph_and_eager_modes + @test_utils.run_in_graph_and_eager_modes def test_layernorm_flat(self): # Check basic usage of layernorm - model = self._create_and_fit_Sequential_model( - LayerNormalization(), (64,)) + model = self._create_and_fit_Sequential_model(LayerNormalization(), + (64,)) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_in_graph_and_eager_modes + @test_utils.run_in_graph_and_eager_modes def test_instancenorm_flat(self): # Check basic usage of instancenorm - model = self._create_and_fit_Sequential_model( - InstanceNormalization(), (64,)) + model = self._create_and_fit_Sequential_model(InstanceNormalization(), + (64,)) self.assertTrue(hasattr(model.layers[0], 'gamma')) self.assertTrue(hasattr(model.layers[0], 'beta')) - @tf_test_util.run_in_graph_and_eager_modes + @test_utils.run_in_graph_and_eager_modes def test_initializer(self): # Check if the initializer for gamma and beta is working correctly - layer = GroupNormalization(groups=32, - beta_initializer='random_normal', - beta_constraint='NonNeg', - gamma_initializer='random_normal', - gamma_constraint='NonNeg') + layer = GroupNormalization( + groups=32, + beta_initializer='random_normal', + beta_constraint='NonNeg', + gamma_initializer='random_normal', + gamma_constraint='NonNeg') model = self._create_and_fit_Sequential_model(layer, (64,)) @@ -237,38 +243,35 @@ def test_initializer(self): negativ = weights[weights < 0.0] self.assertTrue(len(negativ) == 0) - @tf_test_util.run_in_graph_and_eager_modes + @test_utils.run_in_graph_and_eager_modes def test_regularizations(self): layer = GroupNormalization( - gamma_regularizer='l1', - beta_regularizer='l1', - groups=4, - axis=2) + gamma_regularizer='l1', beta_regularizer='l1', groups=4, axis=2) layer.build((None, 4, 4)) self.assertEqual(len(layer.losses), 2) - max_norm = keras.constraints.max_norm + max_norm = tf.keras.constraints.max_norm layer = GroupNormalization( - gamma_constraint=max_norm, - beta_constraint=max_norm) + gamma_constraint=max_norm, beta_constraint=max_norm) layer.build((None, 3, 4)) self.assertEqual(layer.gamma.constraint, max_norm) self.assertEqual(layer.beta.constraint, max_norm) - @tf_test_util.run_in_graph_and_eager_modes + 
@test_utils.run_in_graph_and_eager_modes def test_groupnorm_conv(self): # Check if Axis is working for CONV nets # Testing for 1 == LayerNorm, 5 == GroupNorm, -1 == InstanceNorm groups = [-1, 5, 1] for i in groups: - model = keras.models.Sequential() - model.add(GroupNormalization( - axis=1, groups=i, input_shape=(20, 20, 3))) - model.add(keras.layers.Conv2D(5, (1, 1), padding='same')) - model.add(keras.layers.Flatten()) - model.add(keras.layers.Dense(1, activation='softmax')) - model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse') + model = tf.keras.models.Sequential() + model.add( + GroupNormalization(axis=1, groups=i, input_shape=(20, 20, 3))) + model.add(tf.keras.layers.Conv2D(5, (1, 1), padding='same')) + model.add(tf.keras.layers.Flatten()) + model.add(tf.keras.layers.Dense(1, activation='softmax')) + model.compile( + optimizer=tf.keras.optimizers.RMSprop(0.01), loss='mse') x = np.random.randint(1000, size=(10, 20, 20, 3)) y = np.random.randint(1000, size=(10, 1)) a = model.fit(x=x, y=y, epochs=1) @@ -276,4 +279,4 @@ def test_groupnorm_conv(self): if __name__ == "__main__": - test.main() + tf.test.main()
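The numpy reference that `_test_specific_layer` checks against reduces to the group-normalization formula: split the channel axis into groups (mirroring `_reshape_into_groups`), take mean and variance over every axis except batch and group (mirroring the moments call in `_apply_normalization`), and normalize with the layer's default epsilon of 1e-5. A standalone sketch of that computation, with gamma and beta left at their default values of ones and zeros, could look as follows; the helper name `group_norm_reference` and the shapes are illustrative and are not part of the patch.

import numpy as np

def group_norm_reference(x, groups, axis=-1, epsilon=1e-5):
    # Split the channel axis into (groups, channels // groups), with the
    # group axis inserted at position 1, as the layer's reshape does.
    dims = list(x.shape)
    dims[axis] = dims[axis] // groups
    dims.insert(1, groups)
    grouped = np.reshape(x, dims)
    # Statistics over every axis except batch (0) and group (1).
    reduction_axes = tuple(range(2, len(dims)))
    mean = grouped.mean(axis=reduction_axes, keepdims=True)
    variance = grouped.var(axis=reduction_axes, keepdims=True)
    normalized = (grouped - mean) / np.sqrt(variance + epsilon)
    return normalized.reshape(x.shape)

x = np.random.randn(10, 30, 30, 4).astype(np.float32)
out = group_norm_reference(x, groups=2)
print(out.shape)  # (10, 30, 30, 4)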