
Commit 2fc99b9

gdevos010 (Greg DeVos) and hrzn authored
New alternatives to layer norm (#1114)
* layer norm variants
* fixed default
* license, changelog and test
* powerNorm
* arxiv link
* typos
* typo
* adding PowerNorm to docs
* PR comments
* PR comments
* PR comments
* PR comments
* removed ScaleNorm. merged files
* added LayerNormNoBias and removed powernorm
* custom norm_type
* new norm layers for transformer model
* fix test
* Update darts/models/forecasting/transformer_model.py
  Co-authored-by: Julien Herzen <[email protected]>
* Update darts/models/forecasting/tft_model.py
  Co-authored-by: Julien Herzen <[email protected]>
* layer norm test

Co-authored-by: Greg DeVos <[email protected]>
Co-authored-by: Julien Herzen <[email protected]>
Co-authored-by: Julien Herzen <[email protected]>
1 parent 4a522a0 commit 2fc99b9

File tree: 9 files changed, +300 -35 lines changed


CHANGELOG.md  +3 -3
@@ -1,11 +1,11 @@
 
 # Changelog
 
-Darts is still in an early development phase and we cannot always guarantee backwards compatibility. Changes that may **break code which uses a previous release of Darts** are marked with a "&#x1F534;".
+Darts is still in an early development phase, and we cannot always guarantee backwards compatibility. Changes that may **break code which uses a previous release of Darts** are marked with a "&#x1F534;".
 
 ## [Unreleased](https://github.com/unit8co/darts/tree/master)
-
 - Added support for retraining model(s) every `n` iteration and on custom condition in `historical_forecasts` method of `ForecastingModel` abstract class. Addressed issues [#135](https://github.com/unit8co/darts/issues/135) and [#623](https://github.com/unit8co/darts/issues/623) by [Francesco Bruzzesi](https://github.com/fbruzzesi).
+- New LayerNorm alternatives, RMSNorm and LayerNormNoBias [#1113](https://github.com/unit8co/darts/issues/1113) by [Greg DeVos](https://github.com/gdevos010).
 
 [Full Changelog](https://github.com/unit8co/darts/compare/0.21.0...master)
 
@@ -49,7 +49,7 @@ Darts is still in an early development phase and we cannot always guarantee back
 - Added support for static covariates in `TimeSeries` class. [#966](https://github.com/unit8co/darts/pull/966) by [Dennis Bader](https://github.com/dennisbader).
 - Added support for static covariates in TFT model. [#966](https://github.com/unit8co/darts/pull/966) by [Dennis Bader](https://github.com/dennisbader).
 - Support for storing hierarchy of components in `TimeSeries` (in view of hierarchical reconciliation) [#1012](https://github.com/unit8co/darts/pull/1012) by [Julien Herzen](https://github.com/hrzn).
-- New Reconciliation transformers for forececast reconciliation: bottom up, top down and MinT. [#1012](https://github.com/unit8co/darts/pull/1012) by [Julien Herzen](https://github.com/hrzn).
+- New Reconciliation transformers for forecast reconciliation: bottom up, top down and MinT. [#1012](https://github.com/unit8co/darts/pull/1012) by [Julien Herzen](https://github.com/hrzn).
 - Added support for Monte Carlo Dropout, as a way to capture model uncertainty with torch models at inference time. [#1013](https://github.com/unit8co/darts/pull/1013) by [Julien Herzen](https://github.com/hrzn).
 - New datasets: ETT and Electricity. [#617](https://github.com/unit8co/darts/pull/617)
 by [Greg DeVos](https://github.com/gdevos010)
darts/models/components/layer_norm_variants.py  +56 -0 (new file)

@@ -0,0 +1,56 @@
"""
MIT License

Copyright (c) 2020 Phil Wang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    """An alternative to layer normalization, without mean centering and the learned bias [1]

    References
    ----------
    .. [1] Zhang, Biao, and Rico Sennrich. "Root mean square layer normalization." Advances in Neural Information
           Processing Systems 32 (2019).
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.scale = dim**-0.5
        self.eps = eps
        self.g = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # root-mean-square of x over the last dimension: ||x||_2 * dim**-0.5
        norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
        return x / norm.clamp(min=self.eps) * self.g


class LayerNormNoBias(nn.LayerNorm):
    # standard LayerNorm without the learned elementwise affine parameters (no weight, no bias)
    def __init__(self, input_size, **kwargs):
        super().__init__(input_size, elementwise_affine=False, **kwargs)


class LayerNorm(nn.LayerNorm):
    # plain nn.LayerNorm, exposed here so it can be selected by name like the other variants
    def __init__(self, input_size, **kwargs) -> None:
        super().__init__(input_size, **kwargs)
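
As a quick illustration (an editorial sketch, not part of the commit), the three variants can be exercised directly on a dummy activation tensor; the import path darts.models.components.layer_norm_variants follows from the import added to tft_model.py further below:

import torch

from darts.models.components.layer_norm_variants import (
    LayerNorm,
    LayerNormNoBias,
    RMSNorm,
)

x = torch.randn(32, 24, 16)  # (batch, time steps, features)

for norm_cls in (LayerNorm, LayerNormNoBias, RMSNorm):
    norm = norm_cls(16)  # normalizes over the last (feature) dimension
    y = norm(x)          # output shape is unchanged: (32, 24, 16)
    n_params = sum(p.numel() for p in norm.parameters())
    print(norm_cls.__name__, tuple(y.shape), n_params)

# LayerNorm learns a weight and a bias (32 parameters for dim=16),
# LayerNormNoBias learns nothing, and RMSNorm learns only the gain vector g (16 parameters).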

darts/models/forecasting/nhits.py  +5 -5
@@ -158,7 +158,7 @@ def __init__(
 
         self.layers = nn.Sequential(*layers)
 
-        # Fully connected layer producing forecast/backcast expansion coeffcients (waveform generator parameters).
+        # Fully connected layer producing forecast/backcast expansion coefficients (waveform generator parameters).
         # The coefficients are emitted for each parameter of the likelihood for the forecast.
         self.backcast_linear_layer = nn.Linear(
             in_features=layer_width, out_features=n_theta_backcast
@@ -413,7 +413,7 @@ def __init__(
         self.stacks = nn.ModuleList(self.stacks_list)
 
         # setting the last backcast "branch" to be not trainable (without next block/stack, it doesn't need to be
-        # backpropagated). Removing this lines would cause logtensorboard to crash, since no gradient is stored
+        # backpropagated). Removing this line would cause logtensorboard to crash, since no gradient is stored
         # on this params (the last block backcast is not part of the final output of the net).
         self.stacks_list[-1].blocks[-1].backcast_linear_layer.requires_grad_(False)
 
@@ -476,7 +476,7 @@ def __init__(
 
        N-HiTS is similar to N-BEATS (implemented in :class:`NBEATSModel`),
        but attempts to provide better performance at lower computational cost by introducing
-       multi-rate sampling of the inputs and mulit-scale interpolation of the outputs.
+       multi-rate sampling of the inputs and multi-scale interpolation of the outputs.
 
        Similar to :class:`NBEATSModel`, in addition to the univariate version presented in the paper,
        this implementation also supports multivariate series (and covariates) by flattening the model inputs
@@ -489,7 +489,7 @@ def __init__(
            This parameter can be a tuple of tuples, of size (num_stacks x num_blocks), specifying the kernel
            size for each block in each stack. If left to ``None``, some default values will be used based on
            ``input_chunk_length``.
-           Similarly, the multi-scale interpolation is controled by ``n_freq_downsample``, which gives the
+           Similarly, the multi-scale interpolation is controlled by ``n_freq_downsample``, which gives the
            downsampling factors to be used in each block of each stack. If left to ``None``, some default
            values will be used based on the ``output_chunk_length``.
 
@@ -545,7 +545,7 @@ def __init__(
            The PyTorch optimizer class to be used. Default: ``torch.optim.Adam``.
        optimizer_kwargs
            Optionally, some keyword arguments for the PyTorch optimizer (e.g., ``{'lr': 1e-3}``
-           for specifying a learning rate). Otherwise the default values of the selected ``optimizer_cls``
+           for specifying a learning rate). Otherwise, the default values of the selected ``optimizer_cls``
            will be used. Default: ``None``.
        lr_scheduler_cls
            Optionally, the PyTorch learning rate scheduler class to be used. Specifying ``None`` corresponds

darts/models/forecasting/tft_model.py  +40 -6
@@ -12,8 +12,8 @@
 from torch.nn import LSTM as _LSTM
 
 from darts import TimeSeries
-from darts.logging import get_logger, raise_if, raise_if_not
-from darts.models.components import glu_variants
+from darts.logging import get_logger, raise_if, raise_if_not, raise_log
+from darts.models.components import glu_variants, layer_norm_variants
 from darts.models.components.glu_variants import GLU_FFN
 from darts.models.forecasting.pl_forecasting_module import PLMixedCovariatesModule
 from darts.models.forecasting.tft_submodels import (
@@ -55,6 +55,7 @@ def __init__(
         categorical_embedding_sizes: Dict[str, Tuple[int, int]],
         dropout: float,
         add_relative_index: bool,
+        norm_type: Union[str, nn.Module],
         **kwargs,
     ):
 
@@ -102,6 +103,8 @@
        likelihood
            The likelihood model to be used for probabilistic forecasts. By default, the TFT uses
            a ``QuantileRegression`` likelihood.
+       norm_type: str | nn.Module
+           The type of LayerNorm variant to use.
        **kwargs
            all parameters required for :class:`darts.model.forecasting_models.PLForecastingModule` base class.
        """
@@ -121,6 +124,16 @@
         self.dropout = dropout
         self.add_relative_index = add_relative_index
 
+        if isinstance(norm_type, str):
+            try:
+                self.layer_norm = getattr(layer_norm_variants, norm_type)
+            except AttributeError:
+                raise_log(
+                    AttributeError("please provide a valid layer norm type"),
+                )
+        else:
+            self.layer_norm = norm_type
+
         # initialize last batch size to check if new mask needs to be generated
         self.batch_size_last = -1
         self.attention_mask = None
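
For readers of the diff, the string lookup above amounts to fetching a class from the layer_norm_variants module, while a custom class is passed through unchanged; a minimal sketch (not code from the commit) of what it resolves to:

from torch import nn

from darts.models.components import layer_norm_variants

# "RMSNorm" -> the RMSNorm class; an unknown name raises AttributeError
norm_cls = getattr(layer_norm_variants, "RMSNorm")
assert issubclass(norm_cls, nn.Module)

# a custom nn.Module class is kept as-is and instantiated later
# by the TFT sub-modules with their own layer sizes
custom_cls = nn.LayerNorm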
@@ -173,6 +186,7 @@
             prescalers=self.prescalers_linear,
             single_variable_grns={},
             context_size=None,  # no context for static variables
+            layer_norm=self.layer_norm,
         )
 
         # variable selection for encoder and decoder
@@ -192,6 +206,7 @@
             context_size=self.hidden_size,
             prescalers=self.prescalers_linear,
             single_variable_grns={},
+            layer_norm=self.layer_norm,
         )
 
         self.decoder_vsn = _VariableSelectionNetwork(
@@ -202,6 +217,7 @@
             context_size=self.hidden_size,
             prescalers=self.prescalers_linear,
             single_variable_grns={},
+            layer_norm=self.layer_norm,
         )
 
         # static encoders
@@ -211,6 +227,7 @@
             hidden_size=self.hidden_size,
             output_size=self.hidden_size,
             dropout=self.dropout,
+            layer_norm=self.layer_norm,
         )
 
         # for hidden state of the lstm
@@ -219,6 +236,7 @@
             hidden_size=self.hidden_size,
             output_size=self.hidden_size,
             dropout=self.dropout,
+            layer_norm=self.layer_norm,
         )
 
         # for cell state of the lstm
@@ -227,6 +245,7 @@
             hidden_size=self.hidden_size,
             output_size=self.hidden_size,
             dropout=self.dropout,
+            layer_norm=self.layer_norm,
         )
 
         # for post lstm static enrichment
@@ -235,6 +254,7 @@
             hidden_size=self.hidden_size,
             output_size=self.hidden_size,
             dropout=self.dropout,
+            layer_norm=self.layer_norm,
         )
 
         # lstm encoder (history) and decoder (future) for local processing
@@ -255,7 +275,9 @@
         )
 
         # post lstm GateAddNorm
-        self.post_lstm_gan = _GateAddNorm(input_size=self.hidden_size, dropout=dropout)
+        self.post_lstm_gan = _GateAddNorm(
+            input_size=self.hidden_size, dropout=dropout, layer_norm=self.layer_norm
+        )
 
         # static enrichment and processing past LSTM
         self.static_enrichment_grn = _GatedResidualNetwork(
@@ -264,6 +286,7 @@
             output_size=self.hidden_size,
             dropout=self.dropout,
             context_size=self.hidden_size,
+            layer_norm=self.layer_norm,
         )
 
         # attention for long-range processing
@@ -272,14 +295,17 @@
             n_head=self.num_attention_heads,
             dropout=self.dropout,
         )
-        self.post_attn_gan = _GateAddNorm(self.hidden_size, dropout=self.dropout)
+        self.post_attn_gan = _GateAddNorm(
+            self.hidden_size, dropout=self.dropout, layer_norm=self.layer_norm
+        )
 
         if self.feed_forward == "GatedResidualNetwork":
             self.feed_forward_block = _GatedResidualNetwork(
                 self.hidden_size,
                 self.hidden_size,
                 self.hidden_size,
                 dropout=self.dropout,
+                layer_norm=self.layer_norm,
             )
         else:
             raise_if_not(
@@ -293,7 +319,9 @@
             )
 
         # output processing -> no dropout at this late stage
-        self.pre_output_gan = _GateAddNorm(self.hidden_size, dropout=None)
+        self.pre_output_gan = _GateAddNorm(
+            self.hidden_size, dropout=None, layer_norm=self.layer_norm
+        )
 
         self.output_layer = nn.Linear(self.hidden_size, self.n_targets * self.loss_size)
 
@@ -637,6 +665,7 @@
         add_relative_index: bool = False,
         loss_fn: Optional[nn.Module] = None,
         likelihood: Optional[Likelihood] = None,
+        norm_type: Union[str, nn.Module] = "LayerNorm",
         **kwargs,
     ):
         """Temporal Fusion Transformers (TFT) for Interpretable Time Series Forecasting.
@@ -699,13 +728,16 @@
            This allows to use the TFTModel without having to pass future_covariates to :func:`fit()` and
            :func:`train()`. It gives a value to the position of each step from input and output chunk relative
            to the prediction point. The values are normalized with ``input_chunk_length``.
-       loss_fn
+       loss_fn: nn.Module
            PyTorch loss function used for training. By default, the TFT model is probabilistic and uses a
            ``likelihood`` instead (``QuantileRegression``). To make the model deterministic, you can set the `
            `likelihood`` to None and give a ``loss_fn`` argument.
        likelihood
            The likelihood model to be used for probabilistic forecasts. By default, the TFT uses
            a ``QuantileRegression`` likelihood.
+       norm_type: str | nn.Module
+           The type of LayerNorm variant to use. Default: ``LayerNorm``. Available options are
+           ["LayerNorm", "RMSNorm", "LayerNormNoBias"], or provide a custom nn.Module.
        **kwargs
            Optional arguments to initialize the pytorch_lightning.Module, pytorch_lightning.Trainer, and
            Darts' :class:`TorchForecastingModel`.
@@ -862,6 +894,7 @@
         )
         self.add_relative_index = add_relative_index
         self.output_dim: Optional[Tuple[int, int]] = None
+        self.norm_type = norm_type
 
     def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Module:
         """
@@ -1049,6 +1082,7 @@ def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Module:
             hidden_continuous_size=self.hidden_continuous_size,
             categorical_embedding_sizes=self.categorical_embedding_sizes,
             add_relative_index=self.add_relative_index,
+            norm_type=self.norm_type,
             **self.pl_module_params,
         )
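
Putting the new user-facing option together, a hedged usage sketch (not part of the commit; the dataset and the other TFTModel arguments are only illustrative):

from torch import nn

from darts.datasets import AirPassengersDataset
from darts.models import TFTModel

series = AirPassengersDataset().load()

# select a variant by name: "LayerNorm" (default), "RMSNorm" or "LayerNormNoBias"
model = TFTModel(
    input_chunk_length=24,
    output_chunk_length=12,
    add_relative_index=True,  # so no future covariates are needed for this toy example
    norm_type="RMSNorm",
    n_epochs=1,
)
model.fit(series)

# or pass a custom normalization class directly
model = TFTModel(
    input_chunk_length=24,
    output_chunk_length=12,
    add_relative_index=True,
    norm_type=nn.LayerNorm,
)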

0 commit comments
