diff --git a/pypots/base.py b/pypots/base.py index 699fc098..71647319 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -285,7 +285,10 @@ def save( f"‼️ File {saving_path} exists. Argument `overwrite` is True. Overwriting now..." ) else: - logger.error(f"❌ File {saving_path} exists. Saving operation aborted.") + logger.error( + f"❌ File {saving_path} exists. Saving operation aborted. " + f"Use the arg `overwrite=True` to force overwrite." + ) try: create_dir_if_not_exist(saving_dir) diff --git a/pypots/classification/raindrop/core.py b/pypots/classification/raindrop/core.py index dbe2b653..5e6deb99 100644 --- a/pypots/classification/raindrop/core.py +++ b/pypots/classification/raindrop/core.py @@ -21,8 +21,8 @@ def __init__( n_features, n_layers, d_model, - d_ffn, n_heads, + d_ffn, n_classes, dropout=0.3, max_len=215, @@ -41,8 +41,8 @@ def __init__( n_features, n_layers, d_model, - d_ffn, n_heads, + d_ffn, n_classes, dropout, max_len, diff --git a/pypots/classification/raindrop/model.py b/pypots/classification/raindrop/model.py index 9adaeaa0..58f90930 100644 --- a/pypots/classification/raindrop/model.py +++ b/pypots/classification/raindrop/model.py @@ -43,12 +43,12 @@ class Raindrop(BaseNNClassifier): The dimension of the Transformer encoder backbone. It is the input dimension of the multi-head self-attention layers. - d_ffn : - The dimension of the layer in the Feed-Forward Networks (FFN). - n_heads : The number of heads in the multi-head self-attention mechanism. + d_ffn : + The dimension of the layer in the Feed-Forward Networks (FFN). + dropout : The dropout rate for all fully-connected layers in the model. @@ -112,8 +112,8 @@ def __init__( n_classes, n_layers, d_model, - d_ffn, n_heads, + d_ffn, dropout, d_static=0, aggregation="mean", @@ -147,8 +147,8 @@ def __init__( n_features, n_layers, d_model, - d_ffn, n_heads, + d_ffn, n_classes, dropout, n_steps, diff --git a/pypots/imputation/autoformer/core.py b/pypots/imputation/autoformer/core.py index 9880c52c..fb883c4e 100644 --- a/pypots/imputation/autoformer/core.py +++ b/pypots/imputation/autoformer/core.py @@ -8,13 +8,7 @@ import torch.nn as nn -from ...nn.modules.autoformer import ( - SeasonalLayerNorm, - AutoformerEncoderLayer, - AutoCorrelation, - AutoCorrelationLayer, -) -from ...nn.modules.informer import InformerEncoder +from ...nn.modules.autoformer import AutoformerEncoder from ...nn.modules.saits import SaitsLoss, SaitsEmbedding @@ -24,8 +18,8 @@ def __init__( n_steps, n_features, n_layers, - n_heads, d_model, + n_heads, d_ffn, factor, moving_avg_window_size, @@ -44,23 +38,15 @@ def __init__( with_pos=False, dropout=dropout, ) - self.encoder = InformerEncoder( - [ - AutoformerEncoderLayer( - AutoCorrelationLayer( - AutoCorrelation(factor, dropout), - d_model, - n_heads, - ), - d_model, - d_ffn, - moving_avg_window_size, - dropout, - activation, - ) - for _ in range(n_layers) - ], - norm_layer=SeasonalLayerNorm(d_model), + self.encoder = AutoformerEncoder( + n_layers, + d_model, + n_heads, + d_ffn, + factor, + moving_avg_window_size, + dropout, + activation, ) # for the imputation task, the output dim is the same as input dim diff --git a/pypots/imputation/autoformer/model.py b/pypots/imputation/autoformer/model.py index 323d6b01..fb695885 100644 --- a/pypots/imputation/autoformer/model.py +++ b/pypots/imputation/autoformer/model.py @@ -37,12 +37,12 @@ class Autoformer(BaseNNImputer): n_layers : The number of layers in the Autoformer model. - n_heads : - The number of heads in each layer of Autoformer. 
- d_model : The dimension of the model. + n_heads : + The number of heads in each layer of Autoformer. + d_ffn : The dimension of the feed-forward network. @@ -107,8 +107,8 @@ def __init__( n_steps: int, n_features: int, n_layers: int, - n_heads: int, d_model: int, + n_heads: int, d_ffn: int, factor: int, moving_avg_window_size: int, @@ -152,8 +152,8 @@ def __init__( self.n_steps, self.n_features, self.n_layers, - self.n_heads, self.d_model, + self.n_heads, self.d_ffn, self.factor, self.moving_avg_window_size, diff --git a/pypots/imputation/crossformer/core.py b/pypots/imputation/crossformer/core.py index a410b287..e26f27ca 100644 --- a/pypots/imputation/crossformer/core.py +++ b/pypots/imputation/crossformer/core.py @@ -23,8 +23,8 @@ def __init__( n_steps, n_features, n_layers, - n_heads, d_model, + n_heads, d_ffn, factor, seg_len, diff --git a/pypots/imputation/crossformer/model.py b/pypots/imputation/crossformer/model.py index 7a158bee..cd248096 100644 --- a/pypots/imputation/crossformer/model.py +++ b/pypots/imputation/crossformer/model.py @@ -37,12 +37,12 @@ class Crossformer(BaseNNImputer): n_layers : The number of layers in the 1st and 2nd DMSA blocks in the SAITS model. - n_heads: - The number of heads in the multi-head attention mechanism. - d_model : The dimension of the model. + n_heads: + The number of heads in the multi-head attention mechanism. + d_ffn : The dimension of the feed-forward network. @@ -110,8 +110,8 @@ def __init__( n_steps: int, n_features: int, n_layers: int, - n_heads: int, d_model: int, + n_heads: int, d_ffn: int, factor: int, seg_len: int, @@ -157,8 +157,8 @@ def __init__( self.n_steps, self.n_features, self.n_layers, - self.n_heads, self.d_model, + self.n_heads, self.d_ffn, self.factor, self.seg_len, diff --git a/pypots/imputation/etsformer/core.py b/pypots/imputation/etsformer/core.py index ce1e8cde..92c61f5d 100644 --- a/pypots/imputation/etsformer/core.py +++ b/pypots/imputation/etsformer/core.py @@ -24,8 +24,8 @@ def __init__( n_features, n_e_layers, n_d_layers, - n_heads, d_model, + n_heads, d_ffn, dropout, top_k, @@ -53,7 +53,7 @@ def __init__( n_steps, n_steps, top_k, - dim_feedforward=d_ffn, + d_ffn=d_ffn, dropout=dropout, activation=activation, ) diff --git a/pypots/imputation/etsformer/model.py b/pypots/imputation/etsformer/model.py index cb0eb5f5..6a6fae30 100644 --- a/pypots/imputation/etsformer/model.py +++ b/pypots/imputation/etsformer/model.py @@ -40,12 +40,12 @@ class ETSformer(BaseNNImputer): n_d_layers : The number of layers in the ETSformer decoder. - n_heads : - The number of heads in each layer of ETSformer. - d_model : The dimension of the model. + n_heads : + The number of heads in each layer of ETSformer. + d_ffn : The dimension of the feed-forward network. 
@@ -108,8 +108,8 @@ def __init__( n_features, n_e_layers, n_d_layers, - n_heads, d_model, + n_heads, d_ffn, top_k, dropout: float = 0, @@ -153,8 +153,8 @@ def __init__( self.n_features, self.n_e_layers, self.n_d_layers, - self.n_heads, self.d_model, + self.n_heads, self.d_ffn, self.dropout, self.top_k, diff --git a/pypots/imputation/fedformer/core.py b/pypots/imputation/fedformer/core.py index 3cb9abf5..617a1462 100644 --- a/pypots/imputation/fedformer/core.py +++ b/pypots/imputation/fedformer/core.py @@ -18,8 +18,8 @@ def __init__( n_steps, n_features, n_layers, - n_heads, d_model, + n_heads, d_ffn, moving_avg_window_size, dropout, @@ -43,8 +43,8 @@ def __init__( self.encoder = FEDformerEncoder( n_steps, n_layers, - n_heads, d_model, + n_heads, d_ffn, moving_avg_window_size, dropout, diff --git a/pypots/imputation/fedformer/model.py b/pypots/imputation/fedformer/model.py index 11c8a325..ea750e9b 100644 --- a/pypots/imputation/fedformer/model.py +++ b/pypots/imputation/fedformer/model.py @@ -115,8 +115,8 @@ def __init__( n_steps, n_features, n_layers, - n_heads, d_model, + n_heads, d_ffn, moving_avg_window_size, dropout: float = 0, @@ -164,8 +164,8 @@ def __init__( self.n_steps, self.n_features, self.n_layers, - self.n_heads, self.d_model, + self.n_heads, self.d_ffn, self.moving_avg_window_size, self.dropout, diff --git a/pypots/imputation/informer/core.py b/pypots/imputation/informer/core.py index 0084d2d6..e9199b02 100644 --- a/pypots/imputation/informer/core.py +++ b/pypots/imputation/informer/core.py @@ -24,8 +24,8 @@ def __init__( n_steps, n_features, n_layers, - n_heads, d_model, + n_heads, d_ffn, factor, dropout, @@ -47,11 +47,11 @@ def __init__( [ InformerEncoderLayer( MultiHeadAttention( - n_heads, + ProbAttention(False, factor, dropout), d_model, + n_heads, d_model // n_heads, d_model // n_heads, - ProbAttention(False, factor, dropout), ), d_model, d_ffn, diff --git a/pypots/imputation/informer/model.py b/pypots/imputation/informer/model.py index d60770c7..457a383e 100644 --- a/pypots/imputation/informer/model.py +++ b/pypots/imputation/informer/model.py @@ -37,12 +37,12 @@ class Informer(BaseNNImputer): n_layers : The number of layers in the Informer model. - n_heads : - The number of heads in each layer of Informer. - d_model : The dimension of the model. + n_heads : + The number of heads in each layer of Informer. + d_ffn : The dimension of the feed-forward network. 
@@ -104,8 +104,8 @@ def __init__( n_steps: int, n_features: int, n_layers: int, - n_heads: int, d_model: int, + n_heads: int, d_ffn: int, factor: int, dropout: float = 0, @@ -147,8 +147,8 @@ def __init__( self.n_steps, self.n_features, self.n_layers, - self.n_heads, self.d_model, + self.n_heads, self.d_ffn, self.factor, self.dropout, diff --git a/pypots/imputation/itransformer/core.py b/pypots/imputation/itransformer/core.py index bd620fdf..5747f12e 100644 --- a/pypots/imputation/itransformer/core.py +++ b/pypots/imputation/itransformer/core.py @@ -20,10 +20,10 @@ def __init__( n_features: int, n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float, attn_dropout: float, ORT_weight: float = 1, @@ -41,10 +41,10 @@ def __init__( self.encoder = TransformerEncoder( n_layers, d_model, - d_ffn, n_heads, d_k, d_v, + d_ffn, dropout, attn_dropout, ) diff --git a/pypots/imputation/itransformer/model.py b/pypots/imputation/itransformer/model.py index 757d0e27..045bd2dc 100644 --- a/pypots/imputation/itransformer/model.py +++ b/pypots/imputation/itransformer/model.py @@ -42,9 +42,6 @@ class iTransformer(BaseNNImputer): The dimension of the model's backbone. It is the input dimension of the multi-head self-attention layers. - d_ffn : - The dimension of the layer in the Feed-Forward Networks (FFN). - n_heads : The number of heads in the multi-head self-attention mechanism. ``d_model`` must be divisible by ``n_heads``, and the result should be equal to ``d_k``. @@ -58,6 +55,9 @@ class iTransformer(BaseNNImputer): d_v : The dimension of the `values` (V) in the DMSA mechanism. + d_ffn : + The dimension of the layer in the Feed-Forward Networks (FFN). + dropout : The dropout rate for all fully-connected layers in the model. @@ -117,10 +117,10 @@ def __init__( n_features: int, n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float = 0, attn_dropout: float = 0, ORT_weight: int = 1, @@ -174,10 +174,10 @@ def __init__( self.n_features, self.n_layers, self.d_model, - self.d_ffn, self.n_heads, self.d_k, self.d_v, + self.d_ffn, self.dropout, self.attn_dropout, self.ORT_weight, diff --git a/pypots/imputation/patchtst/core.py b/pypots/imputation/patchtst/core.py index 8d394384..9a356173 100644 --- a/pypots/imputation/patchtst/core.py +++ b/pypots/imputation/patchtst/core.py @@ -18,11 +18,11 @@ def __init__( n_steps: int, n_features: int, n_layers: int, - n_heads: int, d_model: int, - d_ffn: int, + n_heads: int, d_k: int, d_v: int, + d_ffn: int, patch_len: int, stride: int, dropout: float, @@ -40,7 +40,14 @@ def __init__( d_model, patch_len, stride, padding, dropout ) self.encoder = PatchtstEncoder( - n_layers, n_heads, d_model, d_ffn, d_k, d_v, dropout, attn_dropout + n_layers, + d_model, + n_heads, + d_k, + d_v, + d_ffn, + dropout, + attn_dropout, ) self.head = PredictionHead(d_model, n_patches, n_steps, dropout) self.output_projection = nn.Linear(d_model, n_features) diff --git a/pypots/imputation/patchtst/model.py b/pypots/imputation/patchtst/model.py index d1b5274e..5d3f4bf4 100644 --- a/pypots/imputation/patchtst/model.py +++ b/pypots/imputation/patchtst/model.py @@ -47,6 +47,9 @@ class PatchTST(BaseNNImputer): n_layers : The number of layers in the PatchTST model. + d_model : + The dimension of the model. + n_heads : The number of heads in each layer of PatchTST. @@ -59,9 +62,6 @@ class PatchTST(BaseNNImputer): d_v : The dimension of the `values` (V) in the DMSA mechanism. 
- d_model : - The dimension of the model. - d_ffn : The dimension of the feed-forward network. @@ -122,10 +122,10 @@ def __init__( patch_len: int, stride: int, n_layers: int, + d_model: int, n_heads: int, d_k: int, d_v: int, - d_model: int, d_ffn: int, dropout: float, attn_dropout: float, @@ -180,11 +180,11 @@ def __init__( self.n_steps, self.n_features, self.n_layers, - self.n_heads, self.d_model, - self.d_ffn, + self.n_heads, self.d_k, self.d_v, + self.d_ffn, self.patch_len, self.stride, self.dropout, diff --git a/pypots/imputation/saits/core.py b/pypots/imputation/saits/core.py index 7e793583..f5189ab3 100644 --- a/pypots/imputation/saits/core.py +++ b/pypots/imputation/saits/core.py @@ -23,10 +23,10 @@ def __init__( n_steps: int, n_features: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float, attn_dropout: float, diagonal_attention_mask: bool = True, @@ -43,14 +43,14 @@ def __init__( self.customized_loss_func = customized_loss_func self.encoder = BackboneSAITS( - n_layers, n_steps, n_features, + n_layers, d_model, - d_ffn, n_heads, d_k, d_v, + d_ffn, dropout, attn_dropout, ) diff --git a/pypots/imputation/saits/model.py b/pypots/imputation/saits/model.py index eb8c22f2..a1ce27cb 100644 --- a/pypots/imputation/saits/model.py +++ b/pypots/imputation/saits/model.py @@ -41,9 +41,6 @@ class SAITS(BaseNNImputer): The dimension of the model's backbone. It is the input dimension of the multi-head DMSA layers. - d_ffn : - The dimension of the layer in the Feed-Forward Networks (FFN). - n_heads : The number of heads in the multi-head DMSA mechanism. ``d_model`` must be divisible by ``n_heads``, and the result should be equal to ``d_k``. @@ -57,6 +54,9 @@ class SAITS(BaseNNImputer): d_v : The dimension of the `values` (V) in the DMSA mechanism. + d_ffn : + The dimension of the layer in the Feed-Forward Networks (FFN). + dropout : The dropout rate for all fully-connected layers in the model. @@ -124,10 +124,10 @@ def __init__( n_features: int, n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float = 0, attn_dropout: float = 0, diagonal_attention_mask: bool = True, @@ -184,10 +184,10 @@ def __init__( self.n_steps, self.n_features, self.d_model, - self.d_ffn, self.n_heads, self.d_k, self.d_v, + self.d_ffn, self.dropout, self.attn_dropout, self.diagonal_attention_mask, diff --git a/pypots/imputation/transformer/core.py b/pypots/imputation/transformer/core.py index dc0d7abe..e769a3aa 100644 --- a/pypots/imputation/transformer/core.py +++ b/pypots/imputation/transformer/core.py @@ -20,10 +20,10 @@ def __init__( n_features: int, n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float, attn_dropout: float, ORT_weight: float = 1, @@ -44,10 +44,10 @@ def __init__( self.encoder = TransformerEncoder( n_layers, d_model, - d_ffn, n_heads, d_k, d_v, + d_ffn, dropout, attn_dropout, ) diff --git a/pypots/imputation/transformer/model.py b/pypots/imputation/transformer/model.py index cf3c2178..047caccc 100644 --- a/pypots/imputation/transformer/model.py +++ b/pypots/imputation/transformer/model.py @@ -43,9 +43,6 @@ class Transformer(BaseNNImputer): The dimension of the model's backbone. It is the input dimension of the multi-head self-attention layers. - d_ffn : - The dimension of the layer in the Feed-Forward Networks (FFN). - n_heads : The number of heads in the multi-head self-attention mechanism. 
``d_model`` must be divisible by ``n_heads``, and the result should be equal to ``d_k``. @@ -59,6 +56,9 @@ class Transformer(BaseNNImputer): d_v : The dimension of the `values` (V) in the DMSA mechanism. + d_ffn : + The dimension of the layer in the Feed-Forward Networks (FFN). + dropout : The dropout rate for all fully-connected layers in the model. @@ -118,10 +118,10 @@ def __init__( n_features: int, n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float = 0, attn_dropout: float = 0, ORT_weight: int = 1, @@ -175,10 +175,10 @@ def __init__( self.n_features, self.n_layers, self.d_model, - self.d_ffn, self.n_heads, self.d_k, self.d_v, + self.d_ffn, self.dropout, self.attn_dropout, self.ORT_weight, diff --git a/pypots/nn/modules/autoformer/__init__.py b/pypots/nn/modules/autoformer/__init__.py index 8ab5d103..fde0a8da 100644 --- a/pypots/nn/modules/autoformer/__init__.py +++ b/pypots/nn/modules/autoformer/__init__.py @@ -20,7 +20,6 @@ from .auto_encoder import AutoformerEncoder from .layers import ( AutoCorrelation, - AutoCorrelationLayer, SeasonalLayerNorm, MovingAvgBlock, SeriesDecompositionBlock, @@ -30,7 +29,6 @@ __all__ = [ "AutoCorrelation", - "AutoCorrelationLayer", "SeasonalLayerNorm", "MovingAvgBlock", "SeriesDecompositionBlock", diff --git a/pypots/nn/modules/autoformer/auto_encoder.py b/pypots/nn/modules/autoformer/auto_encoder.py index 9760a738..d8d8473a 100644 --- a/pypots/nn/modules/autoformer/auto_encoder.py +++ b/pypots/nn/modules/autoformer/auto_encoder.py @@ -11,7 +11,6 @@ SeasonalLayerNorm, AutoformerEncoderLayer, AutoCorrelation, - AutoCorrelationLayer, ) from ..informer.auto_encoder import InformerEncoder @@ -20,8 +19,8 @@ class AutoformerEncoder(nn.Module): def __init__( self, n_layers, - n_heads, d_model, + n_heads, d_ffn, factor, moving_avg_window_size, @@ -33,12 +32,9 @@ def __init__( self.encoder = InformerEncoder( [ AutoformerEncoderLayer( - AutoCorrelationLayer( - AutoCorrelation(factor, dropout), - d_model, - n_heads, - ), + AutoCorrelation(factor, dropout), d_model, + n_heads, d_ffn, moving_avg_window_size, dropout, diff --git a/pypots/nn/modules/autoformer/layers.py b/pypots/nn/modules/autoformer/layers.py index 791c4373..b8daa873 100644 --- a/pypots/nn/modules/autoformer/layers.py +++ b/pypots/nn/modules/autoformer/layers.py @@ -6,14 +6,17 @@ # License: BSD-3-Clause import math +from typing import Tuple, Optional import torch import torch.fft import torch.nn as nn import torch.nn.functional as F +from ..transformer.attention import AttentionOperator, MultiHeadAttention -class AutoCorrelation(nn.Module): + +class AutoCorrelation(AttentionOperator): """ AutoCorrelation Mechanism with the following two phases: (1) period-based dependencies discovery @@ -132,63 +135,46 @@ def time_delay_agg_full(self, values, corr): delays_agg = delays_agg + pattern * (tmp_corr[..., i].unsqueeze(-1)) return delays_agg - def forward(self, queries, keys, values, attn_mask): - B, L, H, E = queries.shape - _, S, _, D = values.shape + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor] + # d_tensor could be d_q, d_k, d_v + + B, L, H, E = q.shape + _, S, _, D = v.shape if L > S: - zeros = torch.zeros_like(queries[:, : (L - S), :]).float() - values = torch.cat([values, zeros], dim=1) - keys = torch.cat([keys, zeros], dim=1) + zeros = 
torch.zeros_like(q[:, : (L - S), :]).float() + v = torch.cat([v, zeros], dim=1) + k = torch.cat([k, zeros], dim=1) else: - values = values[:, :L, :, :] - keys = keys[:, :L, :, :] + v = v[:, :L, :, :] + k = k[:, :L, :, :] # period-based dependencies - q_fft = torch.fft.rfft(queries.permute(0, 2, 3, 1).contiguous(), dim=-1) - k_fft = torch.fft.rfft(keys.permute(0, 2, 3, 1).contiguous(), dim=-1) + q_fft = torch.fft.rfft(q.permute(0, 2, 3, 1).contiguous(), dim=-1) + k_fft = torch.fft.rfft(k.permute(0, 2, 3, 1).contiguous(), dim=-1) res = q_fft * torch.conj(k_fft) corr = torch.fft.irfft(res, dim=-1) # time delay agg if self.training: V = self.time_delay_agg_training( - values.permute(0, 2, 3, 1).contiguous(), corr + v.permute(0, 2, 3, 1).contiguous(), corr ).permute(0, 3, 1, 2) else: V = self.time_delay_agg_inference( - values.permute(0, 2, 3, 1).contiguous(), corr + v.permute(0, 2, 3, 1).contiguous(), corr ).permute(0, 3, 1, 2) - return V.contiguous(), corr.permute(0, 3, 1, 2) - - -class AutoCorrelationLayer(nn.Module): - def __init__(self, correlation, d_model, n_heads, d_keys=None, d_values=None): - super().__init__() - - d_keys = d_keys or (d_model // n_heads) - d_values = d_values or (d_model // n_heads) - - self.inner_correlation = correlation - self.query_projection = nn.Linear(d_model, d_keys * n_heads) - self.key_projection = nn.Linear(d_model, d_keys * n_heads) - self.value_projection = nn.Linear(d_model, d_values * n_heads) - self.out_projection = nn.Linear(d_values * n_heads, d_model) - self.n_heads = n_heads - - def forward(self, queries, keys, values, attn_mask): - B, L, _ = queries.shape - _, S, _ = keys.shape - H = self.n_heads - - queries = self.query_projection(queries).view(B, L, H, -1) - keys = self.key_projection(keys).view(B, S, H, -1) - values = self.value_projection(values).view(B, S, H, -1) - - out, attn = self.inner_correlation(queries, keys, values, attn_mask) - out = out.view(B, L, -1) - - return self.out_projection(out), attn + attn = corr.permute(0, 3, 1, 2) + output = V.contiguous() + return output, attn class SeasonalLayerNorm(nn.Module): @@ -244,21 +230,28 @@ class AutoformerEncoderLayer(nn.Module): def __init__( self, - attention, - d_model, - d_ff=None, - moving_avg=25, - dropout=0.1, + attn_opt: AttentionOperator, + d_model: int, + n_heads: int, + d_ffn: int, + moving_avg: int = 25, + dropout: float = 0.1, activation="relu", ): super().__init__() - d_ff = d_ff or 4 * d_model - self.attention = attention + d_ffn = d_ffn or 4 * d_model + self.attention = MultiHeadAttention( + attn_opt, + d_model, + n_heads, + d_model // n_heads, + d_model // n_heads, + ) self.conv1 = nn.Conv1d( - in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False + in_channels=d_model, out_channels=d_ffn, kernel_size=1, bias=False ) self.conv2 = nn.Conv1d( - in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False + in_channels=d_ffn, out_channels=d_model, kernel_size=1, bias=False ) self.series_decomp1 = SeriesDecompositionBlock(moving_avg) self.series_decomp2 = SeriesDecompositionBlock(moving_avg) @@ -283,10 +276,11 @@ class AutoformerDecoderLayer(nn.Module): def __init__( self, - self_attention, - cross_attention, + self_attn_opt, + cross_attn_opt, d_model, - c_out, + n_heads, + d_out, d_ff=None, moving_avg=25, dropout=0.1, @@ -294,8 +288,20 @@ def __init__( ): super().__init__() d_ff = d_ff or 4 * d_model - self.self_attention = self_attention - self.cross_attention = cross_attention + self.self_attention = MultiHeadAttention( + self_attn_opt, + d_model, + n_heads, + 
d_model // n_heads, + d_model // n_heads, + ) + self.cross_attention = MultiHeadAttention( + cross_attn_opt, + d_model, + n_heads, + d_model // n_heads, + d_model // n_heads, + ) self.conv1 = nn.Conv1d( in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False ) @@ -308,7 +314,7 @@ def __init__( self.dropout = nn.Dropout(dropout) self.projection = nn.Conv1d( in_channels=d_model, - out_channels=c_out, + out_channels=d_out, kernel_size=3, stride=1, padding=1, diff --git a/pypots/nn/modules/crossformer/layers.py b/pypots/nn/modules/crossformer/layers.py index 74654dc6..0553a8d7 100644 --- a/pypots/nn/modules/crossformer/layers.py +++ b/pypots/nn/modules/crossformer/layers.py @@ -33,25 +33,25 @@ def __init__( super().__init__() d_ff = 4 * d_model if d_ff is None else d_ff self.time_attention = MultiHeadAttention( - n_heads, + ScaledDotProductAttention(d_k**0.5, attn_dropout), d_model, + n_heads, d_k, d_v, - ScaledDotProductAttention(d_k**0.5, attn_dropout), ) self.dim_sender = MultiHeadAttention( - n_heads, + ScaledDotProductAttention(d_k**0.5, attn_dropout), d_model, + n_heads, d_k, d_v, - ScaledDotProductAttention(d_k**0.5, attn_dropout), ) self.dim_receiver = MultiHeadAttention( - n_heads, + ScaledDotProductAttention(d_k**0.5, attn_dropout), d_model, + n_heads, d_k, d_v, - ScaledDotProductAttention(d_k**0.5, attn_dropout), ) self.router = nn.Parameter(torch.randn(seg_num, factor, d_model)) diff --git a/pypots/nn/modules/etsformer/auto_encoder.py b/pypots/nn/modules/etsformer/auto_encoder.py index 40d44e77..b30d7197 100644 --- a/pypots/nn/modules/etsformer/auto_encoder.py +++ b/pypots/nn/modules/etsformer/auto_encoder.py @@ -28,12 +28,12 @@ class ETSformerDecoder(nn.Module): def __init__(self, layers): super().__init__() self.d_model = layers[0].d_model - self.c_out = layers[0].c_out + self.d_out = layers[0].d_out self.pred_len = layers[0].pred_len - self.nhead = layers[0].nhead + self.n_head = layers[0].n_heads self.layers = nn.ModuleList(layers) - self.pred = nn.Linear(self.d_model, self.c_out) + self.pred = nn.Linear(self.d_model, self.d_out) def forward(self, growths, seasons): growth_repr = [] diff --git a/pypots/nn/modules/etsformer/layers.py b/pypots/nn/modules/etsformer/layers.py index 04832081..60e44798 100644 --- a/pypots/nn/modules/etsformer/layers.py +++ b/pypots/nn/modules/etsformer/layers.py @@ -109,20 +109,20 @@ def forward(self, x): class GrowthLayer(nn.Module): - def __init__(self, d_model, nhead, d_head=None, dropout=0.1): + def __init__(self, d_model, n_heads, d_head=None, dropout=0.1): super().__init__() - self.d_head = d_head or (d_model // nhead) + self.d_head = d_head or (d_model // n_heads) self.d_model = d_model - self.nhead = nhead + self.n_heads = n_heads - self.z0 = nn.Parameter(torch.randn(self.nhead, self.d_head)) - self.in_proj = nn.Linear(self.d_model, self.d_head * self.nhead) - self.es = ExponentialSmoothing(self.d_head, self.nhead, dropout=dropout) - self.out_proj = nn.Linear(self.d_head * self.nhead, self.d_model) + self.z0 = nn.Parameter(torch.randn(self.n_heads, self.d_head)) + self.in_proj = nn.Linear(self.d_model, self.d_head * self.n_heads) + self.es = ExponentialSmoothing(self.d_head, self.n_heads, dropout=dropout) + self.out_proj = nn.Linear(self.d_head * self.n_heads, self.d_model) assert ( - self.d_head * self.nhead == self.d_model - ), "d_model must be divisible by nhead" + self.d_head * self.n_heads == self.d_model + ), "d_model must be divisible by n_heads" def forward(self, inputs): """ @@ -130,7 +130,7 @@ def forward(self, inputs): 
:return: shape: (batch, seq_len, dim) """ b, t, d = inputs.shape - values = self.in_proj(inputs).view(b, t, self.nhead, -1) + values = self.in_proj(inputs).view(b, t, self.n_heads, -1) values = torch.cat([repeat(self.z0, "h d -> b 1 h d", b=b), values], dim=1) values = values[:, 1:] - values[:, :-1] out = self.es(values) @@ -219,32 +219,30 @@ def __init__( self, d_model, n_heads, - c_out, + d_out, seq_len, pred_len, k, - dim_feedforward=None, + d_ffn=None, dropout=0.1, activation="sigmoid", layer_norm_eps=1e-5, ): super().__init__() self.d_model = d_model - self.nhead = n_heads - self.c_out = c_out + self.n_heads = n_heads + self.d_out = d_out self.seq_len = seq_len self.pred_len = pred_len - dim_feedforward = dim_feedforward or 4 * d_model - self.dim_feedforward = dim_feedforward + d_ffn = d_ffn or 4 * d_model + self.d_ffn = d_ffn self.growth_layer = GrowthLayer(d_model, n_heads, dropout=dropout) self.seasonal_layer = FourierLayer(d_model, pred_len, k=k) - self.level_layer = LevelLayer(d_model, c_out, dropout=dropout) + self.level_layer = LevelLayer(d_model, d_out, dropout=dropout) # Implementation of Feedforward model - self.ff = Feedforward( - d_model, dim_feedforward, dropout=dropout, activation=activation - ) + self.ff = Feedforward(d_model, d_ffn, dropout=dropout, activation=activation) self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps) self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps) @@ -271,11 +269,11 @@ def _season_block(self, x): class DampingLayer(nn.Module): - def __init__(self, pred_len, nhead, dropout=0.1): + def __init__(self, pred_len, n_heads, dropout=0.1): super().__init__() self.pred_len = pred_len - self.nhead = nhead - self._damping_factor = nn.Parameter(torch.randn(1, nhead)) + self.n_heads = n_heads + self._damping_factor = nn.Parameter(torch.randn(1, n_heads)) self.dropout = nn.Dropout(dropout) def forward(self, x): @@ -286,7 +284,7 @@ def forward(self, x): powers = powers.view(self.pred_len, 1) damping_factors = self.damping_factor**powers damping_factors = damping_factors.cumsum(dim=0) - x = x.view(b, t, self.nhead, -1) + x = x.view(b, t, self.n_heads, -1) x = self.dropout(x) * damping_factors.unsqueeze(-1) return x.view(b, t, d) @@ -296,14 +294,14 @@ def damping_factor(self): class ETSformerDecoderLayer(nn.Module): - def __init__(self, d_model, nhead, c_out, pred_len, dropout=0.1): + def __init__(self, d_model, n_heads, d_out, pred_len, dropout=0.1): super().__init__() self.d_model = d_model - self.nhead = nhead - self.c_out = c_out + self.n_heads = n_heads + self.d_out = d_out self.pred_len = pred_len - self.growth_damping = DampingLayer(pred_len, nhead, dropout=dropout) + self.growth_damping = DampingLayer(pred_len, n_heads, dropout=dropout) self.dropout1 = nn.Dropout(dropout) def forward(self, growth, season): diff --git a/pypots/nn/modules/fedformer/autoencoder.py b/pypots/nn/modules/fedformer/autoencoder.py index c3a91dda..84cae344 100644 --- a/pypots/nn/modules/fedformer/autoencoder.py +++ b/pypots/nn/modules/fedformer/autoencoder.py @@ -16,7 +16,6 @@ from ....nn.modules.autoformer import ( AutoformerEncoderLayer, AutoformerDecoderLayer, - AutoCorrelationLayer, SeasonalLayerNorm, ) from ....nn.modules.informer import InformerEncoder, InformerDecoder @@ -27,8 +26,8 @@ def __init__( self, n_steps, n_layers, - n_heads, d_model, + n_heads, d_ffn, moving_avg_window_size, dropout, @@ -57,12 +56,9 @@ def __init__( self.encoder = InformerEncoder( [ AutoformerEncoderLayer( - AutoCorrelationLayer( - encoder_self_att, # instead of multi-head attention in 
transformer - d_model, - n_heads, - ), + encoder_self_att, # instead of multi-head attention in transformer d_model, + n_heads, d_ffn, moving_avg_window_size, dropout, @@ -134,9 +130,10 @@ def __init__( self.decoder = InformerDecoder( [ AutoformerDecoderLayer( - AutoCorrelationLayer(decoder_self_att, d_model, n_heads), - AutoCorrelationLayer(decoder_cross_att, d_model, n_heads), + decoder_self_att, + decoder_cross_att, d_model, + n_heads, d_output, d_ffn, moving_avg=moving_avg_window_size, diff --git a/pypots/nn/modules/fedformer/layers.py b/pypots/nn/modules/fedformer/layers.py index 24ad5907..d02bc996 100644 --- a/pypots/nn/modules/fedformer/layers.py +++ b/pypots/nn/modules/fedformer/layers.py @@ -7,7 +7,7 @@ import math from functools import partial -from typing import List, Tuple +from typing import List, Tuple, Optional import numpy as np import torch @@ -17,6 +17,8 @@ from torch import Tensor from torch import nn +from ..transformer.attention import AttentionOperator + def legendreDer(k, x): def _legendre(k, x): @@ -395,7 +397,7 @@ def evenOdd(self, x): return x -class MultiWaveletTransform(nn.Module): +class MultiWaveletTransform(AttentionOperator): """ 1D multiwavelet block. """ @@ -422,19 +424,29 @@ def __init__( self.ich = ich self.MWT_CZ = nn.ModuleList(MWT_CZ1d(k, alpha, L, c, base) for i in range(nCZ)) - def forward(self, queries, keys, values, attn_mask): - B, L, H, E = queries.shape - _, S, _, D = values.shape + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, None]: + # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor] + # d_tensor could be d_q, d_k, d_v + + B, L, H, E = q.shape + _, S, _, D = v.shape if L > S: - zeros = torch.zeros_like(queries[:, : (L - S), :]).float() - values = torch.cat([values, zeros], dim=1) - keys = torch.cat([keys, zeros], dim=1) + zeros = torch.zeros_like(q[:, : (L - S), :]).float() + v = torch.cat([v, zeros], dim=1) + # k = torch.cat([k, zeros], dim=1) else: - values = values[:, :L, :, :] - keys = keys[:, :L, :, :] - values = values.view(B, L, -1) + v = v[:, :L, :, :] + # k = k[:, :L, :, :] + v = v.reshape(B, L, -1) - V = self.Lk0(values).view(B, L, self.c, -1) + V = self.Lk0(v).view(B, L, self.c, -1) for i in range(self.nCZ): V = self.MWT_CZ[i](V) if i < self.nCZ - 1: @@ -442,7 +454,7 @@ def forward(self, queries, keys, values, attn_mask): V = self.Lk1(V.view(B, L, -1)) V = V.view(B, L, -1, D) - return (V.contiguous(), None) + return V.contiguous(), None class FourierCrossAttentionW(nn.Module): @@ -531,7 +543,7 @@ def forward(self, q, k, v, mask): return (out, None) -class MultiWaveletCross(nn.Module): +class MultiWaveletCross(AttentionOperator): """ 1D Multiwavelet Cross Attention layer. 
""" @@ -554,7 +566,6 @@ def __init__( **kwargs, ): super().__init__() - # print("base", base) self.c = c self.k = k @@ -620,7 +631,17 @@ def __init__( self.out = nn.Linear(c * k, ich) self.modes1 = modes - def forward(self, q, k, v, mask=None): + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, None]: + # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor] + # d_tensor could be d_q, d_k, d_v + B, N, H, E = q.shape # (B, N, H, E) torch.Size([3, 768, 8, 2]) _, S, _, _ = k.shape # (B, S, H, E) torch.Size([3, 96, 8, 2]) @@ -680,11 +701,11 @@ def forward(self, q, k, v, mask=None): dq, sq = Ud_q[i], Us_q[i] dv, sv = Ud_v[i], Us_v[i] Ud += [ - self.attn1(dq[0], dk[0], dv[0], mask)[0] - + self.attn2(dq[1], dk[1], dv[1], mask)[0] + self.attn1(dq[0], dk[0], dv[0], attn_mask)[0] + + self.attn2(dq[1], dk[1], dv[1], attn_mask)[0] ] - Us += [self.attn3(sq, sk, sv, mask)[0]] - v = self.attn4(q, k, v, mask)[0] + Us += [self.attn3(sq, sk, sv, attn_mask)[0]] + v = self.attn4(q, k, v, attn_mask)[0] # reconstruct for i in range(ns - 1 - self.L, -1, -1): @@ -692,7 +713,7 @@ def forward(self, q, k, v, mask=None): v = torch.cat((v, Ud[i]), -1) v = self.evenOdd(v) v = self.out(v[:, :N, :, :].contiguous().view(B, N, -1)) - return (v.contiguous(), None) + return v.contiguous(), None def wavelet_transform(self, x): xa = torch.cat( @@ -736,7 +757,7 @@ def get_frequency_modes(seq_len, modes=64, mode_select_method="random"): # ########## fourier layer ############# -class FourierBlock(nn.Module): +class FourierBlock(AttentionOperator): def __init__( self, in_channels, out_channels, seq_len, modes=0, mode_select_method="random" ): @@ -769,8 +790,17 @@ def compl_mul1d(self, input, weights): # (batch, in_channel, x ), (in_channel, out_channel, x) -> (batch, out_channel, x) return torch.einsum("bhi,hio->bho", input, weights) - def forward(self, q, k, v, mask): - # size = [B, L, H, E] + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, None]: + # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor] + # d_tensor could be d_q, d_k, d_v + B, L, H, E = q.shape x = q.permute(0, 2, 3, 1) # Compute Fourier coefficients @@ -783,11 +813,11 @@ def forward(self, q, k, v, mask): ) # Return to time domain x = torch.fft.irfft(out_ft, n=x.size(-1)) - return (x, None) + return x, None # ########## Fourier Cross Former #################### -class FourierCrossAttention(nn.Module): +class FourierCrossAttention(AttentionOperator): def __init__( self, in_channels, @@ -863,8 +893,17 @@ def compl_mul1d(self, order, x, weights): else: return torch.einsum(order, x.real, weights.real) - def forward(self, q, k, v, mask): - # size = [B, L, H, E] + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, None]: + # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor] + # d_tensor could be d_q, d_k, d_v + B, L, H, E = q.shape xq = q.permute(0, 2, 3, 1) # size = [B, H, E, L] xk = k.permute(0, 2, 3, 1) @@ -912,4 +951,4 @@ def forward(self, q, k, v, mask): out = torch.fft.irfft( out_ft / self.in_channels / self.out_channels, n=xq.size(-1) ) - return (out, None) + return out, None diff --git a/pypots/nn/modules/informer/layers.py b/pypots/nn/modules/informer/layers.py index 
f3f0602c..d7f92dc3 100644 --- a/pypots/nn/modules/informer/layers.py +++ b/pypots/nn/modules/informer/layers.py @@ -134,6 +134,8 @@ def forward( attn_mask: Optional[torch.Tensor] = None, **kwargs, ): + # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor] + # d_tensor could be d_q, d_k, d_v B, L_Q, H, D = q.shape _, L_K, _, _ = k.shape diff --git a/pypots/nn/modules/patchtst/auto_encoder.py b/pypots/nn/modules/patchtst/auto_encoder.py index 9f4c0926..8263817d 100644 --- a/pypots/nn/modules/patchtst/auto_encoder.py +++ b/pypots/nn/modules/patchtst/auto_encoder.py @@ -14,11 +14,11 @@ class PatchtstEncoder(nn.Module): def __init__( self, n_layers: int, - n_heads: int, d_model: int, - d_ffn: int, + n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float, attn_dropout: float, ): @@ -30,10 +30,10 @@ def __init__( self.encoder = TransformerEncoder( n_layers, d_model, - d_ffn, n_heads, d_k, d_v, + d_ffn, dropout, attn_dropout, ) diff --git a/pypots/nn/modules/raindrop/backbone.py b/pypots/nn/modules/raindrop/backbone.py index 3d23146b..06f74d06 100644 --- a/pypots/nn/modules/raindrop/backbone.py +++ b/pypots/nn/modules/raindrop/backbone.py @@ -21,8 +21,8 @@ def __init__( n_features, n_layers, d_model, - d_ffn, n_heads, + d_ffn, n_classes, dropout=0.3, max_len=215, diff --git a/pypots/nn/modules/saits/backbone.py b/pypots/nn/modules/saits/backbone.py index fdf45f3c..0b0911c6 100644 --- a/pypots/nn/modules/saits/backbone.py +++ b/pypots/nn/modules/saits/backbone.py @@ -21,14 +21,14 @@ class BackboneSAITS(nn.Module): def __init__( self, - n_layers: int, n_steps: int, n_features: int, + n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float, attn_dropout: float, ): @@ -48,12 +48,12 @@ def __init__( self.layer_stack_for_first_block = nn.ModuleList( [ TransformerEncoderLayer( + ScaledDotProductAttention(d_k**0.5, attn_dropout), d_model, - d_ffn, n_heads, d_k, d_v, - ScaledDotProductAttention(d_k**0.5, attn_dropout), + d_ffn, dropout, ) for _ in range(n_layers) @@ -72,12 +72,12 @@ def __init__( self.layer_stack_for_second_block = nn.ModuleList( [ TransformerEncoderLayer( + ScaledDotProductAttention(d_k**0.5, attn_dropout), d_model, - d_ffn, n_heads, d_k, d_v, - ScaledDotProductAttention(d_k**0.5, attn_dropout), + d_ffn, dropout, ) for _ in range(n_layers) diff --git a/pypots/nn/modules/transformer/attention.py b/pypots/nn/modules/transformer/attention.py index 1d3b0a52..ecc4f85e 100644 --- a/pypots/nn/modules/transformer/attention.py +++ b/pypots/nn/modules/transformer/attention.py @@ -93,9 +93,12 @@ def forward( The scaled dot-product attention map. """ - # q, k, v all have 4 dimensions [batch_size, n_heads, n_steps, d_tensor] + # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor] # d_tensor could be d_q, d_k, d_v + # transpose for attention dot product: [batch_size, n_heads, n_steps, d_k or d_v] + q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) + # dot product q with k.T to obtain similarity attn = torch.matmul(q / self.temperature, k.transpose(2, 3)) @@ -118,30 +121,30 @@ class MultiHeadAttention(nn.Module): Parameters ---------- - n_heads: - The number of heads in multi-head attention. + attn_opt: + The attention operator, e.g. the self-attention proposed in Transformer. d_model: The dimension of the input tensor. + n_heads: + The number of heads in multi-head attention. + d_k: The dimension of the key and query tensor. d_v: The dimension of the value tensor. 
- attention_operator: - The attention operator, e.g. the self-attention proposed in Transformer. - """ def __init__( self, - n_heads: int, + attn_opt: AttentionOperator, d_model: int, + n_heads: int, d_k: int, d_v: int, - attention_operator: AttentionOperator, ): super().__init__() @@ -153,7 +156,7 @@ def __init__( self.w_ks = nn.Linear(d_model, n_heads * d_k, bias=False) self.w_vs = nn.Linear(d_model, n_heads * d_v, bias=False) - self.attention_operator = attention_operator + self.attention_operator = attn_opt self.fc = nn.Linear(n_heads * d_v, d_model, bias=False) def forward( @@ -190,10 +193,8 @@ def forward( The attention map. """ - # the input q, k, v currently have 3 dimensions [batch_size, n_steps, d_tensor] - # d_tensor could be n_heads*d_k, n_heads*d_v + # the shapes of q, k, v are the same [batch_size, n_steps, d_model] - # keep useful variables batch_size, q_len = q.size(0), q.size(1) k_len = k.size(1) v_len = v.size(1) @@ -202,9 +203,7 @@ def forward( q = self.w_qs(q).view(batch_size, q_len, self.n_heads, self.d_k) k = self.w_ks(k).view(batch_size, k_len, self.n_heads, self.d_k) v = self.w_vs(v).view(batch_size, v_len, self.n_heads, self.d_v) - - # transpose for self-attention calculation -> [batch_size, n_steps, d_k or d_v, n_heads] - q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) + # for generalization, we don't do transposing here but leave it for the attention operator if necessary if attn_mask is not None: # broadcasting on the head axis diff --git a/pypots/nn/modules/transformer/auto_encoder.py b/pypots/nn/modules/transformer/auto_encoder.py index bbc66f84..494e9718 100644 --- a/pypots/nn/modules/transformer/auto_encoder.py +++ b/pypots/nn/modules/transformer/auto_encoder.py @@ -27,9 +27,6 @@ class TransformerEncoder(nn.Module): The dimension of the module manipulation space. The input tensor will be projected to a space with d_model dimensions. - d_ffn: - The dimension of the hidden layer in the feed-forward network. - n_heads: The number of heads in multi-head attention. @@ -39,6 +36,9 @@ class TransformerEncoder(nn.Module): d_v: The dimension of the value tensor. + d_ffn: + The dimension of the hidden layer in the feed-forward network. + dropout: The dropout rate. @@ -51,10 +51,10 @@ def __init__( self, n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float, attn_dropout: float, ): @@ -63,12 +63,12 @@ def __init__( self.enc_layer_stack = nn.ModuleList( [ TransformerEncoderLayer( + ScaledDotProductAttention(d_k**0.5, attn_dropout), d_model, - d_ffn, n_heads, d_k, d_v, - ScaledDotProductAttention(d_k**0.5, attn_dropout), + d_ffn, dropout, ) for _ in range(n_layers) @@ -114,22 +114,19 @@ class TransformerDecoder(nn.Module): Parameters ---------- - n_layers: - The number of layers in the decoder. - n_steps: The number of time steps in the input tensor. n_features: The number of features in the input tensor. + n_layers: + The number of layers in the decoder. + d_model: The dimension of the module manipulation space. The input tensor will be projected to a space with d_model dimensions. - d_ffn: - The dimension of the hidden layer in the feed-forward network. - n_heads: The number of heads in multi-head attention. @@ -139,6 +136,9 @@ class TransformerDecoder(nn.Module): d_v: The dimension of the value tensor. + d_ffn: + The dimension of the hidden layer in the feed-forward network. + dropout: The dropout rate. 
@@ -149,14 +149,14 @@ class TransformerDecoder(nn.Module): def __init__( self, - n_layers: int, n_steps: int, n_features: int, + n_layers: int, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, + d_ffn: int, dropout: float, attn_dropout: float, ): @@ -167,13 +167,13 @@ def __init__( self.layer_stack = nn.ModuleList( [ TransformerDecoderLayer( + ScaledDotProductAttention(d_k**0.5, attn_dropout), + ScaledDotProductAttention(d_k**0.5, attn_dropout), d_model, - d_ffn, n_heads, d_k, d_v, - ScaledDotProductAttention(d_k**0.5, attn_dropout), - ScaledDotProductAttention(d_k**0.5, attn_dropout), + d_ffn, dropout, ) for _ in range(n_layers) diff --git a/pypots/nn/modules/transformer/layers.py b/pypots/nn/modules/transformer/layers.py index 02bb408b..0cda956c 100644 --- a/pypots/nn/modules/transformer/layers.py +++ b/pypots/nn/modules/transformer/layers.py @@ -70,12 +70,12 @@ class TransformerEncoderLayer(nn.Module): Parameters ---------- + attn_opt: + The attention operator for the multi-head attention module in the encoder layer. + d_model: The dimension of the input tensor. - d_ffn: - The dimension of the hidden layer. - n_heads: The number of heads in multi-head attention. @@ -85,8 +85,8 @@ class TransformerEncoderLayer(nn.Module): d_v: The dimension of the value tensor. - slf_attn_opt: - The attention operator for the self multi-head attention module in the encoder layer. + d_ffn: + The dimension of the hidden layer. dropout: The dropout rate. @@ -95,16 +95,22 @@ class TransformerEncoderLayer(nn.Module): def __init__( self, + attn_opt: AttentionOperator, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, - slf_attn_opt: AttentionOperator, + d_ffn: int, dropout: float = 0.1, ): super().__init__() - self.slf_attn = MultiHeadAttention(n_heads, d_model, d_k, d_v, slf_attn_opt) + self.slf_attn = MultiHeadAttention( + attn_opt, + d_model, + n_heads, + d_k, + d_v, + ) self.dropout = nn.Dropout(dropout) self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) self.pos_ffn = PositionWiseFeedForward(d_model, d_ffn, dropout) @@ -158,12 +164,15 @@ class TransformerDecoderLayer(nn.Module): Parameters ---------- + slf_attn_opt: + The attention operator for the multi-head attention module in the decoder layer. + + enc_attn_opt: + The attention operator for the encoding multi-head attention module in the decoder layer. + d_model: The dimension of the input tensor. - d_ffn: - The dimension of the hidden layer. - n_heads: The number of heads in multi-head attention. @@ -173,11 +182,8 @@ class TransformerDecoderLayer(nn.Module): d_v: The dimension of the value tensor. - slf_attn_opt: - The attention operator for the self multi-head attention module in the decoder layer. - - enc_attn_opt: - The attention operator for the encoding multi-head attention module in the decoder layer. + d_ffn: + The dimension of the hidden layer. dropout: The dropout rate. 
@@ -186,18 +192,30 @@ class TransformerDecoderLayer(nn.Module): def __init__( self, + slf_attn_opt: AttentionOperator, + enc_attn_opt: AttentionOperator, d_model: int, - d_ffn: int, n_heads: int, d_k: int, d_v: int, - slf_attn_opt: AttentionOperator, - enc_attn_opt: AttentionOperator, + d_ffn: int, dropout: float = 0.1, ): super().__init__() - self.slf_attn = MultiHeadAttention(n_heads, d_model, d_k, d_v, slf_attn_opt) - self.enc_attn = MultiHeadAttention(n_heads, d_model, d_k, d_v, enc_attn_opt) + self.slf_attn = MultiHeadAttention( + slf_attn_opt, + d_model, + n_heads, + d_k, + d_v, + ) + self.enc_attn = MultiHeadAttention( + enc_attn_opt, + d_model, + n_heads, + d_k, + d_v, + ) self.pos_ffn = PositionWiseFeedForward(d_model, d_ffn, dropout) def forward( diff --git a/tests/classification/raindrop.py b/tests/classification/raindrop.py index 10363b78..1909c076 100644 --- a/tests/classification/raindrop.py +++ b/tests/classification/raindrop.py @@ -42,8 +42,8 @@ class TestRaindrop(unittest.TestCase): DATA["n_classes"], n_layers=2, d_model=DATA["n_features"] * 4, - d_ffn=32, n_heads=2, + d_ffn=32, dropout=0.3, d_static=0, aggregation="mean", diff --git a/tests/imputation/autoformer.py b/tests/imputation/autoformer.py index f68da280..3050f0f2 100644 --- a/tests/imputation/autoformer.py +++ b/tests/imputation/autoformer.py @@ -46,8 +46,8 @@ class TestAutoformer(unittest.TestCase): DATA["n_steps"], DATA["n_features"], n_layers=2, - n_heads=2, d_model=32, + n_heads=2, d_ffn=32, factor=3, moving_avg_window_size=3, diff --git a/tests/imputation/crossformer.py b/tests/imputation/crossformer.py index e33459ca..c11792c2 100644 --- a/tests/imputation/crossformer.py +++ b/tests/imputation/crossformer.py @@ -46,8 +46,8 @@ class TestCrossformer(unittest.TestCase): DATA["n_steps"], DATA["n_features"], n_layers=2, - n_heads=2, d_model=32, + n_heads=2, d_ffn=32, factor=10, seg_len=12, diff --git a/tests/imputation/etsformer.py b/tests/imputation/etsformer.py index 3ade3dfd..94bf57b7 100644 --- a/tests/imputation/etsformer.py +++ b/tests/imputation/etsformer.py @@ -47,8 +47,8 @@ class TestETSformer(unittest.TestCase): DATA["n_features"], n_e_layers=2, n_d_layers=2, - n_heads=2, d_model=32, + n_heads=2, d_ffn=32, top_k=3, dropout=0, diff --git a/tests/imputation/fedformer.py b/tests/imputation/fedformer.py index fe563582..fe72721a 100644 --- a/tests/imputation/fedformer.py +++ b/tests/imputation/fedformer.py @@ -46,8 +46,8 @@ class TestFEDformer(unittest.TestCase): DATA["n_steps"], DATA["n_features"], n_layers=1, - n_heads=2, d_model=32, + n_heads=2, d_ffn=32, moving_avg_window_size=3, dropout=0, diff --git a/tests/imputation/informer.py b/tests/imputation/informer.py index 63689b03..78dbbedf 100644 --- a/tests/imputation/informer.py +++ b/tests/imputation/informer.py @@ -46,8 +46,8 @@ class TestInformer(unittest.TestCase): DATA["n_steps"], DATA["n_features"], n_layers=2, - n_heads=2, d_model=32, + n_heads=2, d_ffn=32, factor=3, dropout=0, diff --git a/tests/imputation/itransformer.py b/tests/imputation/itransformer.py index 18db52ca..d47f1e1f 100644 --- a/tests/imputation/itransformer.py +++ b/tests/imputation/itransformer.py @@ -47,10 +47,10 @@ class TestiTransformer(unittest.TestCase): DATA["n_features"], n_layers=2, d_model=32, - d_ffn=32, n_heads=2, d_k=16, d_v=16, + d_ffn=32, dropout=0.1, epochs=EPOCHS, saving_path=saving_path, diff --git a/tests/imputation/patchtst.py b/tests/imputation/patchtst.py index fcfdff4b..5dd6fe21 100644 --- a/tests/imputation/patchtst.py +++ b/tests/imputation/patchtst.py @@ 
-47,10 +47,10 @@ class TestPatchTST(unittest.TestCase): DATA["n_features"], n_layers=2, d_model=64, - d_ffn=32, n_heads=2, d_k=16, d_v=16, + d_ffn=32, patch_len=DATA["n_steps"], stride=8, dropout=0.1, diff --git a/tests/imputation/saits.py b/tests/imputation/saits.py index 325b28d2..69dfe94b 100644 --- a/tests/imputation/saits.py +++ b/tests/imputation/saits.py @@ -47,10 +47,10 @@ class TestSAITS(unittest.TestCase): DATA["n_features"], n_layers=2, d_model=32, - d_ffn=32, n_heads=2, d_k=16, d_v=16, + d_ffn=32, dropout=0.1, epochs=EPOCHS, saving_path=saving_path, diff --git a/tests/imputation/transformer.py b/tests/imputation/transformer.py index 06839b95..e509c899 100644 --- a/tests/imputation/transformer.py +++ b/tests/imputation/transformer.py @@ -47,10 +47,10 @@ class TestTransformer(unittest.TestCase): DATA["n_features"], n_layers=2, d_model=32, - d_ffn=32, n_heads=2, d_k=16, d_v=16, + d_ffn=32, dropout=0.1, epochs=EPOCHS, saving_path=saving_path,
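As a quick reference for the reordered ``MultiHeadAttention`` signature above (the attention operator now comes first, followed by ``d_model``, ``n_heads``, ``d_k``, ``d_v``), here is a minimal usage sketch. The import path follows the module edited in this diff; the batch size and step count are illustrative assumptions, not values from the patch:

import torch

# assumed import path, taken from pypots/nn/modules/transformer/attention.py in this diff
from pypots.nn.modules.transformer.attention import (
    MultiHeadAttention,
    ScaledDotProductAttention,
)

d_model, n_heads, d_k, d_v = 32, 2, 16, 16

# new argument order: attention operator first, then d_model, n_heads, d_k, d_v
attention = MultiHeadAttention(
    ScaledDotProductAttention(d_k**0.5, 0.1),  # temperature, attn_dropout
    d_model,
    n_heads,
    d_k,
    d_v,
)

x = torch.randn(8, 24, d_model)  # assumed shape [batch_size, n_steps, d_model]
output, attn_weights = attention(x, x, x, attn_mask=None)
# output keeps the [batch_size, n_steps, d_model] shape; attn_weights is the attention map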
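The model-level constructors touched here follow the same convention (``d_model`` before ``n_heads``, ``d_ffn`` after ``d_k``/``d_v``). A sketch with the keyword arguments spelled out in the new positional order, mirroring the updated tests; the dataset shape values are assumptions:

from pypots.imputation import SAITS

saits = SAITS(
    n_steps=48,       # assumed sequence length
    n_features=37,    # assumed feature count
    n_layers=2,
    d_model=32,
    n_heads=2,
    d_k=16,
    d_v=16,
    d_ffn=32,
    dropout=0.1,
    epochs=10,
)

Because these are keyword arguments, existing calls that pass parameters by name keep working; only positional calls need to adopt the new order.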