From 9998d38af701bb8b5af8068234afcbeab2cb9d81 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Wed, 4 Sep 2024 17:54:12 +0200 Subject: [PATCH 001/132] include MambAttn --- mambular/arch_utils/mambattn_arch.py | 138 ++++++++ mambular/base_models/__init__.py | 2 + mambular/base_models/mambattn.py | 226 +++++++++++++ mambular/configs/mambattention_config.py | 125 +++++++ mambular/models/__init__.py | 8 + mambular/models/mambattention.py | 399 +++++++++++++++++++++++ 6 files changed, 898 insertions(+) create mode 100644 mambular/arch_utils/mambattn_arch.py create mode 100644 mambular/base_models/mambattn.py create mode 100644 mambular/configs/mambattention_config.py create mode 100644 mambular/models/mambattention.py diff --git a/mambular/arch_utils/mambattn_arch.py b/mambular/arch_utils/mambattn_arch.py new file mode 100644 index 0000000..33eae60 --- /dev/null +++ b/mambular/arch_utils/mambattn_arch.py @@ -0,0 +1,138 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from .normalization_layers import RMSNorm +from .mamba_arch import ResidualBlock + + +class MambAttn(nn.Module): + """Mamba model composed of alternating MambaBlocks and Attention layers. + + Attributes: + config (MambaConfig): Configuration object for the Mamba model. + layers (nn.ModuleList): List of alternating ResidualBlock (Mamba layers) and attention layers constituting the model. + """ + + def __init__( + self, + d_model=32, + n_layers=8, + n_attention_layers=1, # Introduce attention layer count + n_mamba_per_attention=1, # Ratio of Mamba layers to attention layers + n_heads=4, # Number of attention heads + expand_factor=2, + bias=False, + d_conv=8, + conv_bias=True, + dropout=0.0, + attn_dropout=0.1, + dt_rank="auto", + d_state=16, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + last_layer="attn", # Define the desired last layer type + dt_min=1e-03, + dt_init_floor=1e-04, + norm=RMSNorm, + activation=F.silu, + bidirectional=False, + use_learnable_interaction=False, + layer_norm_eps=1e-05, + AD_weight_decay=False, + BC_layer_norm=True, + ): + super().__init__() + + # Define Mamba and Attention layers alternation + self.layers = nn.ModuleList() + + total_blocks = n_layers + n_attention_layers # Total blocks to be created + attention_count = 0 + + for i in range(total_blocks): + if (i + 1) % ( + n_mamba_per_attention + 1 + ) == 0: # Insert attention layer after N Mamba layers + self.layers.append( + nn.MultiheadAttention( + embed_dim=d_model, num_heads=n_heads, dropout=attn_dropout + ) + ) + attention_count += 1 + else: + self.layers.append( + ResidualBlock( + d_model, + expand_factor, + bias, + d_conv, + conv_bias, + dropout, + dt_rank, + d_state, + dt_scale, + dt_init, + dt_max, + dt_min, + dt_init_floor, + norm, + activation, + bidirectional, + use_learnable_interaction, + layer_norm_eps, + AD_weight_decay, + BC_layer_norm, + ) + ) + + # Check the type of the last layer and append the desired one if necessary + if last_layer == "attn": + if not isinstance(self.layers[-1], nn.MultiheadAttention): + self.layers.append( + nn.MultiheadAttention( + embed_dim=d_model, num_heads=n_heads, dropout=dropout + ) + ) + else: + if not isinstance(self.layers[-1], ResidualBlock): + self.layers.append( + ResidualBlock( + d_model, + expand_factor, + bias, + d_conv, + conv_bias, + dropout, + dt_rank, + d_state, + dt_scale, + dt_init, + dt_max, + dt_min, + dt_init_floor, + norm, + activation, + bidirectional, + use_learnable_interaction, + layer_norm_eps, + AD_weight_decay, + BC_layer_norm, + ) + ) + + 
def forward(self, x): + for layer in self.layers: + if isinstance(layer, nn.MultiheadAttention): + # If it's an attention layer, handle input shape (seq_len, batch, embed_dim) + x = x.transpose( + 0, 1 + ) # Switch to (seq_len, batch, embed_dim) for attention + x, _ = layer(x, x, x) + x = x.transpose(0, 1) # Switch back to (batch, seq_len, embed_dim) + else: + # Otherwise, pass through Mamba block + x = layer(x) + + return x diff --git a/mambular/base_models/__init__.py b/mambular/base_models/__init__.py index 6756093..895881b 100644 --- a/mambular/base_models/__init__.py +++ b/mambular/base_models/__init__.py @@ -6,6 +6,7 @@ from .resnet import ResNet from .tabtransformer import TabTransformer from .mambatab import MambaTab +from .mambattn import MambAttn __all__ = [ "TaskModel", @@ -16,4 +17,5 @@ "MLP", "BaseModel", "MambaTab", + "MambAttn", ] diff --git a/mambular/base_models/mambattn.py b/mambular/base_models/mambattn.py new file mode 100644 index 0000000..7acd0b7 --- /dev/null +++ b/mambular/base_models/mambattn.py @@ -0,0 +1,226 @@ +import torch +import torch.nn as nn +from ..arch_utils.mambattn_arch import MambAttn +from ..arch_utils.mlp_utils import MLP +from ..arch_utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) +from ..configs.mambattention_config import DefaultMambAttentionConfig +from .basemodel import BaseModel +from ..arch_utils.embedding_layer import EmbeddingLayer + + +class MambAttention(BaseModel): + """ + A PyTorch model for tasks utilizing the Mamba architecture and various normalization techniques. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + num_classes : int, optional + Number of output classes (default is 1). + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + **kwargs : dict + Additional keyword arguments. + + Attributes + ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. + mamba : Mamba + Mamba architecture component. + norm_f : nn.Module + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. 
+ """ + + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes=1, + config: DefaultMambAttentionConfig = DefaultMambAttentionConfig(), + **kwargs, + ): + super().__init__(**kwargs) + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) + self.shuffle_embeddings = self.hparams.get( + "shuffle_embeddings", config.shuffle_embeddings + ) + self.cat_feature_info = cat_feature_info + self.num_feature_info = num_feature_info + + self.mamba = MambAttn( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), + bidirectional=self.hparams.get("bidiretional", config.bidirectional), + use_learnable_interaction=self.hparams.get( + "use_learnable_interactions", config.use_learnable_interaction + ), + AD_weight_decay=self.hparams.get("AB_weight_decay", config.AD_weight_decay), + BC_layer_norm=self.hparams.get("AB_layer_norm", config.BC_layer_norm), + layer_norm_eps=self.hparams.get("layer_norm_eps", config.layer_norm_eps), + ) + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm( + self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps + ) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm( + self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps + ) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm( + self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps + ) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm( + self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps + ) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm( + 1, + self.hparams.get("d_model", config.d_model), + eps=config.layer_norm_eps, + ) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") + + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + d_model=self.hparams.get("d_model", config.d_model), + embedding_activation=self.hparams.get( + "embedding_activation", config.embedding_activation + ), + layer_norm_after_embedding=self.hparams.get( + "layer_norm_after_embedding", config.layer_norm_after_embedding + ), + use_cls=False, + 
cls_position=-1, + cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), + ) + + head_activation = self.hparams.get("head_activation", config.head_activation) + + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + n_output_units=num_classes, + ) + + if self.pooling_method == "cls": + self.use_cls = True + else: + self.use_cls = self.hparams.get("use_cls", config.use_cls) + + if self.shuffle_embeddings: + self.perm = torch.randperm(self.embedding_layer.seq_len) + + def forward(self, num_features, cat_features): + """ + Defines the forward pass of the model. + + Parameters + ---------- + num_features : Tensor + Tensor containing the numerical features. + cat_features : Tensor + Tensor containing the categorical features. + + Returns + ------- + Tensor + The output predictions of the model. + """ + x = self.embedding_layer(num_features, cat_features) + + if self.shuffle_embeddings: + x = x[:, self.perm, :] + + x = self.mamba(x) + + if self.pooling_method == "avg": + x = torch.mean(x, dim=1) + elif self.pooling_method == "max": + x, _ = torch.max(x, dim=1) + elif self.pooling_method == "sum": + x = torch.sum(x, dim=1) + elif self.pooling_method == "cls_token": + x = x[:, -1] + elif self.pooling_method == "last": + x = x[:, -1] + else: + raise ValueError(f"Invalid pooling method: {self.pooling_method}") + + x = self.norm_f(x) + preds = self.tabular_head(x) + + return preds diff --git a/mambular/configs/mambattention_config.py b/mambular/configs/mambattention_config.py new file mode 100644 index 0000000..5bbae65 --- /dev/null +++ b/mambular/configs/mambattention_config.py @@ -0,0 +1,125 @@ +from dataclasses import dataclass +import torch.nn as nn + + +@dataclass +class DefaultMambAttentionConfig: + """ + Configuration class for the Default Mambular model with predefined hyperparameters. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the model. + expand_factor : int, default=2 + Expansion factor for the feed-forward layers. + bias : bool, default=False + Whether to use bias in the linear layers. + d_conv : int, default=16 + Dimensionality of the convolutional layers. + conv_bias : bool, default=True + Whether to use bias in the convolutional layers. + dropout : float, default=0.05 + Dropout rate for regularization. + dt_rank : str, default="auto" + Rank of the decision tree. + d_state : int, default=32 + Dimensionality of the state in recurrent layers. + dt_scale : float, default=1.0 + Scaling factor for decision tree. + dt_init : str, default="random" + Initialization method for decision tree. + dt_max : float, default=0.1 + Maximum value for decision tree initialization. + dt_min : float, default=1e-04 + Minimum value for decision tree initialization. 
+ dt_init_floor : float, default=1e-04 + Floor value for decision tree initialization. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the model. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="avg" + Pooling method to be used ('avg', 'max', etc.). + bidirectional : bool, default=False + Whether to use bidirectional processing of the input sequences. + use_learnable_interaction : bool, default=False + Whether to use learnable feature interactions before passing through mamba blocks. + use_cls : bool, default=True + Whether to append a cls to the end of each 'sequence'. + shuffle_embeddings : bool, default=False. + Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=False + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. 
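+    n_heads : int, default=4
+        Number of attention heads used in the interleaved attention layers.
+    attn_dropout : float, default=0.1
+        Dropout rate applied inside the attention layers.
+    last_layer : str, default="attn"
+        Type of the final block, either an attention layer ("attn") or a Mamba block.
+    n_mamba_per_attention : int, default=1
+        Number of Mamba blocks placed between two attention layers.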
+ """ + + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 64 + n_layers: int = 4 + expand_factor: int = 2 + n_heads: int = 4 + last_layer: str = "attn" + n_mamba_per_attention: int = 1 + bias: bool = False + d_conv: int = 4 + conv_bias: bool = True + dropout: float = 0.0 + attn_dropout: float = 0.1 + dt_rank: str = "auto" + d_state: int = 128 + dt_scale: float = 1.0 + dt_init: str = "random" + dt_max: float = 0.1 + dt_min: float = 1e-04 + dt_init_floor: float = 1e-04 + norm: str = "LayerNorm" + activation: callable = nn.SiLU() + embedding_activation: callable = nn.Identity() + head_layer_sizes: list = () + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = False + layer_norm_after_embedding: bool = False + pooling_method: str = "avg" + bidirectional: bool = False + use_learnable_interaction: bool = False + use_cls: bool = False + shuffle_embeddings: bool = False + layer_norm_eps: float = 1e-05 + AD_weight_decay: bool = True + BC_layer_norm: bool = False + cat_encoding: str = "int" diff --git a/mambular/models/__init__.py b/mambular/models/__init__.py index 6b9f40c..720d264 100644 --- a/mambular/models/__init__.py +++ b/mambular/models/__init__.py @@ -17,6 +17,11 @@ from .mambatab import MambaTabClassifier, MambaTabRegressor, MambaTabLSS from .tabularnn import TabulaRNNClassifier, TabulaRNNRegressor, TabulaRNNLSS +from .mambattention import ( + MambAttentionClassifier, + MambAttentionRegressor, + MambAttentionLSS, +) __all__ = [ @@ -44,4 +49,7 @@ "TabulaRNNClassifier", "TabulaRNNRegressor", "TabulaRNNLSS", + "MambAttentionClassifier", + "MambAttentionRegressor", + "MambAttentionLSS", ] diff --git a/mambular/models/mambattention.py b/mambular/models/mambattention.py new file mode 100644 index 0000000..5c31754 --- /dev/null +++ b/mambular/models/mambattention.py @@ -0,0 +1,399 @@ +from .sklearn_base_regressor import SklearnBaseRegressor +from .sklearn_base_lss import SklearnBaseLSS +from .sklearn_base_classifier import SklearnBaseClassifier +from ..base_models.mambattn import MambAttention +from ..configs.mambattention_config import DefaultMambAttentionConfig + + +class MambAttentionRegressor(SklearnBaseRegressor): + """ + MambAttention regressor. This class extends the SklearnBaseRegressor class and uses the MambAttention model + with the default MambAttention configuration. + + The accepted arguments to the MambAttentionRegressor class include both the attributes in the DefaultMambAttentionConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the model. + expand_factor : int, default=2 + Expansion factor for the feed-forward layers. + bias : bool, default=False + Whether to use bias in the linear layers. + d_conv : int, default=16 + Dimensionality of the convolutional layers. + conv_bias : bool, default=True + Whether to use bias in the convolutional layers. + dropout : float, default=0.05 + Dropout rate for regularization. 
+ dt_rank : str, default="auto" + Rank of the decision tree. + d_state : int, default=32 + Dimensionality of the state in recurrent layers. + dt_scale : float, default=1.0 + Scaling factor for decision tree. + dt_init : str, default="random" + Initialization method for decision tree. + dt_max : float, default=0.1 + Maximum value for decision tree initialization. + dt_min : float, default=1e-04 + Minimum value for decision tree initialization. + dt_init_floor : float, default=1e-04 + Floor value for decision tree initialization. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the model. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="avg" + Pooling method to be used ('avg', 'max', etc.). + bidirectional : bool, default=False + Whether to use bidirectional processing of the input sequences. + use_learnable_interaction : bool, default=False + Whether to use learnable feature interactions before passing through mamba blocks. + use_cls : bool, default=True + Whether to append a cls to the end of each 'sequence'. + shuffle_embeddings : bool, default=False. + Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=False + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. 
+ degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the MambAttentionRegressor class are the same as the attributes in the DefaultMambAttentionConfig dataclass. + - MambAttentionRegressor uses SklearnBaseRegressor as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + MambAttention.models.SklearnBaseRegressor : The parent class for MambAttentionRegressor. + + Examples + -------- + >>> from MambAttention.models import MambAttentionRegressor + >>> model = MambAttentionRegressor(d_model=64, n_layers=8) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__( + model=MambAttention, config=DefaultMambAttentionConfig, **kwargs + ) + + +class MambAttentionClassifier(SklearnBaseClassifier): + """ + MambAttention classifier. This class extends the SklearnBaseClassifier class and uses the MambAttention model + with the default MambAttention configuration. + + The accepted arguments to the MambAttentionClassifier class include both the attributes in the DefaultMambAttentionConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the model. + expand_factor : int, default=2 + Expansion factor for the feed-forward layers. + bias : bool, default=False + Whether to use bias in the linear layers. + d_conv : int, default=16 + Dimensionality of the convolutional layers. + conv_bias : bool, default=True + Whether to use bias in the convolutional layers. + dropout : float, default=0.05 + Dropout rate for regularization. + dt_rank : str, default="auto" + Rank of the decision tree. + d_state : int, default=32 + Dimensionality of the state in recurrent layers. + dt_scale : float, default=1.0 + Scaling factor for decision tree. + dt_init : str, default="random" + Initialization method for decision tree. + dt_max : float, default=0.1 + Maximum value for decision tree initialization. + dt_min : float, default=1e-04 + Minimum value for decision tree initialization. + dt_init_floor : float, default=1e-04 + Floor value for decision tree initialization. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the model. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. 
+ head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="avg" + Pooling method to be used ('avg', 'max', etc.). + bidirectional : bool, default=False + Whether to use bidirectional processing of the input sequences. + use_learnable_interaction : bool, default=False + Whether to use learnable feature interactions before passing through mamba blocks. + shuffle_embeddings : bool, default=False. + Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=False + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the MambAttentionClassifier class are the same as the attributes in the DefaultMambAttentionConfig dataclass. + - MambAttentionClassifier uses SklearnBaseClassifier as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + MambAttention.models.SklearnBaseClassifier : The parent class for MambAttentionClassifier. + + Examples + -------- + >>> from MambAttention.models import MambAttentionClassifier + >>> model = MambAttentionClassifier(d_model=64, n_layers=8) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__( + model=MambAttention, config=DefaultMambAttentionConfig, **kwargs + ) + + +class MambAttentionLSS(SklearnBaseLSS): + """ + MambAttention for distributional regression. 
This class extends the SklearnBaseLSS class and uses the MambAttention model + with the default MambAttention configuration. + + The accepted arguments to the MambAttentionLSS class include both the attributes in the DefaultMambAttentionConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + family : str, default=None + Distributional family to be used for the model. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the model. + expand_factor : int, default=2 + Expansion factor for the feed-forward layers. + bias : bool, default=False + Whether to use bias in the linear layers. + d_conv : int, default=16 + Dimensionality of the convolutional layers. + conv_bias : bool, default=True + Whether to use bias in the convolutional layers. + dropout : float, default=0.05 + Dropout rate for regularization. + dt_rank : str, default="auto" + Rank of the decision tree. + d_state : int, default=32 + Dimensionality of the state in recurrent layers. + dt_scale : float, default=1.0 + Scaling factor for decision tree. + dt_init : str, default="random" + Initialization method for decision tree. + dt_max : float, default=0.1 + Maximum value for decision tree initialization. + dt_min : float, default=1e-04 + Minimum value for decision tree initialization. + dt_init_floor : float, default=1e-04 + Floor value for decision tree initialization. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the model. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="avg" + Pooling method to be used ('avg', 'max', etc.). + bidirectional : bool, default=False + Whether to use bidirectional processing of the input sequences. + use_learnable_interaction : bool, default=False + Whether to use learnable feature interactions before passing through mamba blocks. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + shuffle_embeddings : bool, default=False. + Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=False + whether to apply layer normalization to B-C matrices. 
+ cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task : str, default="regression" + Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning as ple. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the MambAttentionLSS class are the same as the attributes in the DefaultMambAttentionConfig dataclass. + - MambAttentionLSS uses SklearnBaseLSS as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + MambAttention.models.SklearnBaseLSS : The parent class for MambAttentionLSS. 
+ + Examples + -------- + >>> from MambAttention.models import MambAttentionLSS + >>> model = MambAttentionLSS(d_model=64, n_layers=8) + >>> model.fit(X_train, y_train, family="normal") + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__( + model=MambAttention, config=DefaultMambAttentionConfig, **kwargs + ) From de0886e3beb77ea3bdfacd99f3520610384c4c4d Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Wed, 4 Sep 2024 18:01:48 +0200 Subject: [PATCH 002/132] adapt mabattn config --- mambular/configs/mambattention_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mambular/configs/mambattention_config.py b/mambular/configs/mambattention_config.py index 5bbae65..0c3ce8b 100644 --- a/mambular/configs/mambattention_config.py +++ b/mambular/configs/mambattention_config.py @@ -90,14 +90,14 @@ class DefaultMambAttentionConfig: d_model: int = 64 n_layers: int = 4 expand_factor: int = 2 - n_heads: int = 4 + n_heads: int = 8 last_layer: str = "attn" n_mamba_per_attention: int = 1 bias: bool = False d_conv: int = 4 conv_bias: bool = True dropout: float = 0.0 - attn_dropout: float = 0.1 + attn_dropout: float = 0.2 dt_rank: str = "auto" d_state: int = 128 dt_scale: float = 1.0 From d39c056150377d0a8e926b647864044a741d34b7 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Fri, 6 Sep 2024 22:41:54 +0200 Subject: [PATCH 003/132] pruning for hpo --- mambular/base_models/lightning_wrapper.py | 82 +++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/mambular/base_models/lightning_wrapper.py b/mambular/base_models/lightning_wrapper.py index 6d3f5c3..dc45472 100644 --- a/mambular/base_models/lightning_wrapper.py +++ b/mambular/base_models/lightning_wrapper.py @@ -37,6 +37,8 @@ def __init__( lss=False, family=None, loss_fct: callable = None, + early_pruning_threshold=None, + pruning_epoch=5, **kwargs, ): super().__init__() @@ -44,6 +46,9 @@ def __init__( self.lss = lss self.family = family self.loss_fct = loss_fct + self.early_pruning_threshold = early_pruning_threshold + self.pruning_epoch = pruning_epoch + self.val_losses = [] if lss: pass @@ -260,6 +265,83 @@ def test_step(self, batch, batch_idx): return test_loss + def on_validation_epoch_end(self): + """ + Callback executed at the end of each validation epoch. + + This method retrieves the current validation loss from the trainer's callback metrics + and stores it in a list for tracking validation losses across epochs. It also applies + pruning logic to stop training early if the validation loss exceeds a specified threshold. + + Parameters + ---------- + None + + Attributes + ---------- + val_loss : torch.Tensor or None + The validation loss for the current epoch, retrieved from `self.trainer.callback_metrics`. + val_loss_value : float + The validation loss for the current epoch, converted to a float. + val_losses : list of float + A list storing the validation losses for each epoch. + pruning_epoch : int + The epoch after which pruning logic will be applied. + early_pruning_threshold : float, optional + The threshold for early pruning based on validation loss. If the current validation + loss exceeds this value, training will be stopped early. + + Notes + ----- + If the current epoch is greater than or equal to `pruning_epoch`, and the validation + loss exceeds the `early_pruning_threshold`, the training is stopped early by setting + `self.trainer.should_stop` to True. 
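+
+        For example (illustrative numbers only): with ``pruning_epoch=5`` and
+        ``early_pruning_threshold=0.8``, a validation loss of 0.95 at epoch 6
+        sets ``self.trainer.should_stop = True`` and training is halted early.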
+ """ + val_loss = self.trainer.callback_metrics.get("val_loss") + if val_loss is not None: + val_loss_value = val_loss.item() + self.val_losses.append(val_loss_value) # Store val_loss for each epoch + + # Apply pruning logic if needed + if self.current_epoch >= self.pruning_epoch: + if ( + self.early_pruning_threshold is not None + and val_loss_value > self.early_pruning_threshold + ): + print( + f"Pruned at epoch {self.current_epoch}, val_loss {val_loss_value}" + ) + self.trainer.should_stop = True # Stop training early + + def epoch_val_loss_at(self, epoch): + """ + Retrieve the validation loss at a specific epoch. + + This method allows the user to query the validation loss for any given epoch, + provided the epoch exists within the range of completed epochs. If the epoch + exceeds the length of the `val_losses` list, a default value of infinity is returned. + + Parameters + ---------- + epoch : int + The epoch number for which the validation loss is requested. + + Returns + ------- + float + The validation loss for the requested epoch. If the epoch does not exist, + the method returns `float("inf")`. + + Notes + ----- + This method relies on `self.val_losses` which stores the validation loss values + at the end of each epoch during training. + """ + if epoch < len(self.val_losses): + return self.val_losses[epoch] + else: + return float("inf") + def configure_optimizers(self): """ Sets up the model's optimizer and learning rate scheduler based on the configurations provided. From c1467bd834abffdc0f3cdc385354e72ddc8689bc Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Fri, 6 Sep 2024 22:42:07 +0200 Subject: [PATCH 004/132] include config mapper for hpo --- mambular/utils/config_mapper.py | 113 ++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 mambular/utils/config_mapper.py diff --git a/mambular/utils/config_mapper.py b/mambular/utils/config_mapper.py new file mode 100644 index 0000000..d7f4b85 --- /dev/null +++ b/mambular/utils/config_mapper.py @@ -0,0 +1,113 @@ +from skopt.space import Real, Integer, Categorical +import torch.nn as nn +from ..arch_utils.transformer_utils import ReGLU + + +def round_to_nearest_16(x): + """Rounds the value to the nearest multiple of 16.""" + return int(round(x / 16) * 16) + + +def get_search_space(config): + """ + Given a model configuration, return the hyperparameter search space + based on the config attributes. + + Parameters + ---------- + config : dataclass + The configuration object for the model. + + Returns + ------- + param_names : list + A list of parameter names to be optimized. + param_space : list + A list of hyperparameter ranges for Bayesian optimization. 
+ """ + + search_space_mapping = { + # Learning rate-related parameters + "lr": Real(1e-6, 1e-2, prior="log-uniform"), + "lr_patience": Integer(5, 20), + "lr_factor": Real(0.1, 0.5), + # Model architecture parameters + "n_layers": Integer(1, 8), + "d_model": Integer(16, 512), # Dimension of the model + "dropout": Real(0.0, 0.5), + "expand_factor": Integer(1, 4), + "d_state": Integer(16, 512), + "ff_dropout": Real(0.0, 0.5), + "rnn_dropout": Real(0.0, 0.5), + "attn_dropout": Real(0.0, 0.5), + "n_heads": Integer(1, 8), + "transformer_dim_feedforward": Integer(16, 512), + # Convolution-related parameters + "d_conv": Integer(4, 128), # Dimension of convolution layers + "conv_bias": Categorical([True, False]), + # Normalization and regularization + "norm": Categorical(["LayerNorm", "BatchNorm", "RMSNorm"]), + "weight_decay": Real(1e-8, 1e-2, prior="log-uniform"), + "layer_norm_eps": Real(1e-7, 1e-4), + "head_dropout": Real(0.0, 0.5), + "bias": Categorical([True, False]), + "norm_first": Categorical([True, False]), + # Pooling, activation, and head layer settings + "pooling_method": Categorical(["avg", "max", "cls", "sum"]), + "activation": Categorical( + ["ReLU", "SELU", "Identity", "Tanh", "LeakyReLU", "SiLU"] + ), + "embedding_activation": Categorical( + ["ReLU", "SELU", "Identity", "Tanh", "LeakyReLU"] + ), + "rnn_activation": Categorical(["relu", "tanh"]), + "transformer_activation": Categorical( + ["ReLU", "SELU", "Identity", "Tanh", "LeakyReLU", "ReGLU"] + ), + "head_skip_layers": Categorical([True, False]), + "head_use_batch_norm": Categorical([True, False]), + # Sequence-related settings + "bidirectional": Categorical([True, False]), + "use_learnable_interaction": Categorical([True, False]), + "use_cls": Categorical([True, False]), + # Feature encoding + "cat_encoding": Categorical(["int", "one-hot"]), + } + + layer_size_min, layer_size_max = 16, 512 # Dynamic layer sizes + max_head_layers = 5 # Set a maximum number of layers for optimization + + param_names = [] + param_space = [] + + for field in config.__dataclass_fields__: + if field in search_space_mapping: + param_names.append(field) + param_space.append(search_space_mapping[field]) + + # Handle head_layer_sizes dynamically by setting the length and individual sizes + if "head_layer_sizes" in config.__dataclass_fields__: + param_names.append("head_layer_size_length") + param_space.append( + Integer(1, max_head_layers) + ) # Optimize the length of the list + + # Optimize individual layer sizes based on max_head_layers + for i in range(max_head_layers): + # Optimize over integers and multiply by 16 to ensure divisibility by 16 + param_names.append(f"head_layer_size_{i+1}") + param_space.append(Integer(layer_size_min, layer_size_max)) + + return param_names, param_space + + +activation_mapper = { + "ReLU": nn.ReLU(), + "Tanh": nn.Tanh(), + "SiLU": nn.SiLU(), + "LeakyReLU": nn.LeakyReLU(), + "Identity": nn.Identity(), + "Linear": nn.Identity(), + "SELU": nn.SELU(), + "ReGLU": ReGLU(), +} From 85adc317884c35b869f06b8af952282346c49aa7 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Fri, 6 Sep 2024 22:42:26 +0200 Subject: [PATCH 005/132] include gp_minimize in sklearn base regressor --- mambular/models/sklearn_base_regressor.py | 188 +++++++++++++++++++++- 1 file changed, 183 insertions(+), 5 deletions(-) diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 1a098ac..85a2c7c 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -9,7 
+9,16 @@ from ..data_utils.datamodule import MambularDataModule from ..preprocessing import Preprocessor from lightning.pytorch.callbacks import ModelSummary -from dataclasses import asdict, is_dataclass +from skopt import gp_minimize +from skopt.space import Real, Integer, Categorical +import torch.nn as nn +from sklearn.metrics import mean_squared_error +import warnings +from ..utils.config_mapper import ( + get_search_space, + activation_mapper, + round_to_nearest_16, +) class SklearnBaseRegressor(BaseEstimator): @@ -183,7 +192,7 @@ def build_model( val_size=val_size, random_state=random_state, regression=True, - **dataloader_kwargs + **dataloader_kwargs, ) self.data_module.preprocess_data( @@ -258,7 +267,7 @@ def fit( checkpoint_path="model_checkpoints", dataloader_kwargs={}, rebuild=True, - **trainer_kwargs + **trainer_kwargs, ): """ Trains the regression model using the provided training data. Optionally, a separate validation set can be used. @@ -329,7 +338,7 @@ def fit( val_size=val_size, random_state=random_state, regression=True, - **dataloader_kwargs + **dataloader_kwargs, ) self.data_module.preprocess_data( @@ -370,7 +379,7 @@ def fit( checkpoint_callback, ModelSummary(max_depth=2), ], - **trainer_kwargs + **trainer_kwargs, ) self.trainer.fit(self.task_model, self.data_module) @@ -500,3 +509,172 @@ def score(self, X, y, metric=mean_squared_error): """ predictions = self.predict(X) return metric(y, predictions) + + def optimize_hparams( + self, + X, + y, + X_val=None, + y_val=None, + time=100, + max_epochs=200, + prune_by_epoch=True, + prune_epoch=5, + **optimize_kwargs, + ): + """ + Optimizes hyperparameters using Bayesian optimization with optional pruning. + + Parameters + ---------- + X : array-like + Training data. + y : array-like + Training labels. + X_val, y_val : array-like, optional + Validation data and labels. + time : int + The number of optimization trials to run. + max_epochs : int + Maximum number of epochs for training. + prune_by_epoch : bool + Whether to prune based on a specific epoch (True) or the best validation loss (False). + prune_epoch : int + The specific epoch to prune by when prune_by_epoch is True. + **optimize_kwargs : dict + Additional keyword arguments passed to the fit method. + + Returns + ------- + best_hparams : list + Best hyperparameters found during optimization. 
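+
+        Examples
+        --------
+        A sketch of the intended call pattern (argument values are illustrative
+        only; any regressor built on ``SklearnBaseRegressor`` works the same way)::
+
+            >>> from mambular.models import MambularRegressor
+            >>> reg = MambularRegressor()
+            >>> best = reg.optimize_hparams(
+            ...     X_train, y_train, X_val=X_val, y_val=y_val,
+            ...     time=30, max_epochs=50,
+            ... )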
+ """ + + # Define the hyperparameter search space from the model config + param_names, param_space = get_search_space(self.config) + + # Initial model fitting to get the baseline validation loss + self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) + best_val_loss = float("inf") + + if X_val is not None and y_val is not None: + val_loss = self.evaluate( + X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} + )["Mean Squared Error"] + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] + + best_val_loss = val_loss + best_epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + + def _objective(hyperparams): + nonlocal best_val_loss, best_epoch_val_loss # Access across trials + + head_layer_sizes = [] + + for key, param_value in zip(param_names, hyperparams): + if key == "head_layer_size_length": + head_layer_size_length = param_value + elif key.startswith("head_layer_size_"): + head_layer_sizes.append(round_to_nearest_16(param_value)) + else: + field_type = self.config.__dataclass_fields__[key].type + + # Check if the field is a callable (e.g., activation function) + if field_type == callable and isinstance(param_value, str): + if param_value in activation_mapper: + setattr(self.config, key, activation_mapper[param_value]) + else: + raise ValueError( + f"Unknown activation function: {param_value}" + ) + else: + setattr(self.config, key, param_value) + + # Truncate or use part of head_layer_sizes based on the optimized length + if head_layer_size_length is not None: + setattr( + self.config, + "head_layer_sizes", + head_layer_sizes[:head_layer_size_length], + ) + + print(head_layer_sizes) + + # Build the model with updated hyperparameters + self.build_model( + X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs + ) + + # Dynamically set the early pruning threshold + if prune_by_epoch: + early_pruning_threshold = ( + best_epoch_val_loss * 1.5 + ) # Prune based on specific epoch loss + else: + early_pruning_threshold = ( + best_val_loss * 1.5 + ) # Prune based on the best overall validation loss + + # Initialize the model with pruning + self.task_model.early_pruning_threshold = early_pruning_threshold + self.task_model.pruning_epoch = prune_epoch + + # Fit the model (limit epochs for faster optimization) + self.fit( + X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False + ) + + # Retrieve the current validation loss + if X_val is not None and y_val is not None: + val_loss = self.evaluate( + X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} + )["Mean Squared Error"] + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] + + # Retrieve validation loss at the specified epoch (e.g., epoch 5) + epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + + # Update the best validation loss at the specified epoch + if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: + best_epoch_val_loss = epoch_val_loss + + # Update the best overall validation loss + if val_loss < best_val_loss: + best_val_loss = val_loss + + return val_loss + + # Perform Bayesian optimization using scikit-optimize + result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) + + # Update the model with the best-found hyperparameters + best_hparams = result.x + if "head_layer_sizes" in self.config.__dataclass_fields__: + head_layer_sizes = [] + + # Iterate over the best hyperparameters found by optimization + for key, param_value in 
zip(param_names, best_hparams): + if key.startswith("head_layer_size_"): + # These are the individual head layer sizes + head_layer_sizes.append(round_to_nearest_16(param_value)) + else: + # For all other config values, update normally + field_type = self.config.__dataclass_fields__[key].type + if field_type == callable and isinstance(param_value, str): + setattr(self.config, key, activation_mapper[param_value]) + else: + setattr(self.config, key, param_value) + + # After the loop, set head_layer_sizes in the config + if head_layer_sizes: + setattr(self.config, "head_layer_sizes", head_layer_sizes) + + print("Best hyperparameters found:", best_hparams) + + return best_hparams From 6a6a51bbdabaca7ccd68d0c50b46048fd1127a02 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Wed, 11 Sep 2024 09:35:41 +0200 Subject: [PATCH 006/132] fix bug in TabTransformer embedding_layer --- mambular/base_models/tabtransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index d9c5052..0e4f472 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -189,7 +189,7 @@ def forward(self, num_features, cat_features): Tensor The output predictions of the model. """ - cat_embeddings = self.embedding_layer({}, cat_features) + cat_embeddings = self.embedding_layer(None, cat_features) num_features = torch.cat(num_features, dim=1) num_embeddings = self.norm_f(num_features) From 29811d6cdfffb4f5b6e33a245f2be77439806046 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Wed, 11 Sep 2024 10:00:01 +0200 Subject: [PATCH 007/132] mlp basemodel convenience fix --- mambular/base_models/mlp.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index 9f61cab..4e50366 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -44,6 +44,7 @@ def __init__( self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.layer_sizes = self.hparams.get("layer_sizes", self.layer_sizes) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -69,28 +70,28 @@ def __init__( ) # Input layer - self.layers.append(nn.Linear(input_dim, config.layer_sizes[0])) + self.layers.append(nn.Linear(input_dim, self.layer_sizes[0])) if config.batch_norm: - self.layers.append(nn.BatchNorm1d(config.layer_sizes[0])) + self.layers.append(nn.BatchNorm1d(self.layer_sizes[0])) norm_layer = self.hparams.get("norm", config.norm) if norm_layer == "RMSNorm": - self.norm_f = RMSNorm(config.layer_sizes[0]) + self.norm_f = RMSNorm(self.layer_sizes[0]) elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm(config.layer_sizes[0]) + self.norm_f = LayerNorm(self.layer_sizes[0]) elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm(config.layer_sizes[0]) + self.norm_f = BatchNorm(self.layer_sizes[0]) elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm(config.layer_sizes[0]) + self.norm_f = InstanceNorm(self.layer_sizes[0]) elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm(1, config.layer_sizes[0]) + self.norm_f = GroupNorm(1, self.layer_sizes[0]) elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling(config.layer_sizes[0]) + self.norm_f = 
LearnableLayerScaling(self.layer_sizes[0]) else: self.norm_f = None if self.norm_f is not None: - self.layers.append(self.norm_f(config.layer_sizes[0])) + self.layers.append(self.norm_f(self.layer_sizes[0])) if config.use_glu: self.layers.append(nn.GLU()) @@ -100,14 +101,12 @@ def __init__( self.layers.append(nn.Dropout(config.dropout)) # Hidden layers - for i in range(1, len(config.layer_sizes)): - self.layers.append( - nn.Linear(config.layer_sizes[i - 1], config.layer_sizes[i]) - ) + for i in range(1, len(self.layer_sizes)): + self.layers.append(nn.Linear(self.layer_sizes[i - 1], self.layer_sizes[i])) if config.batch_norm: - self.layers.append(nn.BatchNorm1d(config.layer_sizes[i])) + self.layers.append(nn.BatchNorm1d(self.layer_sizes[i])) if config.layer_norm: - self.layers.append(nn.LayerNorm(config.layer_sizes[i])) + self.layers.append(nn.LayerNorm(self.layer_sizes[i])) if config.use_glu: self.layers.append(nn.GLU()) else: @@ -116,7 +115,7 @@ def __init__( self.layers.append(nn.Dropout(config.dropout)) # Output layer - self.layers.append(nn.Linear(config.layer_sizes[-1], num_classes)) + self.layers.append(nn.Linear(self.layer_sizes[-1], num_classes)) if self.use_embeddings: self.embedding_layer = EmbeddingLayer( From 1cb93c59e19af3bcd91c9088d1d7674154c04819 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Wed, 11 Sep 2024 10:07:52 +0200 Subject: [PATCH 008/132] resnet convenience fix --- mambular/base_models/resnet.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index a6a03b7..2b584d3 100644 --- a/mambular/base_models/resnet.py +++ b/mambular/base_models/resnet.py @@ -45,6 +45,7 @@ def __init__( self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.layer_sizes = self.hparams.get("layer_sizes", self.layer_sizes) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info self.activation = config.activation @@ -78,15 +79,15 @@ def __init__( else: self.norm_f = None - self.initial_layer = nn.Linear(input_dim, config.layer_sizes[0]) + self.initial_layer = nn.Linear(input_dim, self.layer_sizes[0]) self.blocks = nn.ModuleList() for i in range(config.num_blocks): - input_dim = config.layer_sizes[i] + input_dim = self.layer_sizes[i] output_dim = ( - config.layer_sizes[i + 1] - if i + 1 < len(config.layer_sizes) - else config.layer_sizes[-1] + self.layer_sizes[i + 1] + if i + 1 < len(self.layer_sizes) + else self.layer_sizes[-1] ) block = ResidualBlock( input_dim, @@ -97,7 +98,7 @@ def __init__( ) self.blocks.append(block) - self.output_layer = nn.Linear(config.layer_sizes[-1], num_classes) + self.output_layer = nn.Linear(self.layer_sizes[-1], num_classes) if self.use_embeddings: self.embedding_layer = EmbeddingLayer( From 4ef4b5d440f8974d06bcb130e3e1f92c4ef7005a Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Fri, 13 Sep 2024 10:46:33 +0200 Subject: [PATCH 009/132] config convenience fix --- mambular/base_models/tabularnn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index a3e31bc..9f3c54f 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -111,7 +111,10 @@ def __init__( n_output_units=num_classes, ) - self.linear = nn.Linear(config.d_model, config.dim_feedforward) + self.linear = 
nn.Linear( + self.hparams.get("d_model", config.d_model), + self.hparams.get("dim_feedforward", config.dim_feedforward), + ) def forward(self, num_features, cat_features): """ From bf906836304e5040b36b593f80dca067f064d65a Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Fri, 13 Sep 2024 17:36:01 +0200 Subject: [PATCH 010/132] add efficiency scripts --- efficiency/efficiency.ipynb | 474 ++++++++++++++++++++++++++++++++++++ 1 file changed, 474 insertions(+) create mode 100644 efficiency/efficiency.ipynb diff --git a/efficiency/efficiency.ipynb b/efficiency/efficiency.ipynb new file mode 100644 index 0000000..2dae5bb --- /dev/null +++ b/efficiency/efficiency.ipynb @@ -0,0 +1,474 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from mambular.base_models.mambular import Mambular\n", + "from mambular.base_models.tabtransformer import TabTransformer\n", + "from mambular.base_models.ft_transformer import FTTransformer\n", + "from mambular.base_models.mlp import MLP\n", + "from mambular.base_models.mambatab import MambaTab\n", + "from mambular.base_models.resnet import ResNet\n", + "from mambular.base_models.mambattn import MambAttention\n", + "from mambular.base_models.tabularnn import TabulaRNN\n", + "import pandas as pd\n", + "import numpy as np\n", + "from accelerate import Accelerator\n", + "from accelerate.utils import ProfileKwargs\n", + "import re\n", + "from torch.profiler import profile, ProfilerActivity\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features (10-100) GPU efficiency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Initialize an empty DataFrame to store the results\n", + "df_results = pd.DataFrame(\n", + " columns=[\"Model\", \"Num Features\", \"Total CUDA Memory (MB)\", \"Total CUDA Time (ms)\"]\n", + ")\n", + "\n", + "# Set up the profiler with memory profiling enabled\n", + "profile_kwargs = ProfileKwargs(\n", + " activities=[\"cpu\", \"cuda\"], profile_memory=True, record_shapes=True\n", + ")\n", + "accelerator = Accelerator(cpu=False, kwargs_handlers=[profile_kwargs])\n", + "\n", + "# Loop over different numbers of features\n", + "for n_features in range(10, 100, 10): \n", + " # Updated dictionaries for feature info\n", + " cat_feature_info = {\n", + " f\"cat_feature_{i}\": 10 for i in range(int(n_features/2))\n", + " } # 10 categories: 0 to 9\n", + " num_feature_info = {\n", + " f\"num_feature_{i}\": 64 for i in range(int(n_features/2))\n", + " } # 128-dimensional numerical features\n", + "\n", + " # Create random numerical and categorical features, and move to CUDA\n", + " num_features = [torch.randn(32, 64).cuda() for _ in range(int(n_features/2))]\n", + " cat_features = [\n", + " torch.randint(low=0, high=10, size=(32, 1)).cuda() for _ in range(int(n_features/2))\n", + " ]\n", + "\n", + " models = [\n", + " Mambular(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_model=64,\n", + " ).cuda(),\n", + " FTTransformer(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_model=64,\n", + " n_layers=5,\n", + " ).cuda(),\n", + " TabulaRNN(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " d_model=128,\n", + " 
dim_feedforward=256,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " n_layers=4,\n", + " ).cuda(),\n", + " MLP(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " layer_sizes=[512, 256, 128, 32],\n", + " ).cuda(),\n", + " ResNet(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " layer_sizes=[512, 256, 16],\n", + " ).cuda(),\n", + " MambAttention(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_state=172,\n", + " ).cuda(),\n", + " ]\n", + "\n", + " # Iterate over the models\n", + " for model in models:\n", + " # Prepare the model using the accelerator\n", + " #model = accelerator.prepare(model)\n", + "\n", + " # Profiling the model\n", + " with profile(profile_memory=True, record_shapes=True) as prof:\n", + " with torch.no_grad():\n", + " outputs = model(num_features, cat_features)\n", + "\n", + " # Extract key metrics from profiler\n", + " key_averages = prof.key_averages()\n", + " key_avg_output = str(key_averages.total_average())\n", + "\n", + "\n", + "\n", + " # Extract cuda_memory_usage\n", + " cuda_memory_match = re.search(r'cuda_memory_usage=(\\d+)', key_avg_output)\n", + " total_cuda_memory = int(cuda_memory_match.group(1)) / (1024 ** 2) if cuda_memory_match else 0.0 # Convert to MB\n", + "\n", + " # Extract cpu_memory_usage\n", + " cpu_memory_match = re.search(r'cpu_memory_usage=(\\d+)', key_avg_output)\n", + " total_cpu_memory = int(cpu_memory_match.group(1)) / (1024 ** 2) if cpu_memory_match else 0.0 # Convert to MB\n", + "\n", + " # Extract self_cpu_time (convert from ms)\n", + " cpu_time_match = re.search(r'self_cpu_time=([\\d.]+)ms', key_avg_output)\n", + " total_cpu_time = float(cpu_time_match.group(1)) if cpu_time_match else 0.0 # CPU time in ms\n", + "\n", + " # Extract self_cuda_time (convert from ms)\n", + " cuda_time_match = re.search(r'self_cuda_time=([\\d.]+)ms', key_avg_output)\n", + " total_cuda_time = float(cuda_time_match.group(1)) if cuda_time_match else 0.0 # CUDA time in ms\n", + "\n", + " new_row = {\n", + " \"Model\": model.__class__.__name__,\n", + " \"Num Features\": n_features,\n", + " \"Total CPU Time (ms)\": total_cpu_time,\n", + " \"Total CUDA Time (ms)\": total_cuda_time,\n", + " \"Total CPU Memory (MB)\": total_cpu_memory,\n", + " \"Total CUDA Memory (MB)\": total_cuda_memory,\n", + " }\n", + "\n", + " # Append the new row to the DataFrame using pd.concat\n", + " df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)\n", + "\n", + "# Display the profiling results\n", + "print(df_results.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Features (0-1000) GPU Efficiency. 
Batch Size is adapted to 8 to avoid crashes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from mambular.base_models.mambular import Mambular\n", + "from mambular.base_models.tabtransformer import TabTransformer\n", + "from mambular.base_models.ft_transformer import FTTransformer\n", + "from mambular.base_models.mlp import MLP\n", + "from mambular.base_models.resnet import ResNet\n", + "from mambular.base_models.mambattn import MambAttention\n", + "from mambular.base_models.tabularnn import TabulaRNN\n", + "from accelerate import Accelerator\n", + "from accelerate.utils import ProfileKwargs\n", + "import pandas as pd\n", + "import numpy as np\n", + "import re\n", + "import warnings\n", + "# Parse the string to extract values using regex\n", + "import re\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "\n", + "import torch\n", + "\n", + "# Initialize models with updated feature info\n", + "\n", + "\n", + "# Initialize an empty DataFrame to store the results\n", + "df_results = pd.DataFrame(\n", + " columns=[\"Model\", \"Num Features\", \"Total CUDA Memory (MB)\", \"Total CUDA Time (ms)\"]\n", + ")\n", + "\n", + "# Set up the profiler with memory profiling enabled\n", + "profile_kwargs = ProfileKwargs(\n", + " activities=[\"cpu\", \"cuda\"], profile_memory=True, record_shapes=True\n", + ")\n", + "accelerator = Accelerator(cpu=False, kwargs_handlers=[profile_kwargs])\n", + "\n", + "# Loop over different numbers of features\n", + "for n_features in range(10, 1000, 100):\n", + "\n", + " # Updated dictionaries for feature info\n", + " cat_feature_info = {\n", + " f\"cat_feature_{i}\": 10 for i in range(int(n_features/2))\n", + " } # 10 categories: 0 to 9\n", + " num_feature_info = {\n", + " f\"num_feature_{i}\": 64 for i in range(int(n_features/2))\n", + " } # 128-dimensional numerical features\n", + "\n", + " # Create random numerical and categorical features, and move to CUDA\n", + " num_features = [torch.randn(8, 64).cuda() for _ in range(int(n_features/2))]\n", + " cat_features = [\n", + " torch.randint(low=0, high=10, size=(8, 1)).cuda() for _ in range(int(n_features/2))\n", + " ]\n", + "\n", + " models = [\n", + " Mambular(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_model=64,\n", + " ).cuda(),\n", + " FTTransformer(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_model=64,\n", + " n_layers=5,\n", + " ).cuda(),\n", + " TabulaRNN(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " d_model=128,\n", + " dim_feedforward=256,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " n_layers=4,\n", + " ).cuda(),\n", + " MLP(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " layer_sizes=[512, 256, 128, 32],\n", + " ).cuda(),\n", + " ResNet(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " layer_sizes=[512, 256, 16],\n", + " ).cuda(),\n", + " MambAttention(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_state=172,\n", + 
" ).cuda(),\n", + " ]\n", + "\n", + " # Iterate over the models\n", + " for model in models:\n", + " # Prepare the model using the accelerator\n", + " #model = accelerator.prepare(model)\n", + "\n", + " # Profiling the model\n", + " with profile(profile_memory=True, record_shapes=True) as prof:\n", + " with torch.no_grad():\n", + " outputs = model(num_features, cat_features)\n", + "\n", + " # Extract key metrics from profiler\n", + " key_averages = prof.key_averages()\n", + " key_avg_output = str(key_averages.total_average())\n", + "\n", + "\n", + "\n", + " # Extract cuda_memory_usage\n", + " cuda_memory_match = re.search(r'cuda_memory_usage=(\\d+)', key_avg_output)\n", + " total_cuda_memory = int(cuda_memory_match.group(1)) / (1024 ** 2) if cuda_memory_match else 0.0 # Convert to MB\n", + "\n", + " # Extract cpu_memory_usage\n", + " cpu_memory_match = re.search(r'cpu_memory_usage=(\\d+)', key_avg_output)\n", + " total_cpu_memory = int(cpu_memory_match.group(1)) / (1024 ** 2) if cpu_memory_match else 0.0 # Convert to MB\n", + "\n", + " # Extract self_cpu_time (convert from ms)\n", + " cpu_time_match = re.search(r'self_cpu_time=([\\d.]+)ms', key_avg_output)\n", + " total_cpu_time = float(cpu_time_match.group(1)) if cpu_time_match else 0.0 # CPU time in ms\n", + "\n", + " # Extract self_cuda_time (convert from ms)\n", + " cuda_time_match = re.search(r'self_cuda_time=([\\d.]+)ms', key_avg_output)\n", + " total_cuda_time = float(cuda_time_match.group(1)) if cuda_time_match else 0.0 # CUDA time in ms\n", + "\n", + " new_row = {\n", + " \"Model\": model.__class__.__name__,\n", + " \"Num Features\": n_features,\n", + " \"Total CPU Time (ms)\": total_cpu_time,\n", + " \"Total CUDA Time (ms)\": total_cuda_time,\n", + " \"Total CPU Memory (MB)\": total_cpu_memory,\n", + " \"Total CUDA Memory (MB)\": total_cuda_memory,\n", + " }\n", + "\n", + " # Append the new row to the DataFrame using pd.concat\n", + " df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)\n", + "\n", + "# Display the profiling results\n", + "print(df_results.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GPU vs Embedding dimension -> Batch size of 32, fixed feature number of 12 to simulate average tabular dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from mambular.base_models.mambular import Mambular\n", + "from mambular.base_models.tabtransformer import TabTransformer\n", + "from mambular.base_models.ft_transformer import FTTransformer\n", + "from mambular.base_models.mlp import MLP\n", + "from mambular.base_models.resnet import ResNet\n", + "from mambular.base_models.mambattn import MambAttention\n", + "from mambular.base_models.tabularnn import TabulaRNN\n", + "from accelerate import Accelerator\n", + "from accelerate.utils import ProfileKwargs\n", + "import pandas as pd\n", + "import numpy as np\n", + "import re\n", + "import warnings\n", + "# Parse the string to extract values using regex\n", + "import re\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "\n", + "import torch\n", + "\n", + "# Initialize models with updated feature info\n", + "\n", + "\n", + "# Initialize an empty DataFrame to store the results\n", + "df_results = pd.DataFrame(\n", + " columns=[\"Model\", \"Num Layers\", \"Total CUDA Memory (MB)\", \"Total CUDA Time (ms)\"]\n", + ")\n", + "\n", + "# Set up the profiler with memory profiling enabled\n", + "profile_kwargs = ProfileKwargs(\n", 
+ " activities=[\"cpu\", \"cuda\"], profile_memory=True, record_shapes=True\n", + ")\n", + "accelerator = Accelerator(cpu=False, kwargs_handlers=[profile_kwargs])\n", + "n_features=12\n", + "\n", + "# Loop over different numbers of features\n", + "for n_layers in range(4, 24):\n", + "\n", + " # Updated dictionaries for feature info\n", + " cat_feature_info = {\n", + " f\"cat_feature_{i}\": 10 for i in range(int(n_features/2))\n", + " } # 10 categories: 0 to 9\n", + " num_feature_info = {\n", + " f\"num_feature_{i}\": 64 for i in range(int(n_features/2))\n", + " } # 128-dimensional numerical features\n", + "\n", + " # Create random numerical and categorical features, and move to CUDA\n", + " num_features = [torch.randn(32, 64).cuda() for _ in range(int(n_features/2))]\n", + " cat_features = [\n", + " torch.randint(low=0, high=10, size=(32, 1)).cuda() for _ in range(int(n_features/2))\n", + " ]\n", + "\n", + " models = [\n", + " Mambular(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_model=64,\n", + " n_layers=n_layers\n", + " ).cuda(),\n", + " FTTransformer(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " d_model=64,\n", + " n_layers=n_layers\n", + " ).cuda(),\n", + " TabulaRNN(\n", + " num_feature_info=num_feature_info,\n", + " cat_feature_info=cat_feature_info,\n", + " d_model=128,\n", + " dim_feedforward=256,\n", + " numerical_preprocessing=\"ple\",\n", + " n_bins=64,\n", + " n_layers=n_layers\n", + " ).cuda(),\n", + " ]\n", + "\n", + " # Iterate over the models\n", + " for model in models:\n", + " # Prepare the model using the accelerator\n", + " #model = accelerator.prepare(model)\n", + "\n", + " # Profiling the model\n", + " with profile(profile_memory=True, record_shapes=True) as prof:\n", + " with torch.no_grad():\n", + " outputs = model(num_features, cat_features)\n", + "\n", + " # Extract key metrics from profiler\n", + " key_averages = prof.key_averages()\n", + " key_avg_output = str(key_averages.total_average())\n", + "\n", + "\n", + "\n", + " # Extract cuda_memory_usage\n", + " cuda_memory_match = re.search(r'cuda_memory_usage=(\\d+)', key_avg_output)\n", + " total_cuda_memory = int(cuda_memory_match.group(1)) / (1024 ** 2) if cuda_memory_match else 0.0 # Convert to MB\n", + "\n", + " # Extract cpu_memory_usage\n", + " cpu_memory_match = re.search(r'cpu_memory_usage=(\\d+)', key_avg_output)\n", + " total_cpu_memory = int(cpu_memory_match.group(1)) / (1024 ** 2) if cpu_memory_match else 0.0 # Convert to MB\n", + "\n", + " # Extract self_cpu_time (convert from ms)\n", + " cpu_time_match = re.search(r'self_cpu_time=([\\d.]+)ms', key_avg_output)\n", + " total_cpu_time = float(cpu_time_match.group(1)) if cpu_time_match else 0.0 # CPU time in ms\n", + "\n", + " # Extract self_cuda_time (convert from ms)\n", + " cuda_time_match = re.search(r'self_cuda_time=([\\d.]+)ms', key_avg_output)\n", + " total_cuda_time = float(cuda_time_match.group(1)) if cuda_time_match else 0.0 # CUDA time in ms\n", + "\n", + " new_row = {\n", + " \"Model\": model.__class__.__name__,\n", + " \"Num Layers\": int(n_layers),\n", + " \"Total CPU Time (ms)\": total_cpu_time,\n", + " \"Total CUDA Time (ms)\": total_cuda_time,\n", + " \"Total CPU Memory (MB)\": total_cpu_memory,\n", + " \"Total CUDA Memory (MB)\": total_cuda_memory,\n", + " }\n", + "\n", + " # Append the new row to the DataFrame using 
pd.concat\n", + " df_results = pd.concat([df_results, pd.DataFrame([new_row])], ignore_index=True)\n", + "\n", + "# Display the profiling results\n", + "print(df_results.head())\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 2c4bcb87ccde2b85fe6eb5064045977234de922c Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Fri, 13 Sep 2024 17:36:46 +0200 Subject: [PATCH 011/132] remove citation for anonymity --- README.md | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/README.md b/README.md index 5bbfc22..8c1211f 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,6 @@ Mambular is a Python library for tabular deep learning. It includes models that - [🛠️ Installation](#️-installation) - [🚀 Usage](#-usage) - [💻 Implement Your Own Model](#-implement-your-own-model) -- [🏷️ Citation](#️-citation) -- [License](#license) # 🏃 Quickstart @@ -310,18 +308,4 @@ Here's how you can implement a custom model with Mambular: regressor.fit(X_train, y_train, max_epochs=50) ``` -# 🏷️ Citation -If you find this project useful in your research, please consider cite: -```BibTeX -@article{thielmann2024mambular, - title={Mambular: A Sequential Model for Tabular Deep Learning}, - author={Thielmann, Anton Frederik and Kumar, Manish and Weisser, Christoph and Reuter, Arik and S{\"a}fken, Benjamin and Samiee, Soheila}, - journal={arXiv preprint arXiv:2408.06291}, - year={2024} -} -``` - -# License - -The entire codebase is under MIT license. From 797b624ecb66f0b5062daf7429ed5aa942087344 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 16:13:14 +0200 Subject: [PATCH 012/132] add hpo to classifier and lss --- mambular/models/sklearn_base_classifier.py | 184 ++++++++++++++++++++- mambular/models/sklearn_base_lss.py | 183 +++++++++++++++++++- 2 files changed, 359 insertions(+), 8 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 0c7e30f..edb0012 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -11,6 +11,13 @@ import numpy as np from lightning.pytorch.callbacks import ModelSummary from sklearn.metrics import log_loss +from skopt import gp_minimize +import warnings +from ..utils.config_mapper import ( + get_search_space, + activation_mapper, + round_to_nearest_16, +) class SklearnBaseClassifier(BaseEstimator): @@ -185,7 +192,7 @@ def build_model( val_size=val_size, random_state=random_state, regression=False, - **dataloader_kwargs + **dataloader_kwargs, ) self.data_module.preprocess_data( @@ -263,7 +270,7 @@ def fit( checkpoint_path="model_checkpoints", dataloader_kwargs={}, rebuild=True, - **trainer_kwargs + **trainer_kwargs, ): """ Trains the classification model using the provided training data. Optionally, a separate validation set can be used. 
@@ -336,7 +343,7 @@ def fit( val_size=val_size, random_state=random_state, regression=False, - **dataloader_kwargs + **dataloader_kwargs, ) self.data_module.preprocess_data( @@ -377,7 +384,7 @@ def fit( checkpoint_callback, ModelSummary(max_depth=2), ], - **trainer_kwargs + **trainer_kwargs, ) self.trainer.fit(self.task_model, self.data_module) @@ -591,3 +598,172 @@ def score(self, X, y, metric=(log_loss, True)): else: predictions = self.predict(X) return metric_func(y, predictions) + + def optimize_hparams( + self, + X, + y, + X_val=None, + y_val=None, + time=100, + max_epochs=200, + prune_by_epoch=True, + prune_epoch=5, + **optimize_kwargs, + ): + """ + Optimizes hyperparameters using Bayesian optimization with optional pruning. + + Parameters + ---------- + X : array-like + Training data. + y : array-like + Training labels. + X_val, y_val : array-like, optional + Validation data and labels. + time : int + The number of optimization trials to run. + max_epochs : int + Maximum number of epochs for training. + prune_by_epoch : bool + Whether to prune based on a specific epoch (True) or the best validation loss (False). + prune_epoch : int + The specific epoch to prune by when prune_by_epoch is True. + **optimize_kwargs : dict + Additional keyword arguments passed to the fit method. + + Returns + ------- + best_hparams : list + Best hyperparameters found during optimization. + """ + + # Define the hyperparameter search space from the model config + param_names, param_space = get_search_space(self.config) + + # Initial model fitting to get the baseline validation loss + self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) + best_val_loss = float("inf") + + if X_val is not None and y_val is not None: + val_loss = self.evaluate( + X_val, y_val, metrics={"Accuracy": (accuracy_score, False)} + )["Accuracy"] + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] + + best_val_loss = val_loss + best_epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + + def _objective(hyperparams): + nonlocal best_val_loss, best_epoch_val_loss # Access across trials + + head_layer_sizes = [] + + for key, param_value in zip(param_names, hyperparams): + if key == "head_layer_size_length": + head_layer_size_length = param_value + elif key.startswith("head_layer_size_"): + head_layer_sizes.append(round_to_nearest_16(param_value)) + else: + field_type = self.config.__dataclass_fields__[key].type + + # Check if the field is a callable (e.g., activation function) + if field_type == callable and isinstance(param_value, str): + if param_value in activation_mapper: + setattr(self.config, key, activation_mapper[param_value]) + else: + raise ValueError( + f"Unknown activation function: {param_value}" + ) + else: + setattr(self.config, key, param_value) + + # Truncate or use part of head_layer_sizes based on the optimized length + if head_layer_size_length is not None: + setattr( + self.config, + "head_layer_sizes", + head_layer_sizes[:head_layer_size_length], + ) + + print(head_layer_sizes) + + # Build the model with updated hyperparameters + self.build_model( + X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs + ) + + # Dynamically set the early pruning threshold + if prune_by_epoch: + early_pruning_threshold = ( + best_epoch_val_loss * 1.5 + ) # Prune based on specific epoch loss + else: + early_pruning_threshold = ( + best_val_loss * 1.5 + ) # Prune based on the best overall validation loss + + # Initialize the model with pruning + 
self.task_model.early_pruning_threshold = early_pruning_threshold + self.task_model.pruning_epoch = prune_epoch + + # Fit the model (limit epochs for faster optimization) + self.fit( + X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False + ) + + # Retrieve the current validation loss + if X_val is not None and y_val is not None: + val_loss = self.evaluate( + X_val, y_val, metrics={"Accuracy": (accuracy_score, False)} + )["Accuracy"] + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] + + # Retrieve validation loss at the specified epoch (e.g., epoch 5) + epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + + # Update the best validation loss at the specified epoch + if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: + best_epoch_val_loss = epoch_val_loss + + # Update the best overall validation loss + if val_loss < best_val_loss: + best_val_loss = val_loss + + return val_loss + + # Perform Bayesian optimization using scikit-optimize + result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) + + # Update the model with the best-found hyperparameters + best_hparams = result.x + if "head_layer_sizes" in self.config.__dataclass_fields__: + head_layer_sizes = [] + + # Iterate over the best hyperparameters found by optimization + for key, param_value in zip(param_names, best_hparams): + if key.startswith("head_layer_size_"): + # These are the individual head layer sizes + head_layer_sizes.append(round_to_nearest_16(param_value)) + else: + # For all other config values, update normally + field_type = self.config.__dataclass_fields__[key].type + if field_type == callable and isinstance(param_value, str): + setattr(self.config, key, activation_mapper[param_value]) + else: + setattr(self.config, key, param_value) + + # After the loop, set head_layer_sizes in the config + if head_layer_sizes: + setattr(self.config, "head_layer_sizes", head_layer_sizes) + + print("Best hyperparameters found:", best_hparams) + + return best_hparams diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index ad7100f..2d9f1b1 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -32,6 +32,13 @@ StudentTDistribution, ) from lightning.pytorch.callbacks import ModelSummary +from skopt import gp_minimize +import warnings +from ..utils.config_mapper import ( + get_search_space, + activation_mapper, + round_to_nearest_16, +) class SklearnBaseLSS(BaseEstimator): @@ -203,7 +210,7 @@ def build_model( val_size=val_size, random_state=random_state, regression=False, - **dataloader_kwargs + **dataloader_kwargs, ) self.data_module.preprocess_data( @@ -282,7 +289,7 @@ def fit( checkpoint_path="model_checkpoints", distributional_kwargs=None, dataloader_kwargs={}, - **trainer_kwargs + **trainer_kwargs, ): """ Trains the regression model using the provided training data. Optionally, a separate validation set can be used. 
@@ -376,7 +383,7 @@ def fit( val_size=val_size, random_state=random_state, regression=True, - **dataloader_kwargs + **dataloader_kwargs, ) self.data_module.preprocess_data( @@ -417,7 +424,7 @@ def fit( checkpoint_callback, ModelSummary(max_depth=2), ], - **trainer_kwargs + **trainer_kwargs, ) self.trainer.fit(self.task_model, self.data_module) @@ -585,3 +592,171 @@ def score(self, X, y, metric="NLL"): predictions = self.predict(X) score = self.task_model.family.evaluate_nll(y, predictions) return score + + def optimize_hparams( + self, + X, + y, + X_val=None, + y_val=None, + time=100, + max_epochs=200, + prune_by_epoch=True, + prune_epoch=5, + **optimize_kwargs, + ): + """ + Optimizes hyperparameters using Bayesian optimization with optional pruning. + + Parameters + ---------- + X : array-like + Training data. + y : array-like + Training labels. + X_val, y_val : array-like, optional + Validation data and labels. + time : int + The number of optimization trials to run. + max_epochs : int + Maximum number of epochs for training. + prune_by_epoch : bool + Whether to prune based on a specific epoch (True) or the best validation loss (False). + prune_epoch : int + The specific epoch to prune by when prune_by_epoch is True. + **optimize_kwargs : dict + Additional keyword arguments passed to the fit method. + + Returns + ------- + best_hparams : list + Best hyperparameters found during optimization. + """ + + # Define the hyperparameter search space from the model config + param_names, param_space = get_search_space(self.config) + + # Initial model fitting to get the baseline validation loss + self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) + best_val_loss = float("inf") + + if X_val is not None and y_val is not None: + val_loss = self.score( + X_val, + y_val, + ) + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] + + best_val_loss = val_loss + best_epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + + def _objective(hyperparams): + nonlocal best_val_loss, best_epoch_val_loss # Access across trials + + head_layer_sizes = [] + + for key, param_value in zip(param_names, hyperparams): + if key == "head_layer_size_length": + head_layer_size_length = param_value + elif key.startswith("head_layer_size_"): + head_layer_sizes.append(round_to_nearest_16(param_value)) + else: + field_type = self.config.__dataclass_fields__[key].type + + # Check if the field is a callable (e.g., activation function) + if field_type == callable and isinstance(param_value, str): + if param_value in activation_mapper: + setattr(self.config, key, activation_mapper[param_value]) + else: + raise ValueError( + f"Unknown activation function: {param_value}" + ) + else: + setattr(self.config, key, param_value) + + # Truncate or use part of head_layer_sizes based on the optimized length + if head_layer_size_length is not None: + setattr( + self.config, + "head_layer_sizes", + head_layer_sizes[:head_layer_size_length], + ) + + print(head_layer_sizes) + + # Build the model with updated hyperparameters + self.build_model( + X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs + ) + + # Dynamically set the early pruning threshold + if prune_by_epoch: + early_pruning_threshold = ( + best_epoch_val_loss * 1.5 + ) # Prune based on specific epoch loss + else: + early_pruning_threshold = ( + best_val_loss * 1.5 + ) # Prune based on the best overall validation loss + + # Initialize the model with pruning + self.task_model.early_pruning_threshold = 
early_pruning_threshold + self.task_model.pruning_epoch = prune_epoch + + # Fit the model (limit epochs for faster optimization) + self.fit( + X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False + ) + + # Retrieve the current validation loss + if X_val is not None and y_val is not None: + val_loss = self.score(X_val, y_val) + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] + + # Retrieve validation loss at the specified epoch (e.g., epoch 5) + epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + + # Update the best validation loss at the specified epoch + if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: + best_epoch_val_loss = epoch_val_loss + + # Update the best overall validation loss + if val_loss < best_val_loss: + best_val_loss = val_loss + + return val_loss + + # Perform Bayesian optimization using scikit-optimize + result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) + + # Update the model with the best-found hyperparameters + best_hparams = result.x + if "head_layer_sizes" in self.config.__dataclass_fields__: + head_layer_sizes = [] + + # Iterate over the best hyperparameters found by optimization + for key, param_value in zip(param_names, best_hparams): + if key.startswith("head_layer_size_"): + # These are the individual head layer sizes + head_layer_sizes.append(round_to_nearest_16(param_value)) + else: + # For all other config values, update normally + field_type = self.config.__dataclass_fields__[key].type + if field_type == callable and isinstance(param_value, str): + setattr(self.config, key, activation_mapper[param_value]) + else: + setattr(self.config, key, param_value) + + # After the loop, set head_layer_sizes in the config + if head_layer_sizes: + setattr(self.config, "head_layer_sizes", head_layer_sizes) + + print("Best hyperparameters found:", best_hparams) + + return best_hparams From 5b056646a8ec2a2f0c10cbf9e95edacb39474e77 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 16:13:35 +0200 Subject: [PATCH 013/132] minor pooling error in mambular --- mambular/base_models/mambular.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index d362b8a..9cd6134 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -213,7 +213,7 @@ def forward(self, num_features, cat_features): x, _ = torch.max(x, dim=1) elif self.pooling_method == "sum": x = torch.sum(x, dim=1) - elif self.pooling_method == "cls_token": + elif self.pooling_method == "cls": x = x[:, -1] elif self.pooling_method == "last": x = x[:, -1] From 14663e07f1fca67f85eda7ff7027c88bdda7984b Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 17:20:24 +0200 Subject: [PATCH 014/132] add conv layer to rnn for positional invariance --- mambular/arch_utils/rnn_utils.py | 141 +++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 mambular/arch_utils/rnn_utils.py diff --git a/mambular/arch_utils/rnn_utils.py b/mambular/arch_utils/rnn_utils.py new file mode 100644 index 0000000..e505043 --- /dev/null +++ b/mambular/arch_utils/rnn_utils.py @@ -0,0 +1,141 @@ +import torch +import torch.nn as nn + + +class ConvRNN(nn.Module): + def __init__( + self, + model_type: str, # 'RNN', 'LSTM', or 'GRU' + input_size: int, # Number of input features (128 in your case) + hidden_size: int, # Number of hidden units in RNN layers + num_layers: int, # Number of 
RNN layers + bidirectional: bool, # Whether RNN is bidirectional + rnn_dropout: float, # Dropout rate for RNN + bias: bool, # Bias for RNN + conv_bias: bool, # Bias for Conv1d + rnn_activation: str = None, # Only for RNN + d_conv: int = 4, # Kernel size for Conv1d + residuals: bool = False, # Whether to use residual connections + ): + super(ConvRNN, self).__init__() + + # Choose RNN layer based on model_type + rnn_layer = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}[model_type] + + self.input_size = input_size # Number of input features (128 in your case) + self.hidden_size = hidden_size # Number of hidden units in RNN + self.num_layers = num_layers # Number of RNN layers + self.bidirectional = bidirectional # Whether RNN is bidirectional + self.rnn_type = model_type + self.residuals = residuals + + # Convolutional layers + self.convs = nn.ModuleList() + + if self.residuals: + self.residual_matrix = nn.ParameterList( + [ + nn.Parameter(torch.randn(hidden_size, hidden_size)) + for _ in range(num_layers) + ] + ) + + # First Conv1d layer uses input_size + self.convs.append( + nn.Conv1d( + in_channels=self.input_size, # Input size for first layer + out_channels=self.input_size, # Output channels (128) + kernel_size=d_conv, + padding=d_conv - 1, # Padding to maintain sequence length + bias=conv_bias, + groups=self.input_size, # Depthwise convolution, each channel independent + ) + ) + + # Subsequent Conv1d layers use hidden_size as input + for i in range(self.num_layers - 1): + self.convs.append( + nn.Conv1d( + in_channels=self.hidden_size, # Hidden size for subsequent layers + out_channels=self.hidden_size, # Output channels + kernel_size=d_conv, + padding=d_conv - 1, # Padding to maintain sequence length + bias=conv_bias, + groups=self.hidden_size, # Depthwise convolution + ) + ) + + # Initialize the RNN layers + self.rnns = nn.ModuleList() + for i in range(self.num_layers): + if model_type == "RNN": + rnn = rnn_layer( + input_size=( + self.input_size if i == 0 else self.hidden_size + ), # First layer uses input_size + hidden_size=self.hidden_size, + num_layers=1, # One RNN layer at a time + bidirectional=self.bidirectional, + batch_first=True, + dropout=rnn_dropout if i < self.num_layers - 1 else 0, + bias=bias, + nonlinearity=( + rnn_activation if model_type == "RNN" else None + ), # Only RNN uses nonlinearity + ) + else: # For LSTM or GRU + rnn = rnn_layer( + input_size=( + self.input_size if i == 0 else self.hidden_size + ), # First layer uses input_size + hidden_size=self.hidden_size, + num_layers=1, # One RNN layer at a time + bidirectional=self.bidirectional, + batch_first=True, + dropout=rnn_dropout if i < self.num_layers - 1 else 0, + bias=bias, + ) + self.rnns.append(rnn) + + def forward(self, x): + """ + Forward pass through Conv-RNN layers. + + Parameters + ----------- + x : torch.Tensor + Input tensor of shape (batch_size, seq_length, input_size). + + Returns + -------- + output : torch.Tensor + Output tensor after passing through Conv-RNN layers. 
+ """ + _, L, _ = x.shape + if self.residuals: + residual = x + + # Loop through the RNN layers and apply 1D convolution before each + for i in range(self.num_layers): + # Transpose to (batch_size, input_size, seq_length) for Conv1d + x = x.transpose(1, 2) + + # Apply the 1D convolution + x = self.convs[i](x)[:, :, :L] + + # Transpose back to (batch_size, seq_length, input_size) + x = x.transpose(1, 2) + + # Pass through the RNN layer + x, _ = self.rnns[i](x) + + # Residual connection with learnable matrix + if self.residuals: + if i < self.num_layers and i > 0: + residual_proj = torch.matmul(residual, self.residual_matrix[i]) + x = x + residual_proj + + # Update residual for next layer + residual = x + + return x, _ From 4bd83e9de5d1c9eed3d7d469b8a0a9cc7e60921b Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 17:20:37 +0200 Subject: [PATCH 015/132] add convrnn to base class --- mambular/base_models/tabularnn.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index 9f3c54f..4433c7e 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -4,6 +4,7 @@ from ..configs.tabularnn_config import DefaultTabulaRNNConfig from .basemodel import BaseModel from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.rnn_utils import ConvRNN from ..arch_utils.normalization_layers import ( RMSNorm, LayerNorm, @@ -62,20 +63,18 @@ def __init__( else: self.norm_f = None - rnn_layer = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}[config.model_type] - self.rnn = rnn_layer( + self.rnn = ConvRNN( + model_type=self.hparams.get("model_type", config.model_type), input_size=self.hparams.get("d_model", config.d_model), hidden_size=self.hparams.get("dim_feedforward", config.dim_feedforward), num_layers=self.hparams.get("n_layers", config.n_layers), bidirectional=self.hparams.get("bidirectional", config.bidirectional), - batch_first=True, - dropout=self.hparams.get("rnn_dropout", config.rnn_dropout), + rnn_dropout=self.hparams.get("rnn_dropout", config.rnn_dropout), bias=self.hparams.get("bias", config.bias), - nonlinearity=( - self.hparams.get("rnn_activation", config.rnn_activation) - if config.model_type == "RNN" - else None - ), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + rnn_activation=self.hparams.get("rnn_activation", config.rnn_activation), + d_conv=self.hparams.get("d_conv", config.d_conv), + residuals=self.hparams.get("residuals", config.residuals), ) self.embedding_layer = EmbeddingLayer( From b3327a19b332a21092b5bb303d24d50b0db76d2d Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 17:20:45 +0200 Subject: [PATCH 016/132] adjust config of RNN --- mambular/configs/tabularnn_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mambular/configs/tabularnn_config.py b/mambular/configs/tabularnn_config.py index 700181c..9ca6636 100644 --- a/mambular/configs/tabularnn_config.py +++ b/mambular/configs/tabularnn_config.py @@ -81,3 +81,6 @@ class DefaultTabulaRNNConfig: numerical_embedding: str = "ple" bidirectional: bool = False cat_encoding: str = "int" + d_conv: int = 4 + conv_bias: bool = True + residuals: bool = False From 3d76805ef1cc92e216c682781368d72728c9d992 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 18:05:14 +0200 Subject: [PATCH 017/132] include optimizer args in taskmodel --- mambular/base_models/lightning_wrapper.py | 24 +++++++++++++++++------ 1 file changed, 18 
insertions(+), 6 deletions(-) diff --git a/mambular/base_models/lightning_wrapper.py b/mambular/base_models/lightning_wrapper.py index dc45472..2bfc25b 100644 --- a/mambular/base_models/lightning_wrapper.py +++ b/mambular/base_models/lightning_wrapper.py @@ -39,9 +39,12 @@ def __init__( loss_fct: callable = None, early_pruning_threshold=None, pruning_epoch=5, + optimizer_type: str = "Adam", + optimizer_args: dict = None, **kwargs, ): super().__init__() + self.optimizer_type = optimizer_type self.num_classes = num_classes self.lss = lss self.family = family @@ -50,6 +53,12 @@ def __init__( self.pruning_epoch = pruning_epoch self.val_losses = [] + self.optimizer_params = { + k.replace("optimizer_", ""): v + for k, v in optimizer_args.items() + if k.startswith("optimizer_") + } + if lss: pass else: @@ -345,17 +354,20 @@ def epoch_val_loss_at(self, epoch): def configure_optimizers(self): """ Sets up the model's optimizer and learning rate scheduler based on the configurations provided. - - Returns - ------- - dict - A dictionary containing the optimizer and lr_scheduler configurations. + The optimizer type can be chosen by the user (Adam, SGD, etc.). """ - optimizer = torch.optim.Adam( + # Dynamically choose the optimizer based on the passed optimizer_type + optimizer_class = getattr(torch.optim, self.optimizer_type) + + # Initialize the optimizer with the chosen class and parameters + optimizer = optimizer_class( self.base_model.parameters(), lr=self.lr, weight_decay=self.weight_decay, + **self.optimizer_params, # Pass any additional optimizer-specific parameters ) + + # Define learning rate scheduler scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, From 81ae17f84dab508ad76715097d785542d9fab63c Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 18:08:24 +0200 Subject: [PATCH 018/132] adapt sklearn classes to allow optimizer kwargs --- mambular/models/sklearn_base_classifier.py | 21 ++++++++++++++++++--- mambular/models/sklearn_base_lss.py | 20 ++++++++++++++++++-- mambular/models/sklearn_base_regressor.py | 18 +++++++++++++++++- 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index edb0012..fcfb0e2 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -35,7 +35,9 @@ def __init__(self, model, config, **kwargs): ] self.config_kwargs = { - k: v for k, v in kwargs.items() if k not in preprocessor_arg_names + k: v + for k, v in kwargs.items() + if k not in self.preprocessor_arg_names and not k.startswith("optimizer") } self.config = config(**self.config_kwargs) @@ -45,6 +47,8 @@ def __init__(self, model, config, **kwargs): self.preprocessor = Preprocessor(**preprocessor_kwargs) self.task_model = None + self.base_model = model + self.built = False # Raise a warning if task is set to 'classification' if preprocessor_kwargs.get("task") == "regression": @@ -53,8 +57,15 @@ def __init__(self, model, config, **kwargs): UserWarning, ) - self.base_model = model - self.built = False + self.optimizer_type = kwargs.get("optimizer_type", "adam") + + self.optimizer_kwargs = { + k: v + for k, v in kwargs.items() + if k + not in ["lr", "weight_decay", "patience", "lr_patience", "optimizer_type"] + and k.startswith("optimizer_") + } def get_params(self, deep=True): """ @@ -211,6 +222,8 @@ def build_model( lr_patience=lr_patience, lr_factor=factor, weight_decay=weight_decay, + optimizer_type=self.optimizer_type, + 
optimizer_args=self.optimizer_kwargs, ) self.built = True @@ -362,6 +375,8 @@ def fit( lr_patience=lr_patience, lr_factor=factor, weight_decay=weight_decay, + optimizer_type=self.optimizer_type, + optimizer_args=self.optimizer_kwargs, ) early_stop_callback = EarlyStopping( diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index 2d9f1b1..cebfe26 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -56,7 +56,9 @@ def __init__(self, model, config, **kwargs): ] self.config_kwargs = { - k: v for k, v in kwargs.items() if k not in preprocessor_arg_names + k: v + for k, v in kwargs.items() + if k not in self.preprocessor_arg_names and not k.startswith("optimizer") } self.config = config(**self.config_kwargs) @@ -66,6 +68,8 @@ def __init__(self, model, config, **kwargs): self.preprocessor = Preprocessor(**preprocessor_kwargs) self.task_model = None + self.base_model = model + self.built = False # Raise a warning if task is set to 'classification' if preprocessor_kwargs.get("task") == "classification": @@ -74,7 +78,15 @@ def __init__(self, model, config, **kwargs): UserWarning, ) - self.base_model = model + self.optimizer_type = kwargs.get("optimizer_type", "adam") + + self.optimizer_kwargs = { + k: v + for k, v in kwargs.items() + if k + not in ["lr", "weight_decay", "patience", "lr_patience", "optimizer_type"] + and k.startswith("optimizer_") + } def get_params(self, deep=True): """ @@ -229,6 +241,8 @@ def build_model( lr_patience=lr_patience, lr_factor=factor, weight_decay=weight_decay, + optimizer_type=self.optimizer_type, + optimizer_args=self.optimizer_kwargs, ) self.built = True @@ -402,6 +416,8 @@ def fit( lr_factor=factor, weight_decay=weight_decay, lss=True, + optimizer_type=self.optimizer_type, + optimizer_args=self.optimizer_kwargs, ) early_stop_callback = EarlyStopping( diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 85a2c7c..23a96b6 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -36,7 +36,9 @@ def __init__(self, model, config, **kwargs): ] self.config_kwargs = { - k: v for k, v in kwargs.items() if k not in self.preprocessor_arg_names + k: v + for k, v in kwargs.items() + if k not in self.preprocessor_arg_names and not k.startswith("optimizer") } self.config = config(**self.config_kwargs) @@ -56,6 +58,16 @@ def __init__(self, model, config, **kwargs): UserWarning, ) + self.optimizer_type = kwargs.get("optimizer_type", "adam") + + self.optimizer_kwargs = { + k: v + for k, v in kwargs.items() + if k + not in ["lr", "weight_decay", "patience", "lr_patience", "optimizer_type"] + and k.startswith("optimizer_") + } + def get_params(self, deep=True): """ Get parameters for this estimator. 
@@ -208,6 +220,8 @@ def build_model( lr_patience=lr_patience, lr_factor=factor, weight_decay=weight_decay, + optimizer_type=self.optimizer_type, + optimizer_args=self.optimizer_kwargs, ) self.built = True @@ -354,6 +368,8 @@ def fit( lr_patience=lr_patience, lr_factor=factor, weight_decay=weight_decay, + optimizer_type=self.optimizer_type, + optimizer_args=self.optimizer_kwargs, ) else: From bce26cbe8ce9545e967bd5f241336958fd5e2355 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 21:06:22 +0200 Subject: [PATCH 019/132] adjust default optimizer --- mambular/models/sklearn_base_classifier.py | 2 +- mambular/models/sklearn_base_lss.py | 2 +- mambular/models/sklearn_base_regressor.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index fcfb0e2..1e09dd7 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -57,7 +57,7 @@ def __init__(self, model, config, **kwargs): UserWarning, ) - self.optimizer_type = kwargs.get("optimizer_type", "adam") + self.optimizer_type = kwargs.get("optimizer_type", "Adam") self.optimizer_kwargs = { k: v diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index cebfe26..e057391 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -78,7 +78,7 @@ def __init__(self, model, config, **kwargs): UserWarning, ) - self.optimizer_type = kwargs.get("optimizer_type", "adam") + self.optimizer_type = kwargs.get("optimizer_type", "Adam") self.optimizer_kwargs = { k: v diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 23a96b6..afc4e98 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -58,7 +58,7 @@ def __init__(self, model, config, **kwargs): UserWarning, ) - self.optimizer_type = kwargs.get("optimizer_type", "adam") + self.optimizer_type = kwargs.get("optimizer_type", "Adam") self.optimizer_kwargs = { k: v From 28d56b6e7bbdcec0886513ea7a53651228d4dbc1 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Sat, 28 Sep 2024 23:08:53 +0200 Subject: [PATCH 020/132] include skopt in requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 35e749d..f5783ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ scikit_learn torch torchmetrics setuptools -properscoring \ No newline at end of file +properscoring +scikit-optimize \ No newline at end of file From 472394224876075c0d9eed9fccc3680b03be4c84 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 11:44:01 +0200 Subject: [PATCH 021/132] include pscan and original mamba_ssm triton version --- mambular/arch_utils/get_norm_fn.py | 49 ++++++ mambular/arch_utils/mamba_utils/__init__.py | 0 .../{ => mamba_utils}/mamba_arch.py | 108 +++++++------ .../arch_utils/mamba_utils/mamba_original.py | 152 ++++++++++++++++++ .../arch_utils/mamba_utils/mambattn_arch.py | 118 ++++++++++++++ mambular/arch_utils/mambattn_arch.py | 138 ---------------- mambular/base_models/mambattn.py | 66 +------- mambular/base_models/mambular.py | 69 +------- mambular/configs/mambattention_config.py | 3 + mambular/configs/mambular_config.py | 6 + 10 files changed, 396 insertions(+), 313 deletions(-) create mode 100644 mambular/arch_utils/get_norm_fn.py create mode 100644 
mambular/arch_utils/mamba_utils/__init__.py rename mambular/arch_utils/{ => mamba_utils}/mamba_arch.py (81%) create mode 100644 mambular/arch_utils/mamba_utils/mamba_original.py create mode 100644 mambular/arch_utils/mamba_utils/mambattn_arch.py delete mode 100644 mambular/arch_utils/mambattn_arch.py diff --git a/mambular/arch_utils/get_norm_fn.py b/mambular/arch_utils/get_norm_fn.py new file mode 100644 index 0000000..42bf83b --- /dev/null +++ b/mambular/arch_utils/get_norm_fn.py @@ -0,0 +1,49 @@ +from .normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) + + +def get_normalization_layer(config): + """ + Function to return the appropriate normalization layer based on the configuration. + + Parameters: + ----------- + config : DefaultMambularConfig + Configuration object containing the parameters for the model including normalization. + + Returns: + -------- + nn.Module: + The normalization layer as per the config. + + Raises: + ------- + ValueError: + If an unsupported normalization layer is specified in the config. + """ + + norm_layer = config.norm + + d_model = config.d_model + layer_norm_eps = config.layer_norm_eps + + if norm_layer == "RMSNorm": + return RMSNorm(d_model, eps=layer_norm_eps) + elif norm_layer == "LayerNorm": + return LayerNorm(d_model, eps=layer_norm_eps) + elif norm_layer == "BatchNorm": + return BatchNorm(d_model, eps=layer_norm_eps) + elif norm_layer == "InstanceNorm": + return InstanceNorm(d_model, eps=layer_norm_eps) + elif norm_layer == "GroupNorm": + return GroupNorm(1, d_model, eps=layer_norm_eps) + elif norm_layer == "LearnableLayerScaling": + return LearnableLayerScaling(d_model) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") diff --git a/mambular/arch_utils/mamba_utils/__init__.py b/mambular/arch_utils/mamba_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mambular/arch_utils/mamba_arch.py b/mambular/arch_utils/mamba_utils/mamba_arch.py similarity index 81% rename from mambular/arch_utils/mamba_arch.py rename to mambular/arch_utils/mamba_utils/mamba_arch.py index 537b8e5..5368328 100644 --- a/mambular/arch_utils/mamba_arch.py +++ b/mambular/arch_utils/mamba_utils/mamba_arch.py @@ -2,7 +2,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .normalization_layers import ( +from ..normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, @@ -10,6 +10,7 @@ InstanceNorm, GroupNorm, ) +from ..get_norm_fn import get_normalization_layer ### Heavily inspired and mostly taken from https://github.com/alxndrTL/mamba.py @@ -25,55 +26,36 @@ class Mamba(nn.Module): def __init__( self, - d_model=32, - n_layers=8, - expand_factor=2, - bias=False, - d_conv=8, - conv_bias=True, - dropout=0.01, - dt_rank="auto", - d_state=16, - dt_scale=1.0, - dt_init="random", - dt_max=0.1, - dt_min=1e-03, - dt_init_floor=1e-04, - norm=RMSNorm, - activation=F.silu, - bidirectional=False, - use_learnable_interaction=False, - layer_norm_eps=1e-05, - AD_weight_decay=False, - BC_layer_norm=True, + config, ): super().__init__() self.layers = nn.ModuleList( [ ResidualBlock( - d_model, - expand_factor, - bias, - d_conv, - conv_bias, - dropout, - dt_rank, - d_state, - dt_scale, - dt_init, - dt_max, - dt_min, - dt_init_floor, - norm, - activation, - bidirectional, - use_learnable_interaction, - layer_norm_eps, - AD_weight_decay, - BC_layer_norm, + d_model=config.d_model, + expand_factor=config.expand_factor, + bias=config.bias, + 
d_conv=config.d_conv, + conv_bias=config.conv_bias, + dropout=config.dropout, + dt_rank=config.dt_rank, + d_state=config.d_state, + dt_scale=config.dt_scale, + dt_init=config.dt_init, + dt_max=config.dt_max, + dt_min=config.dt_min, + dt_init_floor=config.dt_init_floor, + norm=get_normalization_layer(config), + activation=config.activation, + bidirectional=config.bidirectional, + use_learnable_interaction=config.use_learnable_interaction, + layer_norm_eps=config.layer_norm_eps, + AD_weight_decay=config.AD_weight_decay, + BC_layer_norm=config.BC_layer_norm, + use_pscan=config.use_pscan, ) - for _ in range(n_layers) + for _ in range(config.n_layers) ] ) @@ -114,6 +96,7 @@ def __init__( layer_norm_eps=1e-05, AD_weight_decay=False, BC_layer_norm=False, + use_pscan=False, ): super().__init__() @@ -132,7 +115,7 @@ def __init__( f"Invalid normalization layer: {norm.__name__}. " f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" ) - elif isinstance(norm, str) and norm not in self.VALID_NORMALIZATION_LAYERS: + elif isinstance(norm, str) and norm not in VALID_NORMALIZATION_LAYERS: raise ValueError( f"Invalid normalization layer: {norm}. " f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" @@ -161,8 +144,9 @@ def __init__( layer_norm_eps=layer_norm_eps, AD_weight_decay=AD_weight_decay, BC_layer_norm=BC_layer_norm, + use_pscan=use_pscan, ) - self.norm = norm(d_model, eps=layer_norm_eps) + self.norm = norm def forward(self, x): output = self.layers(self.norm(x)) + x @@ -204,8 +188,26 @@ def __init__( layer_norm_eps=1e-05, AD_weight_decay=False, BC_layer_norm=False, + use_pscan=False, ): super().__init__() + + self.use_pscan = use_pscan + + if self.use_pscan: + try: + from mambapy.pscan import pscan + + self.pscan = pscan # Store the imported pscan function + except ImportError: + self.pscan = None # Set to None if pscan is not available + print( + "The 'mambapy' package is not installed. 
Please install it by running:\n" + "pip install mambapy" + ) + else: + self.pscan = None + self.d_inner = d_model * expand_factor self.bidirectional = bidirectional self.use_learnable_interaction = use_learnable_interaction @@ -390,14 +392,18 @@ def selective_scan_seq(self, x, delta, A, B, C, D): BX = deltaB * (x.unsqueeze(-1)) - h = torch.zeros(x.size(0), self.d_inner, self.d_state, device=deltaA.device) - hs = [] + if self.use_pscan: + hs = self.pscan(deltaA, BX) + else: + + h = torch.zeros(x.size(0), self.d_inner, self.d_state, device=deltaA.device) + hs = [] - for t in range(0, L): - h = deltaA[:, t] * h + BX[:, t] - hs.append(h) + for t in range(0, L): + h = deltaA[:, t] * h + BX[:, t] + hs.append(h) - hs = torch.stack(hs, dim=1) + hs = torch.stack(hs, dim=1) y = (hs @ C.unsqueeze(-1)).squeeze(3) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py new file mode 100644 index 0000000..2c2a6f2 --- /dev/null +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -0,0 +1,152 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) + +try: + from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn +except ImportError: + RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None + +try: + from mamba_ssm import Mamba +except ImportError: + Mamba = None + + +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(n_residuals_per_layer * n_layer) + + +class MambaOriginal(nn.Module): + def __init__(self, config): + super().__init__() + + VALID_NORMALIZATION_LAYERS = { + "RMSNorm": RMSNorm, + "LayerNorm": LayerNorm, + "LearnableLayerScaling": LearnableLayerScaling, + "BatchNorm": BatchNorm, + "InstanceNorm": InstanceNorm, + "GroupNorm": GroupNorm, + } + + # Get normalization layer from config + norm = config.norm + if isinstance(norm, str) and norm in VALID_NORMALIZATION_LAYERS: + self.norm_f = VALID_NORMALIZATION_LAYERS[norm]( + config.d_model, eps=config.layer_norm_eps + ) + else: + raise ValueError( + f"Invalid normalization layer: {norm}. " + f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" + ) + + # Initialize Mamba layers based on the configuration + self.layers = nn.ModuleList( + [ + Mamba( + d_model=config.d_model, + d_state=config.d_state, + d_conv=config.d_conv, + expand=config.expand_factor, + dt_rank=config.dt_rank, + dt_min=config.dt_min, + dt_max=config.dt_max, + dt_init=config.dt_init, + dt_scale=config.dt_scale, + dt_init_floor=config.dt_init_floor, + conv_bias=config.conv_bias, + bias=config.bias, + use_fast_path=True, # Fused kernel options + layer_idx=i, + ) + for i in range(config.n_layers) + ] + ) + + # Apply weight initialization + self.apply( + lambda m: _init_weights( + m, + n_layer=config.n_layers, + n_residuals_per_layer=1 if config.d_intermediate == 0 else 2, + ) + ) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache( + batch_size, max_seqlen, dtype=dtype, **kwargs + ) + for i, layer in enumerate(self.layers) + } + + def forward(self, input_ids, inference_params=None, **mixer_kwargs): + hidden_states = self.embedding(input_ids) + residual = None + + for layer in self.layers: + hidden_states, residual = layer( + hidden_states, + residual, + inference_params=inference_params, + **mixer_kwargs, + ) + + if not self.fused_add_norm: + residual = ( + (hidden_states + residual) if residual is not None else hidden_states + ) + hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype)) + else: + hidden_states = layer_norm_fn( + hidden_states, + self.norm_f.weight, + self.norm_f.bias, + eps=self.norm_f.eps, + residual=residual, + prenorm=False, + residual_in_fp32=self.residual_in_fp32, + is_rms_norm=isinstance(self.norm_f, RMSNorm), + ) + + return hidden_states diff --git a/mambular/arch_utils/mamba_utils/mambattn_arch.py b/mambular/arch_utils/mamba_utils/mambattn_arch.py new file mode 100644 index 0000000..c15e699 --- /dev/null +++ b/mambular/arch_utils/mamba_utils/mambattn_arch.py @@ -0,0 +1,118 @@ +import torch.nn as nn +from .mamba_arch import ResidualBlock +from ..get_norm_fn import get_normalization_layer + + +class MambAttn(nn.Module): + """Mamba model composed of alternating MambaBlocks and Attention layers. 
+ + Attributes: + config (MambaConfig): Configuration object for the Mamba model. + layers (nn.ModuleList): List of alternating ResidualBlock (Mamba layers) and attention layers constituting the model. + """ + + def __init__( + self, + config, + ): + super().__init__() + + # Define Mamba and Attention layers alternation + self.layers = nn.ModuleList() + + total_blocks = ( + config.n_layers + config.n_attention_layers + ) # Total blocks to be created + attention_count = 0 + + for i in range(total_blocks): + if (i + 1) % ( + config.n_mamba_per_attention + 1 + ) == 0: # Insert attention layer after N Mamba layers + self.layers.append( + nn.MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.n_heads, + dropout=config.attn_dropout, + ) + ) + attention_count += 1 + else: + self.layers.append( + ResidualBlock( + d_model=config.d_model, + expand_factor=config.expand_factor, + bias=config.bias, + d_conv=config.d_conv, + conv_bias=config.conv_bias, + dropout=config.dropout, + dt_rank=config.dt_rank, + d_state=config.d_state, + dt_scale=config.dt_scale, + dt_init=config.dt_init, + dt_max=config.dt_max, + dt_min=config.dt_min, + dt_init_floor=config.dt_init_floor, + norm=get_normalization_layer(config), + activation=config.activation, + bidirectional=config.bidirectional, + use_learnable_interaction=config.use_learnable_interaction, + layer_norm_eps=config.layer_norm_eps, + AD_weight_decay=config.AD_weight_decay, + BC_layer_norm=config.BC_layer_norm, + use_pscan=config.use_pscan, + ) + ) + + # Check the type of the last layer and append the desired one if necessary + if config.last_layer == "attn": + if not isinstance(self.layers[-1], nn.MultiheadAttention): + self.layers.append( + nn.MultiheadAttention( + embed_dim=config.d_model, + num_heads=config.n_heads, + dropout=config.dropout, + ) + ) + else: + if not isinstance(self.layers[-1], ResidualBlock): + self.layers.append( + ResidualBlock( + d_model=config.d_model, + expand_factor=config.expand_factor, + bias=config.bias, + d_conv=config.d_conv, + conv_bias=config.conv_bias, + dropout=config.dropout, + dt_rank=config.dt_rank, + d_state=config.d_state, + dt_scale=config.dt_scale, + dt_init=config.dt_init, + dt_max=config.dt_max, + dt_min=config.dt_min, + dt_init_floor=config.dt_init_floor, + norm=get_normalization_layer(config), + activation=config.activation, + bidirectional=config.bidirectional, + use_learnable_interaction=config.use_learnable_interaction, + layer_norm_eps=config.layer_norm_eps, + AD_weight_decay=config.AD_weight_decay, + BC_layer_norm=config.BC_layer_norm, + use_pscan=config.use_pscan, + ) + ) + + def forward(self, x): + for layer in self.layers: + if isinstance(layer, nn.MultiheadAttention): + # If it's an attention layer, handle input shape (seq_len, batch, embed_dim) + x = x.transpose( + 0, 1 + ) # Switch to (seq_len, batch, embed_dim) for attention + x, _ = layer(x, x, x) + x = x.transpose(0, 1) # Switch back to (batch, seq_len, embed_dim) + else: + # Otherwise, pass through Mamba block + x = layer(x) + + return x diff --git a/mambular/arch_utils/mambattn_arch.py b/mambular/arch_utils/mambattn_arch.py deleted file mode 100644 index 33eae60..0000000 --- a/mambular/arch_utils/mambattn_arch.py +++ /dev/null @@ -1,138 +0,0 @@ -import math -import torch -import torch.nn as nn -import torch.nn.functional as F -from .normalization_layers import RMSNorm -from .mamba_arch import ResidualBlock - - -class MambAttn(nn.Module): - """Mamba model composed of alternating MambaBlocks and Attention layers. 
- - Attributes: - config (MambaConfig): Configuration object for the Mamba model. - layers (nn.ModuleList): List of alternating ResidualBlock (Mamba layers) and attention layers constituting the model. - """ - - def __init__( - self, - d_model=32, - n_layers=8, - n_attention_layers=1, # Introduce attention layer count - n_mamba_per_attention=1, # Ratio of Mamba layers to attention layers - n_heads=4, # Number of attention heads - expand_factor=2, - bias=False, - d_conv=8, - conv_bias=True, - dropout=0.0, - attn_dropout=0.1, - dt_rank="auto", - d_state=16, - dt_scale=1.0, - dt_init="random", - dt_max=0.1, - last_layer="attn", # Define the desired last layer type - dt_min=1e-03, - dt_init_floor=1e-04, - norm=RMSNorm, - activation=F.silu, - bidirectional=False, - use_learnable_interaction=False, - layer_norm_eps=1e-05, - AD_weight_decay=False, - BC_layer_norm=True, - ): - super().__init__() - - # Define Mamba and Attention layers alternation - self.layers = nn.ModuleList() - - total_blocks = n_layers + n_attention_layers # Total blocks to be created - attention_count = 0 - - for i in range(total_blocks): - if (i + 1) % ( - n_mamba_per_attention + 1 - ) == 0: # Insert attention layer after N Mamba layers - self.layers.append( - nn.MultiheadAttention( - embed_dim=d_model, num_heads=n_heads, dropout=attn_dropout - ) - ) - attention_count += 1 - else: - self.layers.append( - ResidualBlock( - d_model, - expand_factor, - bias, - d_conv, - conv_bias, - dropout, - dt_rank, - d_state, - dt_scale, - dt_init, - dt_max, - dt_min, - dt_init_floor, - norm, - activation, - bidirectional, - use_learnable_interaction, - layer_norm_eps, - AD_weight_decay, - BC_layer_norm, - ) - ) - - # Check the type of the last layer and append the desired one if necessary - if last_layer == "attn": - if not isinstance(self.layers[-1], nn.MultiheadAttention): - self.layers.append( - nn.MultiheadAttention( - embed_dim=d_model, num_heads=n_heads, dropout=dropout - ) - ) - else: - if not isinstance(self.layers[-1], ResidualBlock): - self.layers.append( - ResidualBlock( - d_model, - expand_factor, - bias, - d_conv, - conv_bias, - dropout, - dt_rank, - d_state, - dt_scale, - dt_init, - dt_max, - dt_min, - dt_init_floor, - norm, - activation, - bidirectional, - use_learnable_interaction, - layer_norm_eps, - AD_weight_decay, - BC_layer_norm, - ) - ) - - def forward(self, x): - for layer in self.layers: - if isinstance(layer, nn.MultiheadAttention): - # If it's an attention layer, handle input shape (seq_len, batch, embed_dim) - x = x.transpose( - 0, 1 - ) # Switch to (seq_len, batch, embed_dim) for attention - x, _ = layer(x, x, x) - x = x.transpose(0, 1) # Switch back to (batch, seq_len, embed_dim) - else: - # Otherwise, pass through Mamba block - x = layer(x) - - return x diff --git a/mambular/base_models/mambattn.py b/mambular/base_models/mambattn.py index 7acd0b7..f96daf1 100644 --- a/mambular/base_models/mambattn.py +++ b/mambular/base_models/mambattn.py @@ -1,15 +1,8 @@ import torch import torch.nn as nn -from ..arch_utils.mambattn_arch import MambAttn +from ..arch_utils.mamba_utils.mambattn_arch import MambAttn from ..arch_utils.mlp_utils import MLP -from ..arch_utils.normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) +from ..arch_utils.get_norm_fn import get_normalization_layer from ..configs.mambattention_config import DefaultMambAttentionConfig from .basemodel import BaseModel from ..arch_utils.embedding_layer import EmbeddingLayer @@ -88,60 +81,9 @@ 
def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.mamba = MambAttn( - d_model=self.hparams.get("d_model", config.d_model), - n_layers=self.hparams.get("n_layers", config.n_layers), - expand_factor=self.hparams.get("expand_factor", config.expand_factor), - bias=self.hparams.get("bias", config.bias), - d_conv=self.hparams.get("d_conv", config.d_conv), - conv_bias=self.hparams.get("conv_bias", config.conv_bias), - dropout=self.hparams.get("dropout", config.dropout), - dt_rank=self.hparams.get("dt_rank", config.dt_rank), - d_state=self.hparams.get("d_state", config.d_state), - dt_scale=self.hparams.get("dt_scale", config.dt_scale), - dt_init=self.hparams.get("dt_init", config.dt_init), - dt_max=self.hparams.get("dt_max", config.dt_max), - dt_min=self.hparams.get("dt_min", config.dt_min), - dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), - norm=globals()[self.hparams.get("norm", config.norm)], - activation=self.hparams.get("activation", config.activation), - bidirectional=self.hparams.get("bidiretional", config.bidirectional), - use_learnable_interaction=self.hparams.get( - "use_learnable_interactions", config.use_learnable_interaction - ), - AD_weight_decay=self.hparams.get("AB_weight_decay", config.AD_weight_decay), - BC_layer_norm=self.hparams.get("AB_layer_norm", config.BC_layer_norm), - layer_norm_eps=self.hparams.get("layer_norm_eps", config.layer_norm_eps), - ) + self.mamba = MambAttn(config) norm_layer = self.hparams.get("norm", config.norm) - if norm_layer == "RMSNorm": - self.norm_f = RMSNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm( - 1, - self.hparams.get("d_model", config.d_model), - eps=config.layer_norm_eps, - ) - elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling( - self.hparams.get("d_model", config.d_model) - ) - else: - raise ValueError(f"Unsupported normalization layer: {norm_layer}") + self.norm_f = get_normalization_layer(config) self.embedding_layer = EmbeddingLayer( num_feature_info=num_feature_info, diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index d362b8a..44bdca3 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -1,18 +1,12 @@ import torch import torch.nn as nn -from ..arch_utils.mamba_arch import Mamba +from ..arch_utils.mamba_utils.mamba_arch import Mamba from ..arch_utils.mlp_utils import MLP -from ..arch_utils.normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) from ..configs.mambular_config import DefaultMambularConfig from .basemodel import BaseModel from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.get_norm_fn import get_normalization_layer +from ..arch_utils.mamba_utils.mamba_original import MambaOriginal class Mambular(BaseModel): @@ -88,60 +82,11 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.mamba = Mamba( - 
d_model=self.hparams.get("d_model", config.d_model), - n_layers=self.hparams.get("n_layers", config.n_layers), - expand_factor=self.hparams.get("expand_factor", config.expand_factor), - bias=self.hparams.get("bias", config.bias), - d_conv=self.hparams.get("d_conv", config.d_conv), - conv_bias=self.hparams.get("conv_bias", config.conv_bias), - dropout=self.hparams.get("dropout", config.dropout), - dt_rank=self.hparams.get("dt_rank", config.dt_rank), - d_state=self.hparams.get("d_state", config.d_state), - dt_scale=self.hparams.get("dt_scale", config.dt_scale), - dt_init=self.hparams.get("dt_init", config.dt_init), - dt_max=self.hparams.get("dt_max", config.dt_max), - dt_min=self.hparams.get("dt_min", config.dt_min), - dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), - norm=globals()[self.hparams.get("norm", config.norm)], - activation=self.hparams.get("activation", config.activation), - bidirectional=self.hparams.get("bidiretional", config.bidirectional), - use_learnable_interaction=self.hparams.get( - "use_learnable_interactions", config.use_learnable_interaction - ), - AD_weight_decay=self.hparams.get("AB_weight_decay", config.AD_weight_decay), - BC_layer_norm=self.hparams.get("AB_layer_norm", config.BC_layer_norm), - layer_norm_eps=self.hparams.get("layer_norm_eps", config.layer_norm_eps), - ) - norm_layer = self.hparams.get("norm", config.norm) - if norm_layer == "RMSNorm": - self.norm_f = RMSNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm( - self.hparams.get("d_model", config.d_model), eps=config.layer_norm_eps - ) - elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm( - 1, - self.hparams.get("d_model", config.d_model), - eps=config.layer_norm_eps, - ) - elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling( - self.hparams.get("d_model", config.d_model) - ) + if config.use_mamba_ssm: + self.mamba = MambaOriginal(config) else: - raise ValueError(f"Unsupported normalization layer: {norm_layer}") + self.mamba = Mamba(config) + self.norm_f = get_normalization_layer(config) self.embedding_layer = EmbeddingLayer( num_feature_info=num_feature_info, diff --git a/mambular/configs/mambattention_config.py b/mambular/configs/mambattention_config.py index 0c3ce8b..dbbe2ef 100644 --- a/mambular/configs/mambattention_config.py +++ b/mambular/configs/mambattention_config.py @@ -81,6 +81,8 @@ class DefaultMambAttentionConfig: whether to apply layer normalization to B-C matrices. cat_encoding : str, default="int" whether to use integer encoding or one-hot encoding for cat features. + use_pscan : bool, default=False + whether to use pscan for the ssm """ lr: float = 1e-04 @@ -123,3 +125,4 @@ class DefaultMambAttentionConfig: AD_weight_decay: bool = True BC_layer_norm: bool = False cat_encoding: str = "int" + use_pscan: bool = False diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index 2083961..1683848 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -81,6 +81,10 @@ class DefaultMambularConfig: whether to apply layer normalization to B-C matrices. 
cat_encoding : str, default="int" whether to use integer encoding or one-hot encoding for cat features. + use_pscan : bool, default=False + whether to use pscan for the ssm + use_mamba_ssm : bool, default=False + whether to use mamba_ssm with Triton for the ssm """ lr: float = 1e-04 @@ -119,3 +123,5 @@ class DefaultMambularConfig: AD_weight_decay: bool = True BC_layer_norm: bool = False cat_encoding: str = "int" + use_pscan: bool = False + use_mamba_ssm: bool = False From df60c1c3492634262198c8fac8fca004ca42b984 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 12:03:40 +0200 Subject: [PATCH 022/132] fix import --- mambular/base_models/mambatab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index 0e21dbe..5afd857 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from ..arch_utils.mamba_arch import Mamba +from ..arch_utils.mamba_utils.mamba_arch import Mamba from ..arch_utils.mlp_utils import MLP from ..arch_utils.normalization_layers import ( RMSNorm, From 85a468b81a65ed834b3c68bb54356646a5a7e9a8 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 12:04:30 +0200 Subject: [PATCH 023/132] fix import and config --- mambular/base_models/mambatab.py | 20 +------------------- mambular/base_models/mambular.py | 1 - mambular/configs/mambatab_config.py | 1 + 3 files changed, 2 insertions(+), 20 deletions(-) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index 5afd857..3d504aa 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -66,25 +66,7 @@ def __init__( n_output_units=num_classes, ) - self.mamba = Mamba( - d_model=self.hparams.get("d_model", config.d_model), - n_layers=self.hparams.get("n_layers", config.n_layers), - expand_factor=self.hparams.get("expand_factor", config.expand_factor), - bias=self.hparams.get("bias", config.bias), - d_conv=self.hparams.get("d_conv", config.d_conv), - conv_bias=self.hparams.get("conv_bias", config.conv_bias), - dropout=self.hparams.get("dropout", config.dropout), - dt_rank=self.hparams.get("dt_rank", config.dt_rank), - d_state=self.hparams.get("d_state", config.d_state), - dt_scale=self.hparams.get("dt_scale", config.dt_scale), - dt_init=self.hparams.get("dt_init", config.dt_init), - dt_max=self.hparams.get("dt_max", config.dt_max), - dt_min=self.hparams.get("dt_min", config.dt_min), - dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), - activation=self.hparams.get("activation", config.activation), - bidirectional=False, - use_learnable_interaction=False, - ) + self.mamba = Mamba(config) def forward(self, num_features, cat_features): x = num_features + cat_features diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index 44bdca3..e8358c0 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -1,5 +1,4 @@ import torch -import torch.nn as nn from ..arch_utils.mamba_utils.mamba_arch import Mamba from ..arch_utils.mlp_utils import MLP from ..configs.mambular_config import DefaultMambularConfig diff --git a/mambular/configs/mambatab_config.py b/mambular/configs/mambatab_config.py index 3ebea6f..69e8afd 100644 --- a/mambular/configs/mambatab_config.py +++ b/mambular/configs/mambatab_config.py @@ -92,3 +92,4 @@ class DefaultMambaTabConfig: head_use_batch_norm: bool = False norm: str = "LayerNorm" axis: int = 1 + 
use_pscan: bool = False From 3fdfa96ce6403f40d81ffff9727892e97902a773 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 12:20:48 +0200 Subject: [PATCH 024/132] adjust attn_config --- mambular/configs/mambattention_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mambular/configs/mambattention_config.py b/mambular/configs/mambattention_config.py index dbbe2ef..0f3d7a3 100644 --- a/mambular/configs/mambattention_config.py +++ b/mambular/configs/mambattention_config.py @@ -126,3 +126,4 @@ class DefaultMambAttentionConfig: BC_layer_norm: bool = False cat_encoding: str = "int" use_pscan: bool = False + n_attention_layers: int = 1 From 949bdb1943250e7cdfd50080e27d2f49f3f60069 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 13:01:19 +0200 Subject: [PATCH 025/132] fix d_interemediate-d_state error --- mambular/arch_utils/mamba_utils/mamba_original.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index 2c2a6f2..d7e9b22 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -19,6 +19,8 @@ try: from mamba_ssm import Mamba + + print("successfully imported Mamba from mamba-ssm") except ImportError: Mamba = None @@ -108,7 +110,7 @@ def __init__(self, config): lambda m: _init_weights( m, n_layer=config.n_layers, - n_residuals_per_layer=1 if config.d_intermediate == 0 else 2, + n_residuals_per_layer=1 if config.d_state == 0 else 2, ) ) From 161ceeb23b368dedc8ccb895c1f7620249a3b83e Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 13:06:17 +0200 Subject: [PATCH 026/132] input of original ssm forward pass fix --- mambular/arch_utils/mamba_utils/mamba_original.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index d7e9b22..3e99bd9 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -122,13 +122,12 @@ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs) for i, layer in enumerate(self.layers) } - def forward(self, input_ids, inference_params=None, **mixer_kwargs): - hidden_states = self.embedding(input_ids) + def forward(self, x, inference_params=None, **mixer_kwargs): residual = None for layer in self.layers: hidden_states, residual = layer( - hidden_states, + x, residual, inference_params=inference_params, **mixer_kwargs, From 141b76f0dd54e9fb0b844508c0568f1574e30ef0 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 13:10:00 +0200 Subject: [PATCH 027/132] forward pass original mamba tryout --- mambular/arch_utils/mamba_utils/mamba_original.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index 3e99bd9..7c6e7e5 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -122,15 +122,13 @@ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs) for i, layer in enumerate(self.layers) } - def forward(self, x, inference_params=None, **mixer_kwargs): + def forward(self, x): residual = None for layer in self.layers: hidden_states, residual = layer( x, residual, - inference_params=inference_params, - **mixer_kwargs, ) if 
not self.fused_add_norm: From c0322bcdf74b141a9e5313982f7ca96430fea9eb Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 13:22:55 +0200 Subject: [PATCH 028/132] create ResidualBlock for Original Mamba-ssm --- .../arch_utils/mamba_utils/mamba_original.py | 105 +++++++++++++----- 1 file changed, 78 insertions(+), 27 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index 7c6e7e5..dd89b94 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -1,8 +1,6 @@ import math import torch import torch.nn as nn -import torch.nn.functional as F - from ..normalization_layers import ( RMSNorm, LayerNorm, @@ -12,6 +10,7 @@ GroupNorm, ) + try: from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn except ImportError: @@ -57,6 +56,80 @@ def _init_weights( p /= math.sqrt(n_residuals_per_layer * n_layer) +class ResidualBlock(nn.Module): + """Residual block composed of a MambaBlock and a normalization layer. + + Attributes: + layers (MambaBlock): MambaBlock layers. + norm (RMSNorm): Normalization layer. + """ + + def __init__( + self, + d_model=32, + expand_factor=2, + bias=False, + d_conv=16, + conv_bias=True, + dt_rank="auto", + d_state=32, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + dt_min=1e-03, + dt_init_floor=1e-04, + norm=RMSNorm, + layer_idx=0, + ): + super().__init__() + + VALID_NORMALIZATION_LAYERS = { + "RMSNorm": RMSNorm, + "LayerNorm": LayerNorm, + "LearnableLayerScaling": LearnableLayerScaling, + "BatchNorm": BatchNorm, + "InstanceNorm": InstanceNorm, + "GroupNorm": GroupNorm, + } + + # Check if the provided normalization layer is valid + if isinstance(norm, type) and norm.__name__ not in VALID_NORMALIZATION_LAYERS: + raise ValueError( + f"Invalid normalization layer: {norm.__name__}. " + f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" + ) + elif isinstance(norm, str) and norm not in VALID_NORMALIZATION_LAYERS: + raise ValueError( + f"Invalid normalization layer: {norm}. 
" + f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" + ) + + if dt_rank == "auto": + dt_rank = math.ceil(d_model / 16) + + self.layers = Mamba( + d_model=d_model, + d_state=d_state, + d_conv=d_conv, + expand=expand_factor, + dt_rank=dt_rank, + dt_min=dt_min, + dt_max=dt_max, + dt_init=dt_init, + dt_scale=dt_scale, + dt_init_floor=dt_init_floor, + conv_bias=conv_bias, + bias=bias, + use_fast_path=True, # Fused kernel options + layer_idx=layer_idx, + ) + self.norm = norm + + def forward(self, x): + output = self.layers(self.norm(x)) + x + return output + + class MambaOriginal(nn.Module): def __init__(self, config): super().__init__() @@ -85,7 +158,7 @@ def __init__(self, config): # Initialize Mamba layers based on the configuration self.layers = nn.ModuleList( [ - Mamba( + ResidualBlock( d_model=config.d_model, d_state=config.d_state, d_conv=config.d_conv, @@ -123,29 +196,7 @@ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs) } def forward(self, x): - residual = None - for layer in self.layers: - hidden_states, residual = layer( - x, - residual, - ) - - if not self.fused_add_norm: - residual = ( - (hidden_states + residual) if residual is not None else hidden_states - ) - hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype)) - else: - hidden_states = layer_norm_fn( - hidden_states, - self.norm_f.weight, - self.norm_f.bias, - eps=self.norm_f.eps, - residual=residual, - prenorm=False, - residual_in_fp32=self.residual_in_fp32, - is_rms_norm=isinstance(self.norm_f, RMSNorm), - ) + x = layer(x) - return hidden_states + return x From 0dd1b3799ea46ad3bfe3e6be01aba2cf57b8dab3 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 13:29:31 +0200 Subject: [PATCH 029/132] fix residualblock **kwargs --- mambular/arch_utils/mamba_utils/mamba_original.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index dd89b94..4967552 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -156,13 +156,14 @@ def __init__(self, config): ) # Initialize Mamba layers based on the configuration + self.layers = nn.ModuleList( [ ResidualBlock( d_model=config.d_model, d_state=config.d_state, d_conv=config.d_conv, - expand=config.expand_factor, + expand_factor=config.expand_factor, dt_rank=config.dt_rank, dt_min=config.dt_min, dt_max=config.dt_max, @@ -171,7 +172,6 @@ def __init__(self, config): dt_init_floor=config.dt_init_floor, conv_bias=config.conv_bias, bias=config.bias, - use_fast_path=True, # Fused kernel options layer_idx=i, ) for i in range(config.n_layers) From 4b9066e96736ecf8c8d13752630d61e18ad035e2 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Mon, 21 Oct 2024 13:35:09 +0200 Subject: [PATCH 030/132] fix RMSNorm init --- mambular/arch_utils/mamba_utils/mamba_original.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index 4967552..548c89b 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -9,7 +9,7 @@ InstanceNorm, GroupNorm, ) - +from ..get_norm_fn import get_normalization_layer try: from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn @@ -163,6 +163,7 @@ def __init__(self, config): d_model=config.d_model, 
d_state=config.d_state, d_conv=config.d_conv, + norm=get_normalization_layer(config), expand_factor=config.expand_factor, dt_rank=config.dt_rank, dt_min=config.dt_min, From 0f57b264abcae4fc758ab6e002bdb9669c7e09f0 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 17:39:37 +0200 Subject: [PATCH 031/132] try-out Mamba2 --- .../arch_utils/mamba_utils/init_weights.py | 27 +++ mambular/arch_utils/mamba_utils/mamba_arch.py | 162 ++++++++++++++++-- .../arch_utils/mamba_utils/mamba_original.py | 120 +++++++------ mambular/base_models/mambular.py | 6 +- 4 files changed, 249 insertions(+), 66 deletions(-) create mode 100644 mambular/arch_utils/mamba_utils/init_weights.py diff --git a/mambular/arch_utils/mamba_utils/init_weights.py b/mambular/arch_utils/mamba_utils/init_weights.py new file mode 100644 index 0000000..958b80d --- /dev/null +++ b/mambular/arch_utils/mamba_utils/init_weights.py @@ -0,0 +1,27 @@ +import math +import torch +import torch.nn as nn + +# taken from https://github.com/state-spaces/mamba + + +def _init_weights( + module, + n_layer, + initializer_range=0.02, # Now only used for embedding layer. + rescale_prenorm_residual=True, + n_residuals_per_layer=1, # Change to 2 if we have MLP +): + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + if rescale_prenorm_residual: + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(n_residuals_per_layer * n_layer) diff --git a/mambular/arch_utils/mamba_utils/mamba_arch.py b/mambular/arch_utils/mamba_utils/mamba_arch.py index 5368328..7175231 100644 --- a/mambular/arch_utils/mamba_utils/mamba_arch.py +++ b/mambular/arch_utils/mamba_utils/mamba_arch.py @@ -67,11 +67,70 @@ def forward(self, x): class ResidualBlock(nn.Module): - """Residual block composed of a MambaBlock and a normalization layer. - - Attributes: - layers (MambaBlock): MambaBlock layers. - norm (RMSNorm): Normalization layer. + """ + Residual block composed of a MambaBlock and a normalization layer. + + Parameters + ---------- + d_model : int, optional + Dimension of the model input, by default 32. + expand_factor : int, optional + Expansion factor for the model, by default 2. + bias : bool, optional + Whether to use bias in the MambaBlock, by default False. + d_conv : int, optional + Dimension of the convolution layer in the MambaBlock, by default 16. + conv_bias : bool, optional + Whether to use bias in the convolution layer, by default True. + dropout : float, optional + Dropout rate for the layers, by default 0.01. + dt_rank : Union[str, int], optional + Rank for dynamic time components, 'auto' or an integer, by default 'auto'. + d_state : int, optional + Dimension of the state vector, by default 32. + dt_scale : float, optional + Scale factor for dynamic time components, by default 1.0. + dt_init : str, optional + Initialization strategy for dynamic time components, by default 'random'. + dt_max : float, optional + Maximum value for dynamic time components, by default 0.1. + dt_min : float, optional + Minimum value for dynamic time components, by default 1e-03. + dt_init_floor : float, optional + Floor value for initialization of dynamic time components, by default 1e-04. + norm : callable, optional + Normalization layer, by default RMSNorm. 
+ activation : callable, optional + Activation function used in the MambaBlock, by default `F.silu`. + bidirectional : bool, optional + Whether the block is bidirectional, by default False. + use_learnable_interaction : bool, optional + Whether to use learnable interactions, by default False. + layer_norm_eps : float, optional + Epsilon for layer normalization, by default 1e-05. + AD_weight_decay : bool, optional + Whether to apply weight decay in adaptive dynamics, by default False. + BC_layer_norm : bool, optional + Whether to use layer normalization for batch compatibility, by default False. + use_pscan : bool, optional + Whether to use PSCAN, by default False. + + Attributes + ---------- + layers : MambaBlock + The main MambaBlock layers for processing input. + norm : callable + Normalization layer applied before the MambaBlock. + + Methods + ------- + forward(x) + Performs a forward pass through the block and returns the output. + + Raises + ------ + ValueError + If the provided normalization layer is not valid. """ def __init__( @@ -149,22 +208,94 @@ def __init__( self.norm = norm def forward(self, x): + """ + Forward pass through the residual block. + + Parameters + ---------- + x : torch.Tensor + Input tensor to the block. + + Returns + ------- + torch.Tensor + Output tensor after applying the residual connection and MambaBlock. + """ output = self.layers(self.norm(x)) + x return output class MambaBlock(nn.Module): - """MambaBlock module containing the main computational components. + """ + MambaBlock module containing the main computational components for processing input. + + Parameters + ---------- + d_model : int, optional + Dimension of the model input, by default 32. + expand_factor : int, optional + Factor by which the input is expanded in the block, by default 2. + bias : bool, optional + Whether to use bias in the linear projections, by default False. + d_conv : int, optional + Dimension of the convolution layer, by default 16. + conv_bias : bool, optional + Whether to use bias in the convolution layer, by default True. + dropout : float, optional + Dropout rate applied to the layers, by default 0.01. + dt_rank : Union[str, int], optional + Rank for dynamic time components, either 'auto' or an integer, by default 'auto'. + d_state : int, optional + Dimensionality of the state vector, by default 32. + dt_scale : float, optional + Scale factor applied to the dynamic time component, by default 1.0. + dt_init : str, optional + Initialization strategy for the dynamic time component, by default 'random'. + dt_max : float, optional + Maximum value for dynamic time component initialization, by default 0.1. + dt_min : float, optional + Minimum value for dynamic time component initialization, by default 1e-03. + dt_init_floor : float, optional + Floor value for dynamic time component initialization, by default 1e-04. + activation : callable, optional + Activation function applied in the block, by default `F.silu`. + bidirectional : bool, optional + Whether the block is bidirectional, by default False. + use_learnable_interaction : bool, optional + Whether to use learnable feature interaction, by default False. + layer_norm_eps : float, optional + Epsilon for layer normalization, by default 1e-05. + AD_weight_decay : bool, optional + Whether to apply weight decay in adaptive dynamics, by default False. + BC_layer_norm : bool, optional + Whether to use layer normalization for batch compatibility, by default False. 
+ use_pscan : bool, optional + Whether to use the PSCAN mechanism, by default False. + + Attributes + ---------- + in_proj : nn.Linear + Linear projection applied to the input tensor. + conv1d : nn.Conv1d + 1D convolutional layer for processing input. + x_proj : nn.Linear + Linear projection applied to input-dependent tensors. + dt_proj : nn.Linear + Linear projection for the dynamical time component. + A_log : nn.Parameter + Logarithmically stored tensor A for internal dynamics. + D : nn.Parameter + Tensor for the D component of the model's dynamics. + out_proj : nn.Linear + Linear projection applied to the output. + learnable_interaction : LearnableFeatureInteraction + Layer for learnable feature interactions, if `use_learnable_interaction` is True. + + Methods + ------- + forward(x) + Performs a forward pass through the MambaBlock. - Attributes: - in_proj (nn.Linear): Linear projection for input. - conv1d (nn.Conv1d): 1D convolutional layer. - x_proj (nn.Linear): Linear projection for input-dependent tensors. - dt_proj (nn.Linear): Linear projection for dynamical time. - A_log (nn.Parameter): Logarithmically stored A tensor. - D (nn.Parameter): Tensor for D component. - out_proj (nn.Linear): Linear projection for output. - learnable_interaction (LearnableFeatureInteraction): Learnable feature interaction layer. """ def __init__( @@ -341,6 +472,7 @@ def forward(self, x): x_bwd = self.dropout(x_bwd) y_bwd = self.ssm(torch.flip(x_bwd, [1]), forward=False) y = y_fwd + torch.flip(y_bwd, [1]) + y = y / 2 else: y = y_fwd diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index 548c89b..ff25e9a 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -10,50 +10,7 @@ GroupNorm, ) from ..get_norm_fn import get_normalization_layer - -try: - from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn -except ImportError: - RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None - -try: - from mamba_ssm import Mamba - - print("successfully imported Mamba from mamba-ssm") -except ImportError: - Mamba = None - - -def _init_weights( - module, - n_layer, - initializer_range=0.02, # Now only used for embedding layer. - rescale_prenorm_residual=True, - n_residuals_per_layer=1, # Change to 2 if we have MLP -): - if isinstance(module, nn.Linear): - if module.bias is not None: - if not getattr(module.bias, "_no_reinit", False): - nn.init.zeros_(module.bias) - elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, std=initializer_range) - - if rescale_prenorm_residual: - # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: - # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale - # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
- # > -- GPT-2 :: https://openai.com/blog/better-language-models/ - # - # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py - for name, p in module.named_parameters(): - if name in ["out_proj.weight", "fc2.weight"]: - # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block - # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) - # We need to reinit p since this code could be called multiple times - # Having just p *= scale would repeatedly scale it down - nn.init.kaiming_uniform_(p, a=math.sqrt(5)) - with torch.no_grad(): - p /= math.sqrt(n_residuals_per_layer * n_layer) +from .init_weights import _init_weights class ResidualBlock(nn.Module): @@ -80,6 +37,7 @@ def __init__( dt_init_floor=1e-04, norm=RMSNorm, layer_idx=0, + mamba_version="mamba1", ): super().__init__() @@ -107,7 +65,10 @@ def __init__( if dt_rank == "auto": dt_rank = math.ceil(d_model / 16) - self.layers = Mamba( + # Lazy import for Mamba and only import if it's None + self._lazy_import_mamba(mamba_version) + + self.layers = MambaBlock( d_model=d_model, d_state=d_state, d_conv=d_conv, @@ -125,6 +86,28 @@ def __init__( ) self.norm = norm + def _lazy_import_mamba(self, mamba_version): + """Lazily import Mamba or Mamba2 based on the provided version and alias it.""" + global OriginalMambaBlock + if MambaBlock is None: + try: + if mamba_version == "mamba1": + from mamba_ssm import Mamba as MambaBlock + + print("Successfully imported Mamba (version 1)") + elif mamba_version == "mamba2": + from mamba_ssm import Mamba2 as MambaBlock + + print("Successfully imported Mamba2") + else: + raise ValueError( + f"Invalid mamba_version: {mamba_version}. Choose 'mamba1' or 'mamba2'." + ) + except ImportError: + raise ImportError( + f"Failed to import {mamba_version}. Please ensure the correct version is installed. 
Install it via pip install mamba-ssm" + ) + def forward(self, x): output = self.layers(self.norm(x)) + x return output @@ -145,6 +128,7 @@ def __init__(self, config): # Get normalization layer from config norm = config.norm + self.bidirectional = config.bidirectional if isinstance(norm, str) and norm in VALID_NORMALIZATION_LAYERS: self.norm_f = VALID_NORMALIZATION_LAYERS[norm]( config.d_model, eps=config.layer_norm_eps @@ -157,9 +141,10 @@ def __init__(self, config): # Initialize Mamba layers based on the configuration - self.layers = nn.ModuleList( + self.fwd_layers = nn.ModuleList( [ ResidualBlock( + mamba_version=config.mamba_version, d_model=config.d_model, d_state=config.d_state, d_conv=config.d_conv, @@ -179,6 +164,30 @@ def __init__(self, config): ] ) + if self.bidirectional: + self.bckwd_layers = nn.ModuleList( + [ + ResidualBlock( + mamba_version=config.mamba_version, + d_model=config.d_model, + d_state=config.d_state, + d_conv=config.d_conv, + norm=get_normalization_layer(config), + expand_factor=config.expand_factor, + dt_rank=config.dt_rank, + dt_min=config.dt_min, + dt_max=config.dt_max, + dt_init=config.dt_init, + dt_scale=config.dt_scale, + dt_init_floor=config.dt_init_floor, + conv_bias=config.conv_bias, + bias=config.bias, + layer_idx=i + config.n_layers, + ) + for i in range(config.n_layers) + ] + ) + # Apply weight initialization self.apply( lambda m: _init_weights( @@ -197,7 +206,22 @@ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs) } def forward(self, x): - for layer in self.layers: - x = layer(x) + if self.bidirectional: + # Reverse input and pass through backward layers + x_reversed = torch.flip(x, [1]) + # Forward pass through forward layers + for layer in self.fwd_layers: + x = layer(x) # Update x in-place as each forward layer processes it + + if self.bidirectional: + for layer in self.bckwd_layers: + x_reversed = layer(x_reversed) + + # Reverse the output of the backward pass to original order + x_reversed = torch.flip(x_reversed, [1]) + + # Combine forward and backward outputs by averaging + return (x + x_reversed) / 2 + # Return forward output only if not bidirectional return x diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index e8358c0..73c0fe2 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -81,10 +81,10 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - if config.use_mamba_ssm: - self.mamba = MambaOriginal(config) - else: + if config.mamba_version == "mamba-torch": self.mamba = Mamba(config) + else: + self.mamba = MambaOriginal(config) self.norm_f = get_normalization_layer(config) self.embedding_layer = EmbeddingLayer( From c10d40a43ae45886b86c225c6b112486df59f10e Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 17:55:27 +0200 Subject: [PATCH 032/132] adjust config to new params --- mambular/configs/mambular_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index 1683848..725a9a1 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -83,8 +83,8 @@ class DefaultMambularConfig: whether to use integer encoding or one-hot encoding for cat features. 
use_pscan : bool, default=False whether to use pscan for the ssm - use_mamba_ssm : bool, default=False - whether to use mamba_ssm with Triton for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" """ lr: float = 1e-04 @@ -124,4 +124,4 @@ class DefaultMambularConfig: BC_layer_norm: bool = False cat_encoding: str = "int" use_pscan: bool = False - use_mamba_ssm: bool = False + mamba_version: str = "mamba-torch" From c90c9fd8e91ea06567e0a1507f763ab219978b49 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 18:05:42 +0200 Subject: [PATCH 033/132] adjust mamba-ssm import --- mambular/arch_utils/mamba_utils/mamba_original.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index ff25e9a..5fb8125 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -88,7 +88,7 @@ def __init__( def _lazy_import_mamba(self, mamba_version): """Lazily import Mamba or Mamba2 based on the provided version and alias it.""" - global OriginalMambaBlock + global MambaBlock if MambaBlock is None: try: if mamba_version == "mamba1": From b424e8bc5e48342fc48c84a5ea174e31126dff0f Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 18:19:45 +0200 Subject: [PATCH 034/132] adjust _lazy_import in ResidualBlock --- .../arch_utils/mamba_utils/mamba_original.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index 5fb8125..a200e9c 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -21,6 +21,8 @@ class ResidualBlock(nn.Module): norm (RMSNorm): Normalization layer. """ + MambaBlock = None # Declare MambaBlock at the class level + def __init__( self, d_model=32, @@ -41,6 +43,10 @@ def __init__( ): super().__init__() + # Lazy import for Mamba and only import if it's None + if ResidualBlock.MambaBlock is None: + self._lazy_import_mamba(mamba_version) + VALID_NORMALIZATION_LAYERS = { "RMSNorm": RMSNorm, "LayerNorm": LayerNorm, @@ -65,10 +71,8 @@ def __init__( if dt_rank == "auto": dt_rank = math.ceil(d_model / 16) - # Lazy import for Mamba and only import if it's None - self._lazy_import_mamba(mamba_version) - - self.layers = MambaBlock( + # Use the imported MambaBlock to create layers + self.layers = ResidualBlock.MambaBlock( d_model=d_model, d_state=d_state, d_conv=d_conv, @@ -88,16 +92,17 @@ def __init__( def _lazy_import_mamba(self, mamba_version): """Lazily import Mamba or Mamba2 based on the provided version and alias it.""" - global MambaBlock - if MambaBlock is None: + if ResidualBlock.MambaBlock is None: try: if mamba_version == "mamba1": from mamba_ssm import Mamba as MambaBlock + ResidualBlock.MambaBlock = MambaBlock print("Successfully imported Mamba (version 1)") elif mamba_version == "mamba2": from mamba_ssm import Mamba2 as MambaBlock + ResidualBlock.MambaBlock = MambaBlock print("Successfully imported Mamba2") else: raise ValueError( @@ -105,7 +110,7 @@ def _lazy_import_mamba(self, mamba_version): ) except ImportError: raise ImportError( - f"Failed to import {mamba_version}. Please ensure the correct version is installed. Install it via pip install mamba-ssm" + f"Failed to import {mamba_version}. Please ensure the correct version is installed." 
) def forward(self, x): From 9a04905b36d16c23bb26f61b7175e9c85503abf9 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 21:21:07 +0200 Subject: [PATCH 035/132] include configs in package --- mambular/configs/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mambular/configs/__init__.py b/mambular/configs/__init__.py index e69de29..a990e05 100644 --- a/mambular/configs/__init__.py +++ b/mambular/configs/__init__.py @@ -0,0 +1,20 @@ +from .mambular_config import DefaultMambularConfig +from .fttransformer_config import DefaultFTTransformerConfig +from .resnet_config import DefaultResNetConfig +from .mlp_config import DefaultMLPConfig +from .tabtransformer_config import DefaultTabTransformerConfig +from .mambatab_config import DefaultMambaTabConfig +from .tabularnn_config import DefaultTabulaRNNConfig +from .mambattention_config import DefaultMambAttentionConfig + + +__all__ = [ + "DefaultMambularConfig", + "DefaultFTTransformerConfig", + "DefaultResNetConfig", + "DefaultMLPConfig", + "DefaultTabTransformerConfig", + "DefaultMambaTabConfig", + "DefaultTabulaRNNConfig", + "DefaultMambAttentionConfig", +] From 31ddf22c690ed183883e73739f84bfc50adfdd79 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 21:21:21 +0200 Subject: [PATCH 036/132] adjust mambatab config to use mamba-ssm and mamba2 --- mambular/configs/mambatab_config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mambular/configs/mambatab_config.py b/mambular/configs/mambatab_config.py index 69e8afd..4927c35 100644 --- a/mambular/configs/mambatab_config.py +++ b/mambular/configs/mambatab_config.py @@ -63,6 +63,10 @@ class DefaultMambaTabConfig: Whether to use batch normalization in the head layers. layer_norm_after_embedding : bool, default=False Whether to apply layer normalization after embedding. + use_pscan : bool, default=False + whether to use pscan for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" """ lr: float = 1e-04 @@ -93,3 +97,4 @@ class DefaultMambaTabConfig: norm: str = "LayerNorm" axis: int = 1 use_pscan: bool = False + mamba_version: str = "mamba-torch" From 8b08f8e4aba5777cc270cdf88d96f42b17e57363 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 21:21:37 +0200 Subject: [PATCH 037/132] adjust docstrings to document new params --- mambular/models/mambatab.py | 12 ++++++++++++ mambular/models/mambular.py | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/mambular/models/mambatab.py b/mambular/models/mambatab.py index baccad2..5b72957 100644 --- a/mambular/models/mambatab.py +++ b/mambular/models/mambatab.py @@ -71,6 +71,10 @@ class MambaTabRegressor(SklearnBaseRegressor): Normalization method to be used. axis : int, default=1 Axis over which Mamba iterates. If 1, it iterates over the rows; if 0, it iterates over the columns. + use_pscan : bool, default=False + whether to use pscan for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -167,6 +171,10 @@ class MambaTabClassifier(SklearnBaseClassifier): Normalization method to be used. axis : int, default=1 Axis over which Mamba iterates. If 1, it iterates over the rows; if 0, it iterates over the columns. 
+ use_pscan : bool, default=False + whether to use pscan for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -265,6 +273,10 @@ class MambaTabLSS(SklearnBaseLSS): Normalization method to be used. axis : int, default=1 Axis over which Mamba iterates. If 1, it iterates over the rows; if 0, it iterates over the columns. + use_pscan : bool, default=False + whether to use pscan for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. diff --git a/mambular/models/mambular.py b/mambular/models/mambular.py index ef65ceb..1d4c7ae 100644 --- a/mambular/models/mambular.py +++ b/mambular/models/mambular.py @@ -87,6 +87,10 @@ class MambularRegressor(SklearnBaseRegressor): whether to apply layer normalization to B-C matrices. cat_encoding : str, default="int" whether to use integer encoding or one-hot encoding for cat features. + use_pscan : bool, default=False + whether to use pscan for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -214,6 +218,10 @@ class MambularClassifier(SklearnBaseClassifier): whether to apply layer normalization to B-C matrices. cat_encoding : str, default="int" whether to use integer encoding or one-hot encoding for cat features. + use_pscan : bool, default=False + whether to use pscan for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -333,6 +341,20 @@ class MambularLSS(SklearnBaseLSS): Whether to use bidirectional processing of the input sequences. use_learnable_interaction : bool, default=False Whether to use learnable feature interactions before passing through mamba blocks. + shuffle_embeddings : bool, default=False. + Whether to shuffle the embeddings before being passed to the Mamba layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + whether weight decay is also applied to A-D matrices. + BC_layer_norm: bool, default=False + whether to apply layer normalization to B-C matrices. + cat_encoding : str, default="int" + whether to use integer encoding or one-hot encoding for cat features. + use_pscan : bool, default=False + whether to use pscan for the ssm + mamba_version : str, default="mamba-torch" + options are "mamba-torch", "mamba1" and "mamba2" n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. 
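The `use_pscan` and `mamba_version` options documented above are ordinary estimator arguments, so switching between the pure-PyTorch scan, `mambapy`'s parallel scan, and the fused `mamba-ssm` kernels should not require touching any other code. The sketch below is illustrative only, not taken from the patch: the data is synthetic, the hyperparameters are arbitrary, and it assumes the wrappers' usual scikit-learn-style `fit`/`predict` interface. `use_pscan=True` additionally needs `mambapy`, while `"mamba1"`/`"mamba2"` need `mamba-ssm`.

```python
import numpy as np
import pandas as pd

from mambular.models import MambularRegressor

# Tiny synthetic stand-in for a real tabular dataset.
rng = np.random.default_rng(0)
X = pd.DataFrame({"num_a": rng.normal(size=256), "num_b": rng.normal(size=256)})
y = 2.0 * X["num_a"].to_numpy() + rng.normal(scale=0.1, size=256)

# Default pure-PyTorch backend; set use_pscan=True only if mambapy is installed.
model = MambularRegressor(d_model=32, n_layers=2, mamba_version="mamba-torch", use_pscan=False)

# With mamba-ssm installed, the fused kernels could be selected instead:
# model = MambularRegressor(mamba_version="mamba2")

model.fit(X, y, max_epochs=5)
print(model.predict(X)[:5])
```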
From 426e6c66b419512c30e088cbd47d33e9f83096d7 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 21:21:58 +0200 Subject: [PATCH 038/132] delete unnecessary imports in mambatab --- mambular/base_models/mambatab.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index 3d504aa..f436590 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -3,15 +3,11 @@ from ..arch_utils.mamba_utils.mamba_arch import Mamba from ..arch_utils.mlp_utils import MLP from ..arch_utils.normalization_layers import ( - RMSNorm, LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, ) from ..configs.mambatab_config import DefaultMambaTabConfig from .basemodel import BaseModel +from ..arch_utils.mamba_utils.mamba_original import MambaOriginal class MambaTab(BaseModel): @@ -66,7 +62,10 @@ def __init__( n_output_units=num_classes, ) - self.mamba = Mamba(config) + if config.mamba_version == "mamba-torch": + self.mamba = Mamba(config) + else: + self.mamba = MambaOriginal(config) def forward(self, num_features, cat_features): x = num_features + cat_features From 617d6f3e08781b25d26e9c65cb369f9702b38651 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 21:22:05 +0200 Subject: [PATCH 039/132] include new models in readme --- README.md | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8c1211f..6582226 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ Mambular is a Python library for tabular deep learning. It includes models that - [🛠️ Installation](#️-installation) - [🚀 Usage](#-usage) - [💻 Implement Your Own Model](#-implement-your-own-model) +- [🏷️ Citation](#️-citation) +- [License](#license) # 🏃 Quickstart @@ -51,15 +53,17 @@ Mambular is a Python package that brings the power of advanced deep learning arc # 🤖 Models -| Model | Description | -| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Mambular` | A sequential model using Mamba blocks [Gu and Dao](https://arxiv.org/pdf/2312.00752) specifically designed for various tabular data tasks. | -| `FTTransformer` | A model leveraging transformer encoders, as introduced by [Gorishniy et al.](https://arxiv.org/abs/2106.11959), for tabular data. | -| `MLP` | A classical Multi-Layer Perceptron (MLP) model for handling tabular data tasks. | -| `ResNet` | An adaptation of the ResNet architecture for tabular data applications. | -| `TabTransformer` | A transformer-based model for tabular data introduced by [Huang et al.](https://arxiv.org/abs/2012.06678), enhancing feature learning capabilities. | -| `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867) . Not a sequential model. | -| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks | +| Model | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Mambular` | A sequential model using Mamba blocks [Gu and Dao](https://arxiv.org/pdf/2312.00752) specifically designed for various tabular data tasks. 
| +| `FTTransformer` | A model leveraging transformer encoders, as introduced by [Gorishniy et al.](https://arxiv.org/abs/2106.11959), for tabular data. | +| `MLP` | A classical Multi-Layer Perceptron (MLP) model for handling tabular data tasks. | +| `ResNet` | An adaptation of the ResNet architecture for tabular data applications. | +| `TabTransformer` | A transformer-based model for tabular data introduced by [Huang et al.](https://arxiv.org/abs/2012.06678), enhancing feature learning capabilities. | +| `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867). Not a sequential model. | +| `TabulaRNN` | A Recurrent Neural Network for tabular data. Not yet included in the benchmarks. | +| `MambAttention` | A combination of Mamba and Transformers, similar to Jamba by [Lieber et al.](https://arxiv.org/abs/2403.19887). Not yet included in the benchmarks. | + All models are available for `regression`, `classification` and distributional regression, denoted by `LSS`. @@ -198,6 +202,7 @@ MambularLSS allows you to model the full distribution of a response variable, no - **negativebinom**: For over-dispersed count data. - **inversegamma**: Often used as a prior in Bayesian inference. - **categorical**: For data with more than two categories. +- **quantile**: For quantile regression using the pinball loss. These distribution classes make MambularLSS versatile in modeling various data types and distributions. @@ -308,4 +313,18 @@ Here's how you can implement a custom model with Mambular: regressor.fit(X_train, y_train, max_epochs=50) ``` +# 🏷️ Citation + +If you find this project useful in your research, please consider citing it: +```BibTeX +@article{thielmann2024mambular, + title={Mambular: A Sequential Model for Tabular Deep Learning}, + author={Thielmann, Anton Frederik and Kumar, Manish and Weisser, Christoph and Reuter, Arik and S{\"a}fken, Benjamin and Samiee, Soheila}, + journal={arXiv preprint arXiv:2408.06291}, + year={2024} +} +``` + +# License +The entire codebase is released under the MIT license. From 10881ebad07f1037793fd6d5bac00628e0607840 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 24 Oct 2024 21:25:00 +0200 Subject: [PATCH 040/132] include mamba-ssm installation in readme --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 6582226..49b017b 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,19 @@ Install Mambular using pip: pip install mambular ``` +If you want to use the original Mamba and Mamba2 implementations, additionally install mamba-ssm via: + +```sh +pip install mamba-ssm +``` + +Be careful to use torch and CUDA versions that match each other and your hardware: + +```sh +pip install torch==2.0.0+cu118 torchvision==0.15.0+cu118 torchaudio==2.0.0+cu118 -f https://download.pytorch.org/whl/cu118/torch_stable.html +pip install mamba-ssm +``` + # 🚀 Usage

Preprocessing

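Before switching a model to `mamba_version="mamba1"` or `"mamba2"`, a small sanity check along these lines can confirm that a CUDA-enabled torch build and the optional mamba-ssm package are actually importable; this snippet is only a sketch and not part of the README itself.

```python
import torch

# The fused Mamba kernels need a CUDA build of torch that matches the driver.
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())

try:
    import mamba_ssm  # installed via `pip install mamba-ssm`
    print("mamba-ssm import OK")
except ImportError:
    print("mamba-ssm not installed; fall back to mamba_version='mamba-torch'")
```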
From ef04001a64ded937fd0ab22edf05a8fa9d889f74 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 10:46:16 +0000 Subject: [PATCH 041/132] fix preprocessor kwargs typo --- mambular/models/sklearn_base_classifier.py | 4 ++-- mambular/models/sklearn_base_lss.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 1e09dd7..187cfec 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -22,7 +22,7 @@ class SklearnBaseClassifier(BaseEstimator): def __init__(self, model, config, **kwargs): - preprocessor_arg_names = [ + self.preprocessor_arg_names = [ "n_bins", "numerical_preprocessing", "use_decision_tree_bins", @@ -42,7 +42,7 @@ def __init__(self, model, config, **kwargs): self.config = config(**self.config_kwargs) preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k in preprocessor_arg_names + k: v for k, v in kwargs.items() if k in self.preprocessor_arg_names } self.preprocessor = Preprocessor(**preprocessor_kwargs) diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index 5b55148..132954d 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -44,7 +44,7 @@ class SklearnBaseLSS(BaseEstimator): def __init__(self, model, config, **kwargs): - preprocessor_arg_names = [ + self.preprocessor_arg_names = [ "n_bins", "numerical_preprocessing", "use_decision_tree_bins", @@ -64,7 +64,7 @@ def __init__(self, model, config, **kwargs): self.config = config(**self.config_kwargs) preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k in preprocessor_arg_names + k: v for k, v in kwargs.items() if k in self.preprocessor_arg_names } self.preprocessor = Preprocessor(**preprocessor_kwargs) From 1c529f03a433200c5143aae7a12e1deeb1db751b Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 10:46:42 +0000 Subject: [PATCH 042/132] restructure utils and add neural decision tree --- mambular/arch_utils/get_norm_fn.py | 2 +- mambular/arch_utils/layer_utils/__init__.py | 0 .../attention_net_arch_utils.py | 0 .../{ => layer_utils}/attention_utils.py | 0 .../arch_utils/layer_utils/block_diagonal.py | 26 ++ .../{ => layer_utils}/embedding_layer.py | 32 +- .../arch_utils/layer_utils/embedding_tree.py | 83 ++++ .../{ => layer_utils}/normalization_layers.py | 0 .../{ => layer_utils}/poly_layer.py | 0 .../{ => layer_utils}/rotary_utils.py | 0 mambular/arch_utils/lstm_utils.py | 354 ++++++++++++++++++ mambular/arch_utils/mamba_utils/mamba_arch.py | 5 +- .../arch_utils/mamba_utils/mamba_original.py | 2 +- mambular/arch_utils/neural_decision_tree.py | 184 +++++++++ mambular/arch_utils/rnn_utils.py | 56 +-- 15 files changed, 707 insertions(+), 37 deletions(-) create mode 100644 mambular/arch_utils/layer_utils/__init__.py rename mambular/arch_utils/{ => layer_utils}/attention_net_arch_utils.py (100%) rename mambular/arch_utils/{ => layer_utils}/attention_utils.py (100%) create mode 100644 mambular/arch_utils/layer_utils/block_diagonal.py rename mambular/arch_utils/{ => layer_utils}/embedding_layer.py (86%) create mode 100644 mambular/arch_utils/layer_utils/embedding_tree.py rename mambular/arch_utils/{ => layer_utils}/normalization_layers.py (100%) rename mambular/arch_utils/{ => layer_utils}/poly_layer.py (100%) rename mambular/arch_utils/{ => layer_utils}/rotary_utils.py (100%) create mode 100644 mambular/arch_utils/lstm_utils.py create mode 100644 
mambular/arch_utils/neural_decision_tree.py diff --git a/mambular/arch_utils/get_norm_fn.py b/mambular/arch_utils/get_norm_fn.py index 42bf83b..f90352e 100644 --- a/mambular/arch_utils/get_norm_fn.py +++ b/mambular/arch_utils/get_norm_fn.py @@ -1,4 +1,4 @@ -from .normalization_layers import ( +from .layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, diff --git a/mambular/arch_utils/layer_utils/__init__.py b/mambular/arch_utils/layer_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mambular/arch_utils/attention_net_arch_utils.py b/mambular/arch_utils/layer_utils/attention_net_arch_utils.py similarity index 100% rename from mambular/arch_utils/attention_net_arch_utils.py rename to mambular/arch_utils/layer_utils/attention_net_arch_utils.py diff --git a/mambular/arch_utils/attention_utils.py b/mambular/arch_utils/layer_utils/attention_utils.py similarity index 100% rename from mambular/arch_utils/attention_utils.py rename to mambular/arch_utils/layer_utils/attention_utils.py diff --git a/mambular/arch_utils/layer_utils/block_diagonal.py b/mambular/arch_utils/layer_utils/block_diagonal.py new file mode 100644 index 0000000..c0174fd --- /dev/null +++ b/mambular/arch_utils/layer_utils/block_diagonal.py @@ -0,0 +1,26 @@ +import torch.nn as nn +import torch + + +class BlockDiagonal(nn.Module): + def __init__(self, in_features, out_features, num_blocks, bias=True): + super(BlockDiagonal, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.num_blocks = num_blocks + + assert out_features % num_blocks == 0 + + block_out_features = out_features // num_blocks + + self.blocks = nn.ModuleList( + [ + nn.Linear(in_features, block_out_features, bias=bias) + for _ in range(num_blocks) + ] + ) + + def forward(self, x): + x = [block(x) for block in self.blocks] + x = torch.cat(x, dim=-1) + return x diff --git a/mambular/arch_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py similarity index 86% rename from mambular/arch_utils/embedding_layer.py rename to mambular/arch_utils/layer_utils/embedding_layer.py index 43fe453..3a5846f 100644 --- a/mambular/arch_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn +from .embedding_tree import NeuralEmbeddingTree class EmbeddingLayer(nn.Module): @@ -13,6 +14,7 @@ def __init__( use_cls=False, cls_position=0, cat_encoding="int", + embedding_layer="linear", ): """ Embedding layer that handles numerical and categorical embeddings. 
@@ -46,16 +48,28 @@ def __init__( self.layer_norm_after_embedding = layer_norm_after_embedding self.use_cls = use_cls self.cls_position = cls_position + if embedding_layer == "ndt": + self.num_embeddings = nn.ModuleList( + [ + nn.Sequential( + NeuralEmbeddingTree( + input_dim=1, output_dim=d_model, temperature=0.3 + ), + ) + for feature_name, input_shape in num_feature_info.items() + ] + ) - self.num_embeddings = nn.ModuleList( - [ - nn.Sequential( - nn.Linear(input_shape, d_model, bias=False), - self.embedding_activation, - ) - for feature_name, input_shape in num_feature_info.items() - ] - ) + else: + self.num_embeddings = nn.ModuleList( + [ + nn.Sequential( + nn.Linear(input_shape, d_model, bias=False), + self.embedding_activation, + ) + for feature_name, input_shape in num_feature_info.items() + ] + ) self.cat_embeddings = nn.ModuleList() for feature_name, num_categories in cat_feature_info.items(): diff --git a/mambular/arch_utils/layer_utils/embedding_tree.py b/mambular/arch_utils/layer_utils/embedding_tree.py new file mode 100644 index 0000000..bab24ac --- /dev/null +++ b/mambular/arch_utils/layer_utils/embedding_tree.py @@ -0,0 +1,83 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class NeuralEmbeddingTree(nn.Module): + def __init__( + self, + input_dim, + output_dim, + temperature=0.0, + ): + """ + Initialize the neural decision tree with a neural network at each leaf. + + Parameters: + ----------- + input_dim: int + The number of input features. + depth: int + The depth of the tree. The number of leaves will be 2^depth. + output_dim: int + The number of output classes (default is 1 for regression tasks). + lamda: float + Regularization parameter. + """ + super(NeuralEmbeddingTree, self).__init__() + + self.temperature = temperature + self.output_dim = output_dim + self.depth = int(math.log2(output_dim)) + + # Initialize internal nodes with linear layers followed by hard thresholds + self.inner_nodes = nn.Sequential( + nn.Linear(input_dim + 1, output_dim, bias=False), + ) + + def forward(self, X): + """Implementation of the forward pass with hard decision boundaries.""" + batch_size = X.size()[0] + X = self._data_augment(X) + + # Get the decision boundaries for the internal nodes + decision_boundaries = self.inner_nodes(X) + + # Apply hard thresholding to simulate binary decisions + if self.temperature > 0.0: + # Replace sigmoid with Gumbel-Softmax for path_prob calculation + logits = decision_boundaries / self.temperature + path_prob = ( + (logits > 0).float() + logits.sigmoid() - logits.sigmoid().detach() + ) + else: + path_prob = (decision_boundaries > 0).float() + + # Prepare for routing at the internal nodes + path_prob = torch.unsqueeze(path_prob, dim=2) + path_prob = torch.cat((path_prob, 1 - path_prob), dim=2) + + _mu = X.data.new(batch_size, 1, 1).fill_(1.0) + + # Iterate through internal nodes in each layer to compute the final path + # probabilities and the regularization term. 
+ begin_idx = 0 + end_idx = 1 + + for layer_idx in range(0, self.depth): + _path_prob = path_prob[:, begin_idx:end_idx, :] + + _mu = _mu.view(batch_size, -1, 1).repeat(1, 1, 2) + + _mu = _mu * _path_prob  # update path probabilities + + begin_idx = end_idx + end_idx = begin_idx + 2 ** (layer_idx + 1) + + mu = _mu.view(batch_size, self.output_dim) + + return mu + + def _data_augment(self, X): + return F.pad(X, (1, 0), value=1) diff --git a/mambular/arch_utils/normalization_layers.py b/mambular/arch_utils/layer_utils/normalization_layers.py similarity index 100% rename from mambular/arch_utils/normalization_layers.py rename to mambular/arch_utils/layer_utils/normalization_layers.py diff --git a/mambular/arch_utils/poly_layer.py b/mambular/arch_utils/layer_utils/poly_layer.py similarity index 100% rename from mambular/arch_utils/poly_layer.py rename to mambular/arch_utils/layer_utils/poly_layer.py diff --git a/mambular/arch_utils/rotary_utils.py b/mambular/arch_utils/layer_utils/rotary_utils.py similarity index 100% rename from mambular/arch_utils/rotary_utils.py rename to mambular/arch_utils/layer_utils/rotary_utils.py diff --git a/mambular/arch_utils/lstm_utils.py b/mambular/arch_utils/lstm_utils.py new file mode 100644 index 0000000..28396e1 --- /dev/null +++ b/mambular/arch_utils/lstm_utils.py @@ -0,0 +1,354 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .layer_utils.block_diagonal import BlockDiagonal + + +class mLSTMblock(nn.Module): + """ + mLSTM block with convolutions, gated mechanisms, and projection layers. + + Parameters + ---------- + input_size : int + Size of the input features. + hidden_size : int + Size of the hidden state. + num_layers : int + Depth of block diagonal layers. + dropout : float, optional + Dropout probability (default is 0.2).
+ """ + + def __init__( + self, + input_size, + hidden_size, + num_layers, + bidirectional=None, + batch_first=None, + nonlinearity=F.silu, + dropout=0.2, + bias=True, + ): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.activation = nonlinearity + + self.ln = nn.LayerNorm(self.input_size) + + self.left = nn.Linear(self.input_size, self.hidden_size) + self.right = nn.Linear(self.input_size, self.hidden_size) + + self.conv = nn.Conv1d( + in_channels=self.hidden_size, # Hidden size for subsequent layers + out_channels=self.hidden_size, # Output channels + kernel_size=3, + padding="same", # Padding to maintain sequence length + bias=True, + groups=self.hidden_size, + ) + self.drop = nn.Dropout(dropout + 0.1) + + self.lskip = nn.Linear(self.hidden_size, self.hidden_size) + + self.wq = BlockDiagonal( + in_features=self.hidden_size, + out_features=self.hidden_size, + num_blocks=num_layers, + bias=bias, + ) + self.wk = BlockDiagonal( + in_features=self.hidden_size, + out_features=self.hidden_size, + num_blocks=num_layers, + bias=bias, + ) + self.wv = BlockDiagonal( + in_features=self.hidden_size, + out_features=self.hidden_size, + num_blocks=num_layers, + bias=bias, + ) + self.dropq = nn.Dropout(dropout / 2) + self.dropk = nn.Dropout(dropout / 2) + self.dropv = nn.Dropout(dropout / 2) + + self.i_gate = nn.Linear(self.hidden_size, self.hidden_size) + self.f_gate = nn.Linear(self.hidden_size, self.hidden_size) + self.o_gate = nn.Linear(self.hidden_size, self.hidden_size) + + self.ln_c = nn.LayerNorm(self.hidden_size) + self.ln_n = nn.LayerNorm(self.hidden_size) + + self.lnf = nn.LayerNorm(self.hidden_size) + self.lno = nn.LayerNorm(self.hidden_size) + self.lni = nn.LayerNorm(self.hidden_size) + + self.GN = nn.LayerNorm(self.hidden_size) + self.ln_out = nn.LayerNorm(self.hidden_size) + + self.drop2 = nn.Dropout(dropout) + + self.proj = nn.Linear(self.hidden_size, self.hidden_size) + self.ln_proj = nn.LayerNorm(self.hidden_size) + + # Remove fixed-size initializations for dynamic state initialization + self.ct_1 = None + self.nt_1 = None + + def init_states(self, batch_size, seq_length, device): + """ + Initialize the state tensors with the correct batch and sequence dimensions. + + Parameters + ---------- + batch_size : int + The batch size. + seq_length : int + The sequence length. + device : torch.device + The device to place the tensors on. + """ + self.ct_1 = torch.zeros(batch_size, seq_length, self.hidden_size, device=device) + self.nt_1 = torch.zeros(batch_size, seq_length, self.hidden_size, device=device) + + def forward(self, x): + """ + Forward pass through mLSTM block. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch, sequence_length, input_size). + + Returns + ------- + torch.Tensor + Output tensor of shape (batch, sequence_length, input_size). 
+ """ + assert x.ndim == 3 + B, N, D = x.shape + device = x.device + + # Initialize states dynamically based on input shape + if self.ct_1 is None or self.ct_1.shape[0] != B or self.ct_1.shape[1] != N: + self.init_states(B, N, device) + + x = self.ln(x) # layer norm on x + + left = self.left(x) # part left + right = self.activation( + self.right(x) + ) # part right with just swish (silu) function + + left_left = left.transpose(1, 2) + left_left = self.activation(self.drop(self.conv(left_left).transpose(1, 2))) + l_skip = self.lskip(left_left) + + # start mLSTM + q = self.dropq(self.wq(left_left)) + k = self.dropk(self.wk(left_left)) + v = self.dropv(self.wv(left)) + + i = torch.exp(self.lni(self.i_gate(left_left))) + f = torch.exp(self.lnf(self.f_gate(left_left))) + o = torch.sigmoid(self.lno(self.o_gate(left_left))) + + ct_1 = self.ct_1 + + ct = f * ct_1 + i * v * k + ct = torch.mean(self.ln_c(ct), [0, 1], keepdim=True) + self.ct_1 = ct.detach() + + nt_1 = self.nt_1 + nt = f * nt_1 + i * k + nt = torch.mean(self.ln_n(nt), [0, 1], keepdim=True) + self.nt_1 = nt.detach() + + ht = o * ((ct * q) / torch.max(nt * q)) + # end mLSTM + ht = ht + + left = self.drop2(self.GN(ht + l_skip)) + + out = self.ln_out(left * right) + out = self.ln_proj(self.proj(out)) + + return out, None + + +class sLSTMblock(nn.Module): + """ + sLSTM block with convolutions, gated mechanisms, and projection layers. + + Parameters + ---------- + input_size : int + Size of the input features. + hidden_size : int + Size of the hidden state. + num_layers : int + Depth of block diagonal layers. + dropout : float, optional + Dropout probability (default is 0.2). + """ + + def __init__( + self, + input_size, + hidden_size, + num_layers, + bidirectional=None, + batch_first=None, + nonlinearity=F.silu, + dropout=0.2, + bias=True, + ): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.activation = nonlinearity + + self.drop = nn.Dropout(dropout) + + self.i_gate = BlockDiagonal( + in_features=self.input_size, + out_features=self.input_size, + num_blocks=num_layers, + bias=bias, + ) + self.f_gate = BlockDiagonal( + in_features=self.input_size, + out_features=self.input_size, + num_blocks=num_layers, + bias=bias, + ) + self.o_gate = BlockDiagonal( + in_features=self.input_size, + out_features=self.input_size, + num_blocks=num_layers, + bias=bias, + ) + self.z_gate = BlockDiagonal( + in_features=self.input_size, + out_features=self.input_size, + num_blocks=num_layers, + bias=bias, + ) + + self.ri_gate = BlockDiagonal( + self.input_size, self.input_size, num_layers, bias=False + ) + self.rf_gate = BlockDiagonal( + self.input_size, self.input_size, num_layers, bias=False + ) + self.ro_gate = BlockDiagonal( + self.input_size, self.input_size, num_layers, bias=False + ) + self.rz_gate = BlockDiagonal( + self.input_size, self.input_size, num_layers, bias=False + ) + + self.ln_i = nn.LayerNorm(self.input_size) + self.ln_f = nn.LayerNorm(self.input_size) + self.ln_o = nn.LayerNorm(self.input_size) + self.ln_z = nn.LayerNorm(self.input_size) + + self.GN = nn.LayerNorm(self.input_size) + self.ln_c = nn.LayerNorm(self.input_size) + self.ln_n = nn.LayerNorm(self.input_size) + self.ln_h = nn.LayerNorm(self.input_size) + + self.left_linear = nn.Linear(self.input_size, int(self.input_size * (4 / 3))) + self.right_linear = nn.Linear(self.input_size, int(self.input_size * (4 / 3))) + + self.ln_out = nn.LayerNorm(int(self.input_size * (4 / 3))) + + self.proj = nn.Linear(int(self.input_size * (4 / 3)), 
self.hidden_size) + + # Remove initial fixed-size states + self.ct_1 = None + self.nt_1 = None + self.ht_1 = None + self.mt_1 = None + + def init_states(self, batch_size, seq_length, device): + """ + Initialize the state tensors with the correct batch and sequence dimensions. + + Parameters + ---------- + batch_size : int + The batch size. + seq_length : int + The sequence length. + device : torch.device + The device to place the tensors on. + """ + self.nt_1 = torch.zeros(batch_size, seq_length, self.input_size, device=device) + self.ct_1 = torch.zeros(batch_size, seq_length, self.input_size, device=device) + self.ht_1 = torch.zeros(batch_size, seq_length, self.input_size, device=device) + self.mt_1 = torch.zeros(batch_size, seq_length, self.input_size, device=device) + + def forward(self, x): + """ + Forward pass through sLSTM block. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch, sequence_length, input_size). + + Returns + ------- + torch.Tensor + Output tensor of shape (batch, sequence_length, input_size). + """ + B, N, D = x.shape + device = x.device + + # Initialize states dynamically based on input shape + if self.ct_1 is None or self.nt_1.shape[0] != B or self.nt_1.shape[1] != N: + self.init_states(B, N, device) + + x = self.activation(x) + + # Start sLSTM operations + ht_1 = self.ht_1 + + i = torch.exp(self.ln_i(self.i_gate(x) + self.ri_gate(ht_1))) + f = torch.exp(self.ln_f(self.f_gate(x) + self.rf_gate(ht_1))) + + # Use expand_as to match the shapes of f and i for element-wise operations + m = torch.max(torch.log(f) + self.mt_1.expand_as(f), torch.log(i)) + i = torch.exp(torch.log(i) - m) + f = torch.exp(torch.log(f) + self.mt_1.expand_as(f) - m) + self.mt_1 = m.detach() + + o = torch.sigmoid(self.ln_o(self.o_gate(x) + self.ro_gate(ht_1))) + z = torch.tanh(self.ln_z(self.z_gate(x) + self.rz_gate(ht_1))) + + ct_1 = self.ct_1 + ct = f * ct_1 + i * z + ct = torch.mean(self.ln_c(ct), [0, 1], keepdim=True) + self.ct_1 = ct.detach() + + nt_1 = self.nt_1 + nt = f * nt_1 + i + nt = torch.mean(self.ln_n(nt), [0, 1], keepdim=True) + self.nt_1 = nt.detach() + + ht = o * (ct / nt) + ht = torch.mean(self.ln_h(ht), [0, 1], keepdim=True) + self.ht_1 = ht.detach() + + slstm_out = self.GN(ht) + + left = self.left_linear(slstm_out) + right = F.gelu(self.right_linear(slstm_out)) + + out = self.ln_out(left * right) + out = self.proj(out) + return out, None diff --git a/mambular/arch_utils/mamba_utils/mamba_arch.py b/mambular/arch_utils/mamba_utils/mamba_arch.py index 7175231..f971865 100644 --- a/mambular/arch_utils/mamba_utils/mamba_arch.py +++ b/mambular/arch_utils/mamba_utils/mamba_arch.py @@ -2,7 +2,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from ..normalization_layers import ( +from ..layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, @@ -387,7 +387,6 @@ def __init__( elif dt_init == "random": nn.init.uniform_(self.dt_proj_fwd.weight, -dt_init_std, dt_init_std) if self.bidirectional: - nn.init.uniform_(self.dt_proj_bwd.weight, -dt_init_std, dt_init_std) else: raise NotImplementedError @@ -422,7 +421,6 @@ def __init__( self.D_fwd._no_weight_decay = True if self.bidirectional: - if not AD_weight_decay: self.A_log_bwd._no_weight_decay = True self.D_bwd._no_weight_decay = True @@ -527,7 +525,6 @@ def selective_scan_seq(self, x, delta, A, B, C, D): if self.use_pscan: hs = self.pscan(deltaA, BX) else: - h = torch.zeros(x.size(0), self.d_inner, self.d_state, device=deltaA.device) hs = [] diff --git 
a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index a200e9c..b5e726a 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -1,7 +1,7 @@ import math import torch import torch.nn as nn -from ..normalization_layers import ( +from ..layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, diff --git a/mambular/arch_utils/neural_decision_tree.py b/mambular/arch_utils/neural_decision_tree.py new file mode 100644 index 0000000..d8f8a9f --- /dev/null +++ b/mambular/arch_utils/neural_decision_tree.py @@ -0,0 +1,184 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class NeuralDecisionTree(nn.Module): + def __init__( + self, + input_dim, + depth, + output_dim=1, + lamda=1e-3, + temperature=0.0, + node_sampling=0.3, + ): + """ + Initialize the neural decision tree with a neural network at each leaf. + + Parameters: + ----------- + input_dim: int + The number of input features. + depth: int + The depth of the tree. The number of leaves will be 2^depth. + output_dim: int + The number of output classes (default is 1 for regression tasks). + lamda: float + Regularization parameter. + """ + super(NeuralDecisionTree, self).__init__() + self.internal_node_num_ = 2**depth - 1 + self.leaf_node_num_ = 2**depth + self.lamda = lamda + self.depth = depth + self.temperature = temperature + self.node_sampling = node_sampling + + # Different penalty coefficients for nodes in different layers + self.penalty_list = [self.lamda * (2 ** (-d)) for d in range(0, depth)] + + # Initialize internal nodes with linear layers followed by hard thresholds + self.inner_nodes = nn.Sequential( + nn.Linear(input_dim + 1, self.internal_node_num_, bias=False), + ) + + self.leaf_nodes = nn.Linear(self.leaf_node_num_, output_dim, bias=False) + + def forward(self, X, return_penalty=False): + if return_penalty: + _mu, _penalty = self._penalty_forward(X) + else: + _mu = self._forward(X) + y_pred = self.leaf_nodes(_mu) + if return_penalty: + return y_pred, _penalty + else: + return y_pred + + def _penalty_forward(self, X): + """Implementation of the forward pass with hard decision boundaries.""" + batch_size = X.size()[0] + X = self._data_augment(X) + + # Get the decision boundaries for the internal nodes + decision_boundaries = self.inner_nodes(X) + + # Apply hard thresholding to simulate binary decisions + if self.temperature > 0.0: + # Replace sigmoid with Gumbel-Softmax for path_prob calculation + logits = decision_boundaries / self.temperature + path_prob = ( + (logits > 0).float() + logits.sigmoid() - logits.sigmoid().detach() + ) + else: + path_prob = (decision_boundaries > 0).float() + + # Prepare for routing at the internal nodes + path_prob = torch.unsqueeze(path_prob, dim=2) + path_prob = torch.cat((path_prob, 1 - path_prob), dim=2) + + _mu = X.data.new(batch_size, 1, 1).fill_(1.0) + _penalty = torch.tensor(0.0) + + # Iterate through internal odes in each layer to compute the final path + # probabilities and the regularization term. 
+ begin_idx = 0 + end_idx = 1 + + for layer_idx in range(0, self.depth): + _path_prob = path_prob[:, begin_idx:end_idx, :] + + # Extract internal nodes in the current layer to compute the + # regularization term + _penalty = _penalty + self._cal_penalty(layer_idx, _mu, _path_prob) + _mu = _mu.view(batch_size, -1, 1).repeat(1, 1, 2) + + _mu = _mu * _path_prob # update path probabilities + + begin_idx = end_idx + end_idx = begin_idx + 2 ** (layer_idx + 1) + + mu = _mu.view(batch_size, self.leaf_node_num_) + + return mu, _penalty + + def _forward(self, X): + """Implementation of the forward pass with hard decision boundaries.""" + batch_size = X.size()[0] + X = self._data_augment(X) + + # Get the decision boundaries for the internal nodes + decision_boundaries = self.inner_nodes(X) + + # Apply hard thresholding to simulate binary decisions + if self.temperature > 0.0: + # Replace sigmoid with Gumbel-Softmax for path_prob calculation + logits = decision_boundaries / self.temperature + path_prob = ( + (logits > 0).float() + logits.sigmoid() - logits.sigmoid().detach() + ) + else: + path_prob = (decision_boundaries > 0).float() + + # Prepare for routing at the internal nodes + path_prob = torch.unsqueeze(path_prob, dim=2) + path_prob = torch.cat((path_prob, 1 - path_prob), dim=2) + + _mu = X.data.new(batch_size, 1, 1).fill_(1.0) + + # Iterate through internal nodes in each layer to compute the final path + # probabilities and the regularization term. + begin_idx = 0 + end_idx = 1 + + for layer_idx in range(0, self.depth): + _path_prob = path_prob[:, begin_idx:end_idx, :] + + _mu = _mu.view(batch_size, -1, 1).repeat(1, 1, 2) + + _mu = _mu * _path_prob # update path probabilities + + begin_idx = end_idx + end_idx = begin_idx + 2 ** (layer_idx + 1) + + mu = _mu.view(batch_size, self.leaf_node_num_) + + return mu + + def _cal_penalty(self, layer_idx, _mu, _path_prob): + """ + Calculate the regularization penalty by sampling a fraction of nodes with safeguards against NaNs. 
+ """ + batch_size = _mu.size(0) + + # Reshape _mu and _path_prob for broadcasting + _mu = _mu.view(batch_size, 2**layer_idx) + _path_prob = _path_prob.view(batch_size, 2 ** (layer_idx + 1)) + + # Determine sample size + num_nodes = _path_prob.size(1) + sample_size = max(1, int(self.node_sampling * num_nodes)) + + # Randomly sample nodes for penalty calculation + indices = torch.randperm(num_nodes)[:sample_size] + sampled_path_prob = _path_prob[:, indices] + sampled_mu = _mu[:, indices // 2] + + # Calculate alpha in a batched manner + epsilon = 1e-6 # Small constant to prevent division by zero + alpha = torch.sum(sampled_path_prob * sampled_mu, dim=0) / ( + torch.sum(sampled_mu, dim=0) + epsilon + ) + + # Clip alpha to avoid NaNs in log calculation + alpha = alpha.clamp(epsilon, 1 - epsilon) + + # Calculate penalty with broadcasting + coeff = self.penalty_list[layer_idx] + penalty = -0.5 * coeff * (torch.log(alpha) + torch.log(1 - alpha)).sum() + + return penalty + + def _data_augment(self, X): + return F.pad(X, (1, 0), value=1) diff --git a/mambular/arch_utils/rnn_utils.py b/mambular/arch_utils/rnn_utils.py index e505043..03b5ab1 100644 --- a/mambular/arch_utils/rnn_utils.py +++ b/mambular/arch_utils/rnn_utils.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn +from .lstm_utils import mLSTMblock, sLSTMblock class ConvRNN(nn.Module): @@ -20,7 +21,13 @@ def __init__( super(ConvRNN, self).__init__() # Choose RNN layer based on model_type - rnn_layer = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}[model_type] + rnn_layer = { + "RNN": nn.RNN, + "LSTM": nn.LSTM, + "GRU": nn.GRU, + "mLSTM": mLSTMblock, + "sLSTM": sLSTMblock, + }[model_type] self.input_size = input_size # Number of input features (128 in your case) self.hidden_size = hidden_size # Number of hidden units in RNN @@ -31,6 +38,7 @@ def __init__( # Convolutional layers self.convs = nn.ModuleList() + self.layernorms_conv = nn.ModuleList() # LayerNorms for Conv layers if self.residuals: self.residual_matrix = nn.ParameterList( @@ -43,59 +51,62 @@ def __init__( # First Conv1d layer uses input_size self.convs.append( nn.Conv1d( - in_channels=self.input_size, # Input size for first layer - out_channels=self.input_size, # Output channels (128) + in_channels=self.input_size, + out_channels=self.input_size, kernel_size=d_conv, - padding=d_conv - 1, # Padding to maintain sequence length + padding=d_conv - 1, bias=conv_bias, - groups=self.input_size, # Depthwise convolution, each channel independent + groups=self.input_size, ) ) + self.layernorms_conv.append( + nn.LayerNorm(self.input_size) + ) # LayerNorm for first Conv layer # Subsequent Conv1d layers use hidden_size as input for i in range(self.num_layers - 1): self.convs.append( nn.Conv1d( - in_channels=self.hidden_size, # Hidden size for subsequent layers - out_channels=self.hidden_size, # Output channels + in_channels=self.hidden_size, + out_channels=self.hidden_size, kernel_size=d_conv, - padding=d_conv - 1, # Padding to maintain sequence length + padding=d_conv - 1, bias=conv_bias, - groups=self.hidden_size, # Depthwise convolution + groups=self.hidden_size, ) ) + self.layernorms_conv.append( + nn.LayerNorm(self.hidden_size) + ) # LayerNorm for Conv layers # Initialize the RNN layers self.rnns = nn.ModuleList() + self.layernorms_rnn = nn.ModuleList() # LayerNorms for RNN layers + for i in range(self.num_layers): - if model_type == "RNN": + if model_type in ["RNN"]: rnn = rnn_layer( - input_size=( - self.input_size if i == 0 else self.hidden_size - ), # First layer uses input_size + 
input_size=(self.input_size if i == 0 else self.hidden_size), hidden_size=self.hidden_size, - num_layers=1, # One RNN layer at a time + num_layers=1, bidirectional=self.bidirectional, batch_first=True, dropout=rnn_dropout if i < self.num_layers - 1 else 0, bias=bias, - nonlinearity=( - rnn_activation if model_type == "RNN" else None - ), # Only RNN uses nonlinearity + nonlinearity=rnn_activation, ) - else: # For LSTM or GRU + else: rnn = rnn_layer( - input_size=( - self.input_size if i == 0 else self.hidden_size - ), # First layer uses input_size + input_size=(self.input_size if i == 0 else self.hidden_size), hidden_size=self.hidden_size, - num_layers=1, # One RNN layer at a time + num_layers=1, bidirectional=self.bidirectional, batch_first=True, dropout=rnn_dropout if i < self.num_layers - 1 else 0, bias=bias, ) self.rnns.append(rnn) + self.layernorms_rnn.append(nn.LayerNorm(self.hidden_size)) def forward(self, x): """ @@ -118,6 +129,7 @@ def forward(self, x): # Loop through the RNN layers and apply 1D convolution before each for i in range(self.num_layers): # Transpose to (batch_size, input_size, seq_length) for Conv1d + x = self.layernorms_conv[i](x) x = x.transpose(1, 2) # Apply the 1D convolution From de77ed51110b4eab310ce6231a907649538110a8 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 10:47:12 +0000 Subject: [PATCH 043/132] Adjust new imports --- mambular/base_models/ft_transformer.py | 4 ++-- mambular/base_models/lightning_wrapper.py | 16 ++++++++++++---- mambular/base_models/mambatab.py | 2 +- mambular/base_models/mambattn.py | 2 +- mambular/base_models/mambular.py | 2 +- mambular/base_models/mlp.py | 4 ++-- mambular/base_models/resnet.py | 4 ++-- mambular/base_models/tabtransformer.py | 4 ++-- mambular/base_models/tabularnn.py | 4 ++-- 9 files changed, 25 insertions(+), 17 deletions(-) diff --git a/mambular/base_models/ft_transformer.py b/mambular/base_models/ft_transformer.py index ddbf03c..2af362b 100644 --- a/mambular/base_models/ft_transformer.py +++ b/mambular/base_models/ft_transformer.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn from ..arch_utils.mlp_utils import MLP -from ..arch_utils.normalization_layers import ( +from ..arch_utils.layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, @@ -9,7 +9,7 @@ InstanceNorm, GroupNorm, ) -from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.transformer_utils import CustomTransformerEncoderLayer from ..configs.fttransformer_config import DefaultFTTransformerConfig from .basemodel import BaseModel diff --git a/mambular/base_models/lightning_wrapper.py b/mambular/base_models/lightning_wrapper.py index 2bfc25b..276b797 100644 --- a/mambular/base_models/lightning_wrapper.py +++ b/mambular/base_models/lightning_wrapper.py @@ -147,7 +147,7 @@ def compute_loss(self, predictions, y_true): def training_step(self, batch, batch_idx): """ - Training step for a single batch. + Training step for a single batch, incorporating penalty if the model has a penalty_forward method. Parameters ---------- @@ -161,11 +161,19 @@ def training_step(self, batch, batch_idx): Tensor Training loss. 
""" - cat_features, num_features, labels = batch - preds = self(num_features=num_features, cat_features=cat_features) - loss = self.compute_loss(preds, labels) + # Check if the model has a `penalty_forward` method + if hasattr(self.base_model, "penalty_forward"): + preds, penalty = self.base_model.penalty_forward( + num_features=num_features, cat_features=cat_features + ) + loss = self.compute_loss(preds, labels) + penalty + else: + preds = self(num_features=num_features, cat_features=cat_features) + loss = self.compute_loss(preds, labels) + + # Log the training loss self.log( "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True ) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index f436590..2405adc 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -2,7 +2,7 @@ import torch.nn as nn from ..arch_utils.mamba_utils.mamba_arch import Mamba from ..arch_utils.mlp_utils import MLP -from ..arch_utils.normalization_layers import ( +from ..arch_utils.layer_utils.normalization_layers import ( LayerNorm, ) from ..configs.mambatab_config import DefaultMambaTabConfig diff --git a/mambular/base_models/mambattn.py b/mambular/base_models/mambattn.py index f96daf1..1241ae6 100644 --- a/mambular/base_models/mambattn.py +++ b/mambular/base_models/mambattn.py @@ -5,7 +5,7 @@ from ..arch_utils.get_norm_fn import get_normalization_layer from ..configs.mambattention_config import DefaultMambAttentionConfig from .basemodel import BaseModel -from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer class MambAttention(BaseModel): diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index 821e438..246ef1f 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -3,7 +3,7 @@ from ..arch_utils.mlp_utils import MLP from ..configs.mambular_config import DefaultMambularConfig from .basemodel import BaseModel -from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.mamba_utils.mamba_original import MambaOriginal diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index 4e50366..d9e24e3 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -2,7 +2,7 @@ import torch.nn as nn from ..configs.mlp_config import DefaultMLPConfig from .basemodel import BaseModel -from ..arch_utils.normalization_layers import ( +from ..arch_utils.layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, @@ -10,7 +10,7 @@ InstanceNorm, GroupNorm, ) -from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer class MLP(BaseModel): diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index 2b584d3..ec62bc2 100644 --- a/mambular/base_models/resnet.py +++ b/mambular/base_models/resnet.py @@ -3,7 +3,7 @@ from typing import Any from ..configs.resnet_config import DefaultResNetConfig from .basemodel import BaseModel -from ..arch_utils.normalization_layers import ( +from ..arch_utils.layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, @@ -12,7 +12,7 @@ GroupNorm, ) from ..arch_utils.resnet_utils import ResidualBlock -from ..arch_utils.embedding_layer import EmbeddingLayer +from 
..arch_utils.layer_utils.embedding_layer import EmbeddingLayer class ResNet(BaseModel): diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index 0e4f472..2229faa 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn from ..arch_utils.mlp_utils import MLP -from ..arch_utils.normalization_layers import ( +from ..arch_utils.layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, @@ -9,7 +9,7 @@ InstanceNorm, GroupNorm, ) -from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..configs.tabtransformer_config import DefaultTabTransformerConfig from .basemodel import BaseModel from ..arch_utils.transformer_utils import CustomTransformerEncoderLayer diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index 4433c7e..3cc5fc3 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -3,9 +3,9 @@ from ..arch_utils.mlp_utils import MLP from ..configs.tabularnn_config import DefaultTabulaRNNConfig from .basemodel import BaseModel -from ..arch_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.rnn_utils import ConvRNN -from ..arch_utils.normalization_layers import ( +from ..arch_utils.layer_utils.normalization_layers import ( RMSNorm, LayerNorm, LearnableLayerScaling, From 732283947cc444ab2a39fd28e89b1c0be14664cc Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 10:47:23 +0000 Subject: [PATCH 044/132] add neural Decision Forest base architecture --- mambular/base_models/ndtf.py | 146 +++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 mambular/base_models/ndtf.py diff --git a/mambular/base_models/ndtf.py b/mambular/base_models/ndtf.py new file mode 100644 index 0000000..fca2165 --- /dev/null +++ b/mambular/base_models/ndtf.py @@ -0,0 +1,146 @@ +import torch +import torch.nn as nn +from ..configs.ndtf_config import DefaultNDTFConfig +from .basemodel import BaseModel +from ..arch_utils.neural_decision_tree import NeuralDecisionTree +import numpy as np + + +class NDTF(BaseModel): + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes: int = 1, + config: DefaultNDTFConfig = DefaultNDTFConfig(), + **kwargs, + ): + """ + Initializes the NDTF model with the given configuration. + + Parameters + ---------- + cat_feature_info : Any + Information about categorical features. + num_feature_info : Any + Information about numerical features. + + num_classes : int, optional + Number of output classes, by default 1. + config : DefaultNDTFConfig, optional + Configuration dataclass containing hyperparameters, by default DefaultNDTFConfig(). 
+ """ + super().__init__(**kwargs) + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.cat_feature_info = cat_feature_info + self.num_feature_info = num_feature_info + self.penalty_factor = config.penalty_factor + + input_dim = 0 + for feature_name, input_shape in num_feature_info.items(): + input_dim += input_shape + for feature_name, input_shape in cat_feature_info.items(): + input_dim += 1 + + self.input_dimensions = [input_dim] + + for _ in range(config.n_ensembles - 1): + self.input_dimensions.append(np.random.randint(1, input_dim)) + + self.trees = nn.ModuleList( + [ + NeuralDecisionTree( + input_dim=self.input_dimensions[idx], + depth=np.random.randint(config.min_depth, config.max_depth), + output_dim=num_classes, + lamda=config.lamda, + temperature=config.temperature + np.abs(np.random.normal(0, 0.1)), + node_sampling=config.node_sampling, + ) + for idx in range(config.n_ensembles) + ] + ) + + self.conv_layer = nn.Conv1d( + in_channels=self.input_dimensions[0], + out_channels=1, # Single channel output if one feature interaction is desired + kernel_size=self.input_dimensions[0], # Choose appropriate kernel size + padding=self.input_dimensions[0] + - 1, # To keep output size the same as input_dim if desired + bias=True, + ) + + self.tree_weights = nn.Parameter( + torch.full((config.n_ensembles, 1), 1.0 / config.n_ensembles), + requires_grad=True, + ) + + def forward(self, num_features, cat_features) -> torch.Tensor: + """ + Forward pass of the NDTF model. + + Parameters + ---------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Output tensor. + """ + x = num_features + cat_features + x = torch.cat(x, dim=1) + x = self.conv_layer(x.unsqueeze(2)) + x = x.transpose(1, 2).squeeze(-1) + + preds = [] + + for idx, tree in enumerate(self.trees): + tree_input = x[:, : self.input_dimensions[idx]] + preds.append(tree(tree_input, return_penalty=False)) + + preds = torch.stack(preds, dim=1).squeeze(-1) + + return preds @ self.tree_weights + + def penalty_forward(self, num_features, cat_features) -> torch.Tensor: + """ + Forward pass of the NDTF model. + + Parameters + ---------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Output tensor. 
+ """ + x = num_features + cat_features + x = torch.cat(x, dim=1) + x = self.conv_layer(x.unsqueeze(2)) + x = x.transpose(1, 2).squeeze(-1) + + penalty = 0.0 + preds = [] + + # Iterate over trees and collect predictions and penalties + for idx, tree in enumerate(self.trees): + # Select subset of features for the current tree + tree_input = x[:, : self.input_dimensions[idx]] + + # Get prediction and penalty from the current tree + pred, pen = tree(tree_input, return_penalty=True) + preds.append(pred) + penalty += pen + + # Stack predictions and calculate mean across trees + preds = torch.stack(preds, dim=1).squeeze(-1) + return preds @ self.tree_weights, self.penalty_factor * penalty From 3f25fd6f549ac6d5a388a514549fa135c4ec14c3 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 10:47:38 +0000 Subject: [PATCH 045/132] add ndtf to new models in __init__ --- mambular/models/__init__.py | 5 + mambular/models/ndtf.py | 255 ++++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100644 mambular/models/ndtf.py diff --git a/mambular/models/__init__.py b/mambular/models/__init__.py index 720d264..fc7d27f 100644 --- a/mambular/models/__init__.py +++ b/mambular/models/__init__.py @@ -23,6 +23,8 @@ MambAttentionLSS, ) +from .ndtf import NDTFClassifier, NDTFRegressor, NDTFLSS + __all__ = [ "MambularClassifier", @@ -52,4 +54,7 @@ "MambAttentionClassifier", "MambAttentionRegressor", "MambAttentionLSS", + "NDTFClassifier", + "NDTFRegressor", + "NDTFLSS", ] diff --git a/mambular/models/ndtf.py b/mambular/models/ndtf.py new file mode 100644 index 0000000..851118b --- /dev/null +++ b/mambular/models/ndtf.py @@ -0,0 +1,255 @@ +from .sklearn_base_regressor import SklearnBaseRegressor +from .sklearn_base_classifier import SklearnBaseClassifier +from .sklearn_base_lss import SklearnBaseLSS +from ..base_models.ndtf import NDTF +from ..configs.ndtf_config import DefaultNDTFConfig + + +class NDTFRegressor(SklearnBaseRegressor): + """ + Multi-Layer Perceptron regressor. This class extends the SklearnBaseRegressor class and uses the NDTF model + with the default NDTF configuration. + + The accepted arguments to the NDTFRegressor class include both the attributes in the DefaultNDTFConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + family : str, default=None + Distributional family to be used for the model. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + min_depth : int, default=2 + Minimum depth of trees in the forest. Controls the simplest model structure. + max_depth : int, default=10 + Maximum depth of trees in the forest. Controls the maximum complexity of the trees. + temperature : float, default=0.1 + Temperature parameter for softening the node decisions during path probability calculation. + node_sampling : float, default=0.3 + Fraction of nodes sampled for regularization penalty calculation. Reduces computation by focusing on a subset of nodes. + lamda : float, default=0.3 + Regularization parameter to control the complexity of the paths, penalizing overconfident or imbalanced paths. + n_ensembles : int, default=12 + Number of trees in the forest + n_bins : int, default=50 + The number of bins to use for numerical feature binning. 
This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task : str, default="regression" + Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning as ple. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + + + Notes + ----- + - The accepted arguments to the NDTFRegressor class are the same as the attributes in the DefaultNDTFConfig dataclass. + - NDTFRegressor uses SklearnBaseRegressor as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseRegressor : The parent class for NDTFRegressor. + + Examples + -------- + >>> from mambular.models import NDTFRegressor + >>> model = NDTFRegressor(n_ensembles=12, max_depth=10) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=NDTF, config=DefaultNDTFConfig, **kwargs) + + +class NDTFClassifier(SklearnBaseClassifier): + """ + Neural Decision Tree Forest (NDTF) classifier. This class extends the SklearnBaseClassifier class and uses the NDTF model + with the default NDTF configuration. + + The accepted arguments to the NDTFClassifier class include both the attributes in the DefaultNDTFConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + family : str, default=None + Distributional family to be used for the model. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + min_depth : int, default=2 + Minimum depth of trees in the forest. Controls the simplest model structure. + max_depth : int, default=10 + Maximum depth of trees in the forest. Controls the maximum complexity of the trees.
+ temperature : float, default=0.1 + Temperature parameter for softening the node decisions during path probability calculation. + node_sampling : float, default=0.3 + Fraction of nodes sampled for regularization penalty calculation. Reduces computation by focusing on a subset of nodes. + lamda : float, default=0.3 + Regularization parameter to control the complexity of the paths, penalizing overconfident or imbalanced paths. + n_ensembles : int, default=12 + Number of trees in the forest. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task : str, default="regression" + Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning as ple. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + + + Notes + ----- + - The accepted arguments to the NDTFClassifier class are the same as the attributes in the DefaultNDTFConfig dataclass. + - NDTFClassifier uses SklearnBaseClassifier as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseClassifier : The parent class for NDTFClassifier. + + Examples + -------- + >>> from mambular.models import NDTFClassifier + >>> model = NDTFClassifier(n_ensembles=12, max_depth=10) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=NDTF, config=DefaultNDTFConfig, **kwargs) + + +class NDTFLSS(SklearnBaseLSS): + """ + Neural Decision Tree Forest (NDTF) for distributional regression. This class extends the SklearnBaseLSS class and uses the NDTF model + with the default NDTF configuration. + + The accepted arguments to the NDTFLSS class include both the attributes in the DefaultNDTFConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced.
+ family : str, default=None + Distributional family to be used for the model. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + min_depth : int, default=2 + Minimum depth of trees in the forest. Controls the simplest model structure. + max_depth : int, default=10 + Maximum depth of trees in the forest. Controls the maximum complexity of the trees. + temperature : float, default=0.1 + Temperature parameter for softening the node decisions during path probability calculation. + node_sampling : float, default=0.3 + Fraction of nodes sampled for regularization penalty calculation. Reduces computation by focusing on a subset of nodes. + lamda : float, default=0.3 + Regularization parameter to control the complexity of the paths, penalizing overconfident or imbalanced paths. + n_ensembles : int, default=12 + Number of trees in the forest + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task : str, default="regression" + Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning as ple. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the NDTFLSS class are the same as the attributes in the DefaultNDTFConfig dataclass. + - NDTFLSS uses SklearnBaseLSS as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseLSS : The parent class for NDTFLSS. 
+ + Examples + -------- + >>> from mambular.models import NDTFLSS + >>> model = NDTFLSS(layer_sizes=[128, 128, 64], activation=nn.ReLU()) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=NDTF, config=DefaultNDTFConfig, **kwargs) From 385f2dd1d8fb3a333910ea662eb6d527e76606af Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 10:48:02 +0000 Subject: [PATCH 046/132] add new configs. include mLSTM/sLSTM in rnn config --- mambular/configs/ndtf_config.py | 46 ++++++++++++++++++++++++++++ mambular/configs/tabularnn_config.py | 2 +- 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 mambular/configs/ndtf_config.py diff --git a/mambular/configs/ndtf_config.py b/mambular/configs/ndtf_config.py new file mode 100644 index 0000000..ba3c675 --- /dev/null +++ b/mambular/configs/ndtf_config.py @@ -0,0 +1,46 @@ +from dataclasses import dataclass +import torch.nn as nn + + +@dataclass +class DefaultNDTFConfig: + """ + Configuration class for the default Neural Decision Tree Forest (NDTF) model with predefined hyperparameters. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which the learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) applied to the model's weights during optimization. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced when a plateau is reached. + min_depth : int, default=2 + Minimum depth of trees in the forest. Controls the simplest model structure. + max_depth : int, default=10 + Maximum depth of trees in the forest. Controls the maximum complexity of the trees. + temperature : float, default=0.1 + Temperature parameter for softening the node decisions during path probability calculation. + node_sampling : float, default=0.3 + Fraction of nodes sampled for regularization penalty calculation. Reduces computation by focusing on a subset of nodes. + lamda : float, default=0.3 + Regularization parameter to control the complexity of the paths, penalizing overconfident or imbalanced paths. + n_ensembles : int, default=12 + Number of trees in the forest + penalty_factor : float, default=0.01 + Factor with which the penalty is multiplied + """ + + lr: float = 1e-4 + lr_patience: int = 5 + weight_decay: float = 1e-7 + lr_factor: float = 0.1 + min_depth: int = 4 + max_depth: int = 16 + temperature: float = 0.1 + node_sampling: float = 0.3 + lamda: float = 0.3 + n_ensembles: int = 12 + penalty_factor: float = 1e-08 diff --git a/mambular/configs/tabularnn_config.py b/mambular/configs/tabularnn_config.py index 9ca6636..cedc885 100644 --- a/mambular/configs/tabularnn_config.py +++ b/mambular/configs/tabularnn_config.py @@ -12,7 +12,7 @@ class DefaultTabulaRNNConfig: lr : float, default=1e-04 Learning rate for the optimizer. model_type : str, default="RNN" - type of model, one of "RNN", "LSTM", "GRU" + type of model, one of "RNN", "LSTM", "GRU", "mLSTM", "sLSTM" lr_patience : int, default=10 Number of epochs with no improvement after which learning rate will be reduced. 
weight_decay : float, default=1e-06 From bdac22be88f1672acaa5e97f81b026bbf796e0bc Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 10:50:15 +0000 Subject: [PATCH 047/132] add ntdf config in init --- mambular/configs/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mambular/configs/__init__.py b/mambular/configs/__init__.py index a990e05..31ee342 100644 --- a/mambular/configs/__init__.py +++ b/mambular/configs/__init__.py @@ -6,6 +6,7 @@ from .mambatab_config import DefaultMambaTabConfig from .tabularnn_config import DefaultTabulaRNNConfig from .mambattention_config import DefaultMambAttentionConfig +from .ndtf_config import DefaultNDTFConfig __all__ = [ @@ -17,4 +18,5 @@ "DefaultMambaTabConfig", "DefaultTabulaRNNConfig", "DefaultMambAttentionConfig", + "DefaultNDTFConfig", ] From 9eb5d421b071f50b27db3ed0cd0e4105231deff0 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:51:30 +0000 Subject: [PATCH 048/132] add sparsemax --- mambular/arch_utils/layer_utils/sparsemax.py | 117 +++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 mambular/arch_utils/layer_utils/sparsemax.py diff --git a/mambular/arch_utils/layer_utils/sparsemax.py b/mambular/arch_utils/layer_utils/sparsemax.py new file mode 100644 index 0000000..cfcc00f --- /dev/null +++ b/mambular/arch_utils/layer_utils/sparsemax.py @@ -0,0 +1,117 @@ +import torch +from torch.autograd import Function + + +def _make_ix_like(input, dim=0): + """ + Creates a tensor of indices like the input tensor along the specified dimension. + + Parameters + ---------- + input : torch.Tensor + Input tensor whose shape will be used to determine the shape of the output tensor. + dim : int, optional + Dimension along which to create the index tensor. Default is 0. + + Returns + ------- + torch.Tensor + A tensor containing indices along the specified dimension. + """ + d = input.size(dim) + rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) + view = [1] * input.dim() + view[0] = -1 + return rho.view(view).transpose(0, dim) + + +class SparsemaxFunction(Function): + """ + Implements the sparsemax function, a sparse alternative to softmax. + + References + ---------- + Martins, A. F., & Astudillo, R. F. (2016). "From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label Classification." + """ + + @staticmethod + def forward(ctx, input, dim=-1): + """ + Forward pass of sparsemax: a normalizing, sparse transformation. + + Parameters + ---------- + input : torch.Tensor + The input tensor on which sparsemax will be applied. + dim : int, optional + Dimension along which to apply sparsemax. Default is -1. + + Returns + ------- + torch.Tensor + A tensor with the same shape as the input, with sparsemax applied. + """ + ctx.dim = dim + max_val, _ = input.max(dim=dim, keepdim=True) + input -= max_val # Numerical stability trick, as with softmax. + tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) + output = torch.clamp(input - tau, min=0) + ctx.save_for_backward(supp_size, output) + return output + + @staticmethod + def backward(ctx, grad_output): + """ + Backward pass of sparsemax, calculating gradients. + + Parameters + ---------- + grad_output : torch.Tensor + Gradient of the loss with respect to the output of sparsemax. + + Returns + ------- + tuple + Gradients of the loss with respect to the input of sparsemax and None for the dimension argument. 
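+
+        Notes
+        -----
+        On the support S = {j : output_j > 0}, the sparsemax Jacobian-vector
+        product reduces to grad_j minus the mean of grad over S, and is zero
+        outside the support; the masked mean-subtraction below computes exactly
+        this.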
+ """ + supp_size, output = ctx.saved_tensors + dim = ctx.dim + grad_input = grad_output.clone() + grad_input[output == 0] = 0 + + v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() + v_hat = v_hat.unsqueeze(dim) + grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) + return grad_input, None + + @staticmethod + def _threshold_and_support(input, dim=-1): + """ + Computes the threshold and support for sparsemax. + + Parameters + ---------- + input : torch.Tensor + The input tensor on which to compute the threshold and support. + dim : int, optional + Dimension along which to compute the threshold and support. Default is -1. + + Returns + ------- + tuple + - torch.Tensor : The threshold value for sparsemax. + - torch.Tensor : The support size tensor. + """ + input_srt, _ = torch.sort(input, descending=True, dim=dim) + input_cumsum = input_srt.cumsum(dim) - 1 + rhos = _make_ix_like(input, dim) + support = rhos * input_srt > input_cumsum + + support_size = support.sum(dim=dim).unsqueeze(dim) + tau = input_cumsum.gather(dim, support_size - 1) + tau /= support_size.to(input.dtype) + return tau, support_size + + +sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) +sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) From ab3abbf87ce92b2fc9e833a56eada1b3704df9d5 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:51:46 +0000 Subject: [PATCH 049/132] data-aware initialization module --- .../arch_utils/data_aware_initialization.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 mambular/arch_utils/data_aware_initialization.py diff --git a/mambular/arch_utils/data_aware_initialization.py b/mambular/arch_utils/data_aware_initialization.py new file mode 100644 index 0000000..00e58a7 --- /dev/null +++ b/mambular/arch_utils/data_aware_initialization.py @@ -0,0 +1,29 @@ +import torch.nn as nn +import torch + + +class ModuleWithInit(nn.Module): + """Base class for pytorch module with data-aware initializer on first batch + See https://github.com/yandex-research/rtdl-revisiting-models/tree/main/lib/node + + Helps to avoid nans in feature logits before being passed to sparsemax""" + + def __init__(self): + super().__init__() + self._is_initialized_tensor = nn.Parameter( + torch.tensor(0, dtype=torch.uint8), requires_grad=False + ) + self._is_initialized_bool = None + + def initialize(self, *args, **kwargs): + """initialize module tensors using first batch of data""" + raise NotImplementedError("Please implement ") + + def __call__(self, *args, **kwargs): + if self._is_initialized_bool is None: + self._is_initialized_bool = bool(self._is_initialized_tensor.item()) + if not self._is_initialized_bool: + self.initialize(*args, **kwargs) + self._is_initialized_tensor.data[...] 
= 1 + self._is_initialized_bool = True + return super().__call__(*args, **kwargs) From 473db6be67ef6573764aeeb07b8ac8c9b19fe0ae Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:52:01 +0000 Subject: [PATCH 050/132] utils func for checking if tensor or np.array --- mambular/arch_utils/numpy_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 mambular/arch_utils/numpy_utils.py diff --git a/mambular/arch_utils/numpy_utils.py b/mambular/arch_utils/numpy_utils.py new file mode 100644 index 0000000..82098dc --- /dev/null +++ b/mambular/arch_utils/numpy_utils.py @@ -0,0 +1,11 @@ +import torch +import numpy as np + + +def check_numpy(x): + """Makes sure x is a numpy array""" + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + x = np.asarray(x) + assert isinstance(x, np.ndarray) + return x From e59bfcb35f3f29a615cbefee65dadccedfb4bc63 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:52:10 +0000 Subject: [PATCH 051/132] add ODST and DenseBlock --- mambular/arch_utils/node_utils.py | 370 ++++++++++++++++++++++++++++++ 1 file changed, 370 insertions(+) create mode 100644 mambular/arch_utils/node_utils.py diff --git a/mambular/arch_utils/node_utils.py b/mambular/arch_utils/node_utils.py new file mode 100644 index 0000000..03fc2e6 --- /dev/null +++ b/mambular/arch_utils/node_utils.py @@ -0,0 +1,370 @@ +# Source: https://github.com/Qwicen/node +from warnings import warn + +import numpy as np +import torch +import torch.nn as nn +from .layer_utils.sparsemax import sparsemax, sparsemoid +import torch.functional as F +from .data_aware_initialization import ModuleWithInit +from .numpy_utils import check_numpy + + +class ODST(ModuleWithInit): + def __init__( + self, + in_features, + num_trees, + depth=6, + tree_dim=1, + flatten_output=True, + choice_function=sparsemax, + bin_function=sparsemoid, + initialize_response_=nn.init.normal_, + initialize_selection_logits_=nn.init.uniform_, + threshold_init_beta=1.0, + threshold_init_cutoff=1.0, + ): + """ + Oblivious Differentiable Sparsemax Trees (ODST). + + ODST is a differentiable module for decision tree-based models, where each tree + is trained using sparsemax to compute feature weights and sparsemoid to compute + binary leaf weights. This class is designed as a drop-in replacement for `nn.Linear` layers. + + Parameters + ---------- + in_features : int + Number of features in the input tensor. + num_trees : int + Number of trees in this layer. + depth : int, optional + Number of splits (depth) in each tree. Default is 6. + tree_dim : int, optional + Number of output channels for each tree's response. Default is 1. + flatten_output : bool, optional + If True, returns output in a flattened shape of [..., num_trees * tree_dim]; + otherwise returns [..., num_trees, tree_dim]. Default is True. + choice_function : callable, optional + Function that computes feature weights as a simplex, such that + `choice_function(tensor, dim).sum(dim) == 1`. Default is `sparsemax`. + bin_function : callable, optional + Function that computes tree leaf weights as values in the range [0, 1]. + Default is `sparsemoid`. + initialize_response_ : callable, optional + In-place initializer for the response tensor in each tree. Default is `nn.init.normal_`. + initialize_selection_logits_ : callable, optional + In-place initializer for the feature selection logits. Default is `nn.init.uniform_`. + threshold_init_beta : float, optional + Initializes thresholds based on quantiles of the data using a Beta distribution. 
+ Controls the initial threshold distribution; values > 1 make thresholds closer to the median. + Default is 1.0. + threshold_init_cutoff : float, optional + Initializer for log-temperatures, with values > 1.0 adding margin between data points + and sparse-sigmoid cutoffs. Default is 1.0. + + Attributes + ---------- + response : torch.nn.Parameter + Parameter for tree responses. + feature_selection_logits : torch.nn.Parameter + Logits that select features for the trees. + feature_thresholds : torch.nn.Parameter + Threshold values for feature splits in the trees. + log_temperatures : torch.nn.Parameter + Log-temperatures for threshold adjustments. + bin_codes_1hot : torch.nn.Parameter + One-hot encoded binary codes for leaf mapping. + + Methods + ------- + forward(input) + Forward pass through the ODST model. + initialize(input, eps=1e-6) + Data-aware initialization of thresholds and log-temperatures based on input data. + """ + + super().__init__() + self.depth, self.num_trees, self.tree_dim, self.flatten_output = ( + depth, + num_trees, + tree_dim, + flatten_output, + ) + self.choice_function, self.bin_function = choice_function, bin_function + self.threshold_init_beta, self.threshold_init_cutoff = ( + threshold_init_beta, + threshold_init_cutoff, + ) + + self.response = nn.Parameter( + torch.zeros([num_trees, tree_dim, 2**depth]), requires_grad=True + ) + initialize_response_(self.response) + + self.feature_selection_logits = nn.Parameter( + torch.zeros([in_features, num_trees, depth]), requires_grad=True + ) + initialize_selection_logits_(self.feature_selection_logits) + + self.feature_thresholds = nn.Parameter( + torch.full([num_trees, depth], float("nan"), dtype=torch.float32), + requires_grad=True, + ) # nan values will be initialized on first batch (data-aware init) + + self.log_temperatures = nn.Parameter( + torch.full([num_trees, depth], float("nan"), dtype=torch.float32), + requires_grad=True, + ) + + # binary codes for mapping between 1-hot vectors and bin indices + with torch.no_grad(): + indices = torch.arange(2**self.depth) + offsets = 2 ** torch.arange(self.depth) + bin_codes = (indices.view(1, -1) // offsets.view(-1, 1) % 2).to( + torch.float32 + ) + bin_codes_1hot = torch.stack([bin_codes, 1.0 - bin_codes], dim=-1) + self.bin_codes_1hot = nn.Parameter(bin_codes_1hot, requires_grad=False) + # ^-- [depth, 2 ** depth, 2] + + def forward(self, input): + """ + Forward pass through ODST model. + + Parameters + ---------- + input : torch.Tensor + Input tensor of shape [batch_size, in_features] or higher dimensions. + + Returns + ------- + torch.Tensor + Output tensor of shape [batch_size, num_trees * tree_dim] if `flatten_output` is True, + otherwise [batch_size, num_trees, tree_dim]. 
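+
+        Examples
+        --------
+        Illustrative shape check with random data (sizes here are arbitrary):
+
+        >>> import torch
+        >>> odst = ODST(in_features=10, num_trees=4, depth=3, tree_dim=2)
+        >>> x = torch.randn(2048, 10)
+        >>> odst(x).shape  # data-aware init runs automatically on the first call
+        torch.Size([2048, 8])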
+ """ + assert len(input.shape) >= 2 + if len(input.shape) > 2: + return self.forward(input.view(-1, input.shape[-1])).view( + *input.shape[:-1], -1 + ) + # new input shape: [batch_size, in_features] + + feature_logits = self.feature_selection_logits + feature_selectors = self.choice_function(feature_logits, dim=0) + # ^--[in_features, num_trees, depth] + + feature_values = torch.einsum("bi,ind->bnd", input, feature_selectors) + # ^--[batch_size, num_trees, depth] + + threshold_logits = (feature_values - self.feature_thresholds) * torch.exp( + -self.log_temperatures + ) + + threshold_logits = torch.stack([-threshold_logits, threshold_logits], dim=-1) + # ^--[batch_size, num_trees, depth, 2] + + bins = self.bin_function(threshold_logits) + # ^--[batch_size, num_trees, depth, 2], approximately binary + + bin_matches = torch.einsum("btds,dcs->btdc", bins, self.bin_codes_1hot) + # ^--[batch_size, num_trees, depth, 2 ** depth] + + response_weights = torch.prod(bin_matches, dim=-2) + # ^-- [batch_size, num_trees, 2 ** depth] + + response = torch.einsum("bnd,ncd->bnc", response_weights, self.response) + # ^-- [batch_size, num_trees, tree_dim] + + return response.flatten(1, 2) if self.flatten_output else response + + def initialize(self, input, eps=1e-6): + """ + Data-aware initialization of thresholds and log-temperatures based on input data. + + Parameters + ---------- + input : torch.Tensor + Tensor of shape [batch_size, in_features] used for threshold initialization. + eps : float, optional + Small value added to avoid log(0) errors in temperature initialization. Default is 1e-6. + """ + # data-aware initializer + assert len(input.shape) == 2 + if input.shape[0] < 1000: + warn( + "Data-aware initialization is performed on less than 1000 data points. This may cause instability." + "To avoid potential problems, run this model on a data batch with at least 1000 data samples." + "You can do so manually before training. Use with torch.no_grad() for memory efficiency." + ) + with torch.no_grad(): + feature_selectors = self.choice_function( + self.feature_selection_logits, dim=0 + ) + # ^--[in_features, num_trees, depth] + + feature_values = torch.einsum("bi,ind->bnd", input, feature_selectors) + # ^--[batch_size, num_trees, depth] + + # initialize thresholds: sample random percentiles of data + percentiles_q = 100 * np.random.beta( + self.threshold_init_beta, + self.threshold_init_beta, + size=[self.num_trees, self.depth], + ) + self.feature_thresholds.data[...] = torch.as_tensor( + list( + map( + np.percentile, + check_numpy(feature_values.flatten(1, 2).t()), + percentiles_q.flatten(), + ) + ), + dtype=feature_values.dtype, + device=feature_values.device, + ).view(self.num_trees, self.depth) + + # init temperatures: make sure enough data points are in the linear region of sparse-sigmoid + temperatures = np.percentile( + check_numpy(abs(feature_values - self.feature_thresholds)), + q=100 * min(1.0, self.threshold_init_cutoff), + axis=0, + ) + + # if threshold_init_cutoff > 1, scale everything down by it + temperatures /= max(1.0, self.threshold_init_cutoff) + self.log_temperatures.data[...] 
= torch.log( + torch.as_tensor(temperatures) + eps + ) + + def __repr__(self): + return "{}(in_features={}, num_trees={}, depth={}, tree_dim={}, flatten_output={})".format( + self.__class__.__name__, + self.feature_selection_logits.shape[0], + self.num_trees, + self.depth, + self.tree_dim, + self.flatten_output, + ) + + +class DenseBlock(nn.Sequential): + """ + DenseBlock is a multi-layer module that sequentially stacks instances of `Module`, + typically decision tree models like `ODST`. Each layer in the block produces additional + features, enabling the model to learn complex representations. + + Parameters + ---------- + input_dim : int + Dimensionality of the input features. + layer_dim : int + Dimensionality of each layer in the block. + num_layers : int + Number of layers to stack in the block. + tree_dim : int, optional + Dimensionality of the output channels from each tree. Default is 1. + max_features : int, optional + Maximum dimensionality for feature expansion. If None, feature expansion is unrestricted. + Default is None. + input_dropout : float, optional + Dropout rate applied to the input features of each layer during training. Default is 0.0. + flatten_output : bool, optional + If True, flattens the output along the tree dimension. Default is True. + Module : nn.Module, optional + Module class to use for each layer in the block, typically a decision tree model. + Default is `ODST`. + **kwargs : dict + Additional keyword arguments for the `Module` instances. + + Attributes + ---------- + num_layers : int + Number of layers in the block. + layer_dim : int + Dimensionality of each layer. + tree_dim : int + Dimensionality of each tree's output in the layer. + max_features : int or None + Maximum feature dimensionality allowed for expansion. + flatten_output : bool + Determines whether to flatten the output. + input_dropout : float + Dropout rate applied to each layer's input. + + Methods + ------- + forward(x) + Performs the forward pass through the block, producing feature-expanded outputs. + """ + + def __init__( + self, + input_dim, + layer_dim, + num_layers, + tree_dim=1, + max_features=None, + input_dropout=0.0, + flatten_output=True, + Module=ODST, + **kwargs + ): + layers = [] + for i in range(num_layers): + oddt = Module( + input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs + ) + input_dim = min( + input_dim + layer_dim * tree_dim, max_features or float("inf") + ) + layers.append(oddt) + + super().__init__(*layers) + self.num_layers, self.layer_dim, self.tree_dim = num_layers, layer_dim, tree_dim + self.max_features, self.flatten_output = max_features, flatten_output + self.input_dropout = input_dropout + + def forward(self, x): + """ + Forward pass through the DenseBlock. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape [batch_size, input_dim] or higher dimensions. + + Returns + ------- + torch.Tensor + Output tensor with expanded features, where shape depends on `flatten_output`. + If `flatten_output` is True, returns tensor of shape + [..., num_layers * layer_dim * tree_dim]. + Otherwise, returns [..., num_layers * layer_dim, tree_dim]. 
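+
+        Examples
+        --------
+        Illustrative shape check with random data (sizes here are arbitrary):
+
+        >>> import torch
+        >>> block = DenseBlock(input_dim=10, layer_dim=8, num_layers=3, tree_dim=2, depth=3)
+        >>> x = torch.randn(2048, 10)
+        >>> block(x).shape  # 3 layers x 8 trees x 2 outputs per tree
+        torch.Size([2048, 48])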
+ """ + initial_features = x.shape[-1] + for layer in self: + layer_inp = x + if self.max_features is not None: + tail_features = ( + min(self.max_features, layer_inp.shape[-1]) - initial_features + ) + if tail_features != 0: + layer_inp = torch.cat( + [ + layer_inp[..., :initial_features], + layer_inp[..., -tail_features:], + ], + dim=-1, + ) + if self.training and self.input_dropout: + layer_inp = F.dropout(layer_inp, self.input_dropout) + h = layer(layer_inp) + x = torch.cat([x, h], dim=-1) + + outputs = x[..., initial_features:] + if not self.flatten_output: + outputs = outputs.view( + *outputs.shape[:-1], self.num_layers * self.layer_dim, self.tree_dim + ) + return outputs From 8d362dfbb32b4df4f445dab0c478cf9cc6e81f50 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:52:23 +0000 Subject: [PATCH 052/132] add node into basemodels - includes tabular MLP head --- mambular/base_models/node.py | 159 +++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 mambular/base_models/node.py diff --git a/mambular/base_models/node.py b/mambular/base_models/node.py new file mode 100644 index 0000000..e61a817 --- /dev/null +++ b/mambular/base_models/node.py @@ -0,0 +1,159 @@ +from .basemodel import BaseModel +from ..configs.node_config import DefaultNODEConfig +import torch +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.node_utils import DenseBlock +from ..arch_utils.mlp_utils import MLP + + +class NODE(BaseModel): + """ + Neural Oblivious Decision Ensemble (NODE) Model. Slightly different with a MLP as a tabular task specific head. + + NODE is a neural decision tree model that processes both categorical and numerical features. + This class combines embedding layers, a dense decision tree block, and an MLP head for tabular + data prediction tasks. + + Parameters + ---------- + cat_feature_info : dict + Dictionary mapping categorical feature names to their input shapes. + num_feature_info : dict + Dictionary mapping numerical feature names to their input shapes. + num_classes : int, optional + Number of output classes. Default is 1. + config : DefaultNODEConfig, optional + Configuration object that holds model hyperparameters. Default is `DefaultNODEConfig`. + **kwargs : dict + Additional arguments for the base model. + + Attributes + ---------- + lr : float + Learning rate for the optimizer. + lr_patience : int + Number of epochs without improvement before reducing the learning rate. + weight_decay : float + Weight decay factor for regularization. + lr_factor : float + Factor by which to reduce the learning rate. + cat_feature_info : dict + Information about categorical features. + num_feature_info : dict + Information about numerical features. + use_embeddings : bool + Whether to use embeddings for categorical and numerical features. + embedding_layer : EmbeddingLayer, optional + Embedding layer for feature transformation. + d_out : int + Output dimensionality. + block : DenseBlock + DenseBlock layer that implements the decision tree ensemble. + tabular_head : MLP + MLP layer that serves as the output head of the model. + + Methods + ------- + forward(num_features, cat_features) + Performs the forward pass, processing numerical and categorical features to produce predictions. 
+ """ + + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes: int = 1, + config: DefaultNODEConfig = DefaultNODEConfig(), + **kwargs, + ): + super().__init__(**kwargs) + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.cat_feature_info = cat_feature_info + self.num_feature_info = num_feature_info + self.use_embeddings = self.hparams.get("use_embeddings", config.use_embeddings) + + input_dim = 0 + for feature_name, input_shape in num_feature_info.items(): + input_dim += input_shape + for feature_name, input_shape in cat_feature_info.items(): + input_dim += 1 + + if self.use_embeddings: + input_dim = ( + len(num_feature_info) * config.d_model + + len(cat_feature_info) * config.d_model + ) + + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + d_model=self.hparams.get("d_model", config.d_model), + embedding_activation=self.hparams.get( + "embedding_activation", config.embedding_activation + ), + layer_norm_after_embedding=self.hparams.get( + "layer_norm_after_embedding" + ), + use_cls=False, + ) + + self.d_out = num_classes + self.block = DenseBlock( + input_dim=input_dim, + num_layers=config.num_layers, + layer_dim=config.layer_dim, + depth=config.depth, + tree_dim=config.tree_dim, + flatten_output=True, + ) + + head_activation = self.hparams.get("head_activation", config.head_activation) + + self.tabular_head = MLP( + config.num_layers * config.layer_dim, + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + n_output_units=num_classes, + ) + + def forward(self, num_features, cat_features): + """ + Forward pass through the NODE model. + + Parameters + ---------- + num_features : torch.Tensor + Numerical features tensor of shape [batch_size, num_numerical_features]. + cat_features : torch.Tensor + Categorical features tensor of shape [batch_size, num_categorical_features]. + + Returns + ------- + torch.Tensor + Model output of shape [batch_size, num_classes]. 
+ """ + if self.use_embeddings: + x = self.embedding_layer(num_features, cat_features) + B, S, D = x.shape + x = x.reshape(B, S * D) + else: + x = num_features + cat_features + x = torch.cat(x, dim=1) + + x = self.block(x).squeeze(-1) + x = self.tabular_head(x) + return x From 8b12c61342cf562ec333de0a6a6047d42a36e0a8 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:52:35 +0000 Subject: [PATCH 053/132] add default config for NODE model --- mambular/configs/node_config.py | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 mambular/configs/node_config.py diff --git a/mambular/configs/node_config.py b/mambular/configs/node_config.py new file mode 100644 index 0000000..d574f25 --- /dev/null +++ b/mambular/configs/node_config.py @@ -0,0 +1,69 @@ +from dataclasses import dataclass +import torch.nn as nn + + +@dataclass +class DefaultNODEConfig: + """ + Configuration class for the default Neural Oblivious Decision Ensemble (NODE) model. + + This class provides default hyperparameters for training and configuring a NODE model. + + Attributes + ---------- + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs without improvement after which the learning rate will be reduced. Default is 10. + weight_decay : float, optional + Weight decay (L2 regularization penalty) applied by the optimizer. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate is reduced when there is no improvement. Default is 0.1. + norm : str, optional + Type of normalization to use. Default is None. + use_embeddings : bool, optional + Whether to use embedding layers for categorical features. Default is False. + embedding_activation : callable, optional + Activation function to apply to embeddings. Default is `nn.Identity`. + layer_norm_after_embedding : bool, optional + Whether to apply layer normalization after embedding layers. Default is False. + d_model : int, optional + Dimensionality of the embedding space. Default is 32. + num_layers : int, optional + Number of dense layers in the model. Default is 4. + layer_dim : int, optional + Dimensionality of each dense layer. Default is 128. + tree_dim : int, optional + Dimensionality of the output from each tree leaf. Default is 1. + depth : int, optional + Depth of each decision tree in the ensemble. Default is 6. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. 
+ """ + + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + norm: str = None + use_embeddings: bool = False + embedding_activation: callable = nn.Identity() + layer_norm_after_embedding: bool = False + d_model: int = 32 + num_layers: int = 4 + layer_dim: int = 128 + tree_dim: int = 1 + depth: int = 6 + head_layer_sizes: list = () + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = False From 90e1476bcf75a7620f45fca6ee201d4a3c531f01 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:52:50 +0000 Subject: [PATCH 054/132] add Node to models and __init__ --- mambular/models/__init__.py | 4 + mambular/models/node.py | 287 ++++++++++++++++++++++++++++++++++++ 2 files changed, 291 insertions(+) create mode 100644 mambular/models/node.py diff --git a/mambular/models/__init__.py b/mambular/models/__init__.py index fc7d27f..f9d82f6 100644 --- a/mambular/models/__init__.py +++ b/mambular/models/__init__.py @@ -24,6 +24,7 @@ ) from .ndtf import NDTFClassifier, NDTFRegressor, NDTFLSS +from .node import NODEClassifier, NODERegressor, NODELSS __all__ = [ @@ -57,4 +58,7 @@ "NDTFClassifier", "NDTFRegressor", "NDTFLSS", + "NODEClassifier", + "NODERegressor", + "NODELSS", ] diff --git a/mambular/models/node.py b/mambular/models/node.py new file mode 100644 index 0000000..cfd9d52 --- /dev/null +++ b/mambular/models/node.py @@ -0,0 +1,287 @@ +from .sklearn_base_regressor import SklearnBaseRegressor +from .sklearn_base_classifier import SklearnBaseClassifier +from .sklearn_base_lss import SklearnBaseLSS +from ..base_models.node import NODE +from ..configs.node_config import DefaultNODEConfig + + +class NODERegressor(SklearnBaseRegressor): + """ + Neural Oblivious Decision Ensemble (NODE) Regressor. Slightly different with a MLP as a tabular task specific head. This class extends the SklearnBaseRegressor class and uses the NODE model + with the default NODE configuration. + + The accepted arguments to the NODERegressor class include both the attributes in the DefaultNODEConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs without improvement after which the learning rate will be reduced. Default is 10. + weight_decay : float, optional + Weight decay (L2 regularization penalty) applied by the optimizer. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate is reduced when there is no improvement. Default is 0.1. + norm : str, optional + Type of normalization to use. Default is None. + use_embeddings : bool, optional + Whether to use embedding layers for categorical features. Default is False. + embedding_activation : callable, optional + Activation function to apply to embeddings. Default is `nn.Identity`. + layer_norm_after_embedding : bool, optional + Whether to apply layer normalization after embedding layers. Default is False. + d_model : int, optional + Dimensionality of the embedding space. Default is 32. + num_layers : int, optional + Number of dense layers in the model. Default is 4. + layer_dim : int, optional + Dimensionality of each dense layer. Default is 128. + tree_dim : int, optional + Dimensionality of the output from each tree leaf. Default is 1. + depth : int, optional + Depth of each decision tree in the ensemble. Default is 6. 
+ head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the NODERegressor class are the same as the attributes in the DefaultNODEConfig dataclass. + - NODERegressor uses SklearnBaseRegressor as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseRegressor : The parent class for NODERegressor. + + Examples + -------- + >>> from mambular.models import NODERegressor + >>> model = NODERegressor(layer_sizes=[128, 128, 64], activation=nn.ReLU()) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=NODE, config=DefaultNODEConfig, **kwargs) + + +class NODEClassifier(SklearnBaseClassifier): + """ + Neural Oblivious Decision Ensemble (NODE) Classifier. Slightly different with a MLP as a tabular task specific head. This class extends the SklearnBaseClassifier class and uses the NODE model + with the default NODE configuration. + + The accepted arguments to the NODEClassifier class include both the attributes in the DefaultNODEConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs without improvement after which the learning rate will be reduced. Default is 10. + weight_decay : float, optional + Weight decay (L2 regularization penalty) applied by the optimizer. Default is 1e-6. 
+ lr_factor : float, optional + Factor by which the learning rate is reduced when there is no improvement. Default is 0.1. + norm : str, optional + Type of normalization to use. Default is None. + use_embeddings : bool, optional + Whether to use embedding layers for categorical features. Default is False. + embedding_activation : callable, optional + Activation function to apply to embeddings. Default is `nn.Identity`. + layer_norm_after_embedding : bool, optional + Whether to apply layer normalization after embedding layers. Default is False. + d_model : int, optional + Dimensionality of the embedding space. Default is 32. + num_layers : int, optional + Number of dense layers in the model. Default is 4. + layer_dim : int, optional + Dimensionality of each dense layer. Default is 128. + tree_dim : int, optional + Dimensionality of the output from each tree leaf. Default is 1. + depth : int, optional + Depth of each decision tree in the ensemble. Default is 6. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the NODEClassifier class are the same as the attributes in the DefaultNODEConfig dataclass. + - NODEClassifier uses SklearnBaseClassifieras the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseClassifier : The parent class for NODEClassifier. 
+ + Examples + -------- + >>> from mambular.models import NODEClassifier + >>> model = NODEClassifier(layer_sizes=[128, 128, 64], activation=nn.ReLU()) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=NODE, config=DefaultNODEConfig, **kwargs) + + +class NODELSS(SklearnBaseLSS): + """ + Neural Oblivious Decision Ensemble (NODE) for disrtibutional regression. Slightly different with a MLP as a tabular task specific head. This class extends the SklearnBaseLSS class and uses the NODE model + with the default NODE configuration. + + The accepted arguments to the NODELSS class include both the attributes in the DefaultNODEConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs without improvement after which the learning rate will be reduced. Default is 10. + weight_decay : float, optional + Weight decay (L2 regularization penalty) applied by the optimizer. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate is reduced when there is no improvement. Default is 0.1. + norm : str, optional + Type of normalization to use. Default is None. + use_embeddings : bool, optional + Whether to use embedding layers for categorical features. Default is False. + embedding_activation : callable, optional + Activation function to apply to embeddings. Default is `nn.Identity`. + layer_norm_after_embedding : bool, optional + Whether to apply layer normalization after embedding layers. Default is False. + d_model : int, optional + Dimensionality of the embedding space. Default is 32. + num_layers : int, optional + Number of dense layers in the model. Default is 4. + layer_dim : int, optional + Dimensionality of each dense layer. Default is 128. + tree_dim : int, optional + Dimensionality of the output from each tree leaf. Default is 1. + depth : int, optional + Depth of each decision tree in the ensemble. Default is 6. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task : str, default="regression" + Indicates the type of machine learning task ('regression' or 'classification'). 
This can + influence certain preprocessing behaviors, especially when using decision tree-based binning as ple. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the NODELSS class are the same as the attributes in the DefaultNODEConfig dataclass. + - NODELSS uses SklearnBaseLSS as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseLSS : The parent class for NODELSS. + + Examples + -------- + >>> from mambular.models import NODELSS + >>> model = NODELSS(layer_sizes=[128, 128, 64], activation=nn.ReLU()) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=NODE, config=DefaultNODEConfig, **kwargs) From 9bdbff3fe5a83262cae0d53131da3ea3187814b7 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:55:24 +0000 Subject: [PATCH 055/132] refactor normalization layer -> get_normalization_layer included in __init__ --- mambular/base_models/ft_transformer.py | 27 ++----------------- mambular/base_models/mlp.py | 25 ++--------------- mambular/base_models/tabtransformer.py | 26 +++--------------- mambular/base_models/tabularnn.py | 37 ++------------------------ 4 files changed, 9 insertions(+), 106 deletions(-) diff --git a/mambular/base_models/ft_transformer.py b/mambular/base_models/ft_transformer.py index 2af362b..c349e02 100644 --- a/mambular/base_models/ft_transformer.py +++ b/mambular/base_models/ft_transformer.py @@ -1,14 +1,7 @@ import torch import torch.nn as nn from ..arch_utils.mlp_utils import MLP -from ..arch_utils.layer_utils.normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) +from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.transformer_utils import CustomTransformerEncoderLayer from ..configs.fttransformer_config import DefaultFTTransformerConfig @@ -101,23 +94,7 @@ def __init__( bias=self.hparams.get("bias", config.bias), ) - norm_layer = self.hparams.get("norm", config.norm) - if norm_layer == "RMSNorm": - self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) - elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) - elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) - elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) - elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) - elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling( - self.hparams.get("d_model", config.d_model) - ) 
- else: - self.norm_f = None + self.norm_f = get_normalization_layer(config) self.encoder = nn.TransformerEncoder( encoder_layer, diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index d9e24e3..4aebee5 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -2,14 +2,7 @@ import torch.nn as nn from ..configs.mlp_config import DefaultMLPConfig from .basemodel import BaseModel -from ..arch_utils.layer_utils.normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) +from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer @@ -74,21 +67,7 @@ def __init__( if config.batch_norm: self.layers.append(nn.BatchNorm1d(self.layer_sizes[0])) - norm_layer = self.hparams.get("norm", config.norm) - if norm_layer == "RMSNorm": - self.norm_f = RMSNorm(self.layer_sizes[0]) - elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm(self.layer_sizes[0]) - elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm(self.layer_sizes[0]) - elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm(self.layer_sizes[0]) - elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm(1, self.layer_sizes[0]) - elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling(self.layer_sizes[0]) - else: - self.norm_f = None + self.norm_f = get_normalization_layer(config) if self.norm_f is not None: self.layers.append(self.norm_f(self.layer_sizes[0])) diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index 2229faa..48fd8ed 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -1,18 +1,12 @@ import torch import torch.nn as nn from ..arch_utils.mlp_utils import MLP -from ..arch_utils.layer_utils.normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) +from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..configs.tabtransformer_config import DefaultTabTransformerConfig from .basemodel import BaseModel from ..arch_utils.transformer_utils import CustomTransformerEncoderLayer +from ..arch_utils.layer_utils.normalization_layers import LayerNorm class TabTransformer(BaseModel): @@ -109,21 +103,7 @@ def __init__( bias=self.hparams.get("bias", config.bias), ) - norm_layer = self.hparams.get("norm", config.norm) - if norm_layer == "RMSNorm": - self.norm_f = RMSNorm(layer_norm_dim) - elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm(layer_norm_dim) - elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm(layer_norm_dim) - elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm(layer_norm_dim) - elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm(1, layer_norm_dim) - elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling(layer_norm_dim) - else: - self.norm_f = None + self.norm_f = get_normalization_layer(config) self.norm_embedding = LayerNorm(self.hparams.get("d_model", config.d_model)) self.encoder = nn.TransformerEncoder( diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index 3cc5fc3..4ce5a64 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -5,14 +5,7 @@ from .basemodel import BaseModel from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from 
..arch_utils.rnn_utils import ConvRNN -from ..arch_utils.layer_utils.normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) +from ..arch_utils.get_norm_fn import get_normalization_layer class TabulaRNN(BaseModel): @@ -35,33 +28,7 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - norm_layer = self.hparams.get("norm", config.norm) - if norm_layer == "RMSNorm": - self.norm_f = RMSNorm( - self.hparams.get("dim_feedforward", config.dim_feedforward) - ) - elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm( - self.hparams.get("dim_feedforward", config.dim_feedforward) - ) - elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm( - self.hparams.get("dim_feedforward", config.dim_feedforward) - ) - elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm( - self.hparams.get("dim_feedforward", config.dim_feedforward) - ) - elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm( - 1, self.hparams.get("dim_feedforward", config.dim_feedforward) - ) - elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling( - self.hparams.get("dim_feedforward", config.dim_feedforward) - ) - else: - self.norm_f = None + self.norm_f = get_normalization_layer(config) self.rnn = ConvRNN( model_type=self.hparams.get("model_type", config.model_type), From b0c0bf4d2860b81eb61055bb1b74c2a0669f7e1e Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:55:35 +0000 Subject: [PATCH 056/132] add nodeconfig in __init__ --- mambular/configs/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mambular/configs/__init__.py b/mambular/configs/__init__.py index 31ee342..45dc765 100644 --- a/mambular/configs/__init__.py +++ b/mambular/configs/__init__.py @@ -7,6 +7,7 @@ from .tabularnn_config import DefaultTabulaRNNConfig from .mambattention_config import DefaultMambAttentionConfig from .ndtf_config import DefaultNDTFConfig +from .node_config import DefaultNODEConfig __all__ = [ @@ -19,4 +20,5 @@ "DefaultTabulaRNNConfig", "DefaultMambAttentionConfig", "DefaultNDTFConfig", + "DefaultNODEConfig", ] From bea4bc3097ce255c45a44e71ef586ffbab8e2ce1 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 14:55:54 +0000 Subject: [PATCH 057/132] fix typo in docstrings --- mambular/models/mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/models/mlp.py b/mambular/models/mlp.py index 60d77e3..b286c45 100644 --- a/mambular/models/mlp.py +++ b/mambular/models/mlp.py @@ -172,7 +172,7 @@ class MLPClassifier(SklearnBaseClassifier): See Also -------- - mambular.models.SklearnBaseRegressor : The parent class for MLPClassifier. + mambular.models.SklearnBaseClassifier : The parent class for MLPClassifier. 
Examples -------- From 4e3bcdaf44268c7e469b0b1476ebcb8bd5459bf3 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 5 Nov 2024 15:31:12 +0000 Subject: [PATCH 058/132] adapt config for normalization layer in rnn --- mambular/base_models/tabularnn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index 4ce5a64..b15c580 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -6,6 +6,7 @@ from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.rnn_utils import ConvRNN from ..arch_utils.get_norm_fn import get_normalization_layer +from dataclasses import replace class TabulaRNN(BaseModel): @@ -28,8 +29,6 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.norm_f = get_normalization_layer(config) - self.rnn = ConvRNN( model_type=self.hparams.get("model_type", config.model_type), input_size=self.hparams.get("d_model", config.d_model), @@ -82,6 +81,9 @@ def __init__( self.hparams.get("dim_feedforward", config.dim_feedforward), ) + temp_config = replace(config, d_model=config.dim_feedforward) + self.norm_f = get_normalization_layer(temp_config) + def forward(self, num_features, cat_features): """ Defines the forward pass of the model. From fb74d8e6cb10a7ea9b18073d18af2dcfbb115405 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:49:27 +0000 Subject: [PATCH 059/132] adjust readme and include new models --- README.md | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 49b017b..4ba6274 100644 --- a/README.md +++ b/README.md @@ -56,12 +56,15 @@ Mambular is a Python package that brings the power of advanced deep learning arc | Model | Description | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | | `Mambular` | A sequential model using Mamba blocks [Gu and Dao](https://arxiv.org/pdf/2312.00752) specifically designed for various tabular data tasks. | +| `TabM` | Batch Ensembling for a MLP as introduced by [Gorishniy et al.](https://arxiv.org/abs/2410.24210) | +| `NODE` | Neural Oblivious Decision Ensembles as introduced by [Popov et al.](https://arxiv.org/abs/1909.06312) | +| `BatchTabRNN` | A sequential model using RNN and batch ensembling. [TBD]() | | `FTTransformer` | A model leveraging transformer encoders, as introduced by [Gorishniy et al.](https://arxiv.org/abs/2106.11959), for tabular data. | | `MLP` | A classical Multi-Layer Perceptron (MLP) model for handling tabular data tasks. | | `ResNet` | An adaptation of the ResNet architecture for tabular data applications. | | `TabTransformer` | A transformer-based model for tabular data introduced by [Huang et al.](https://arxiv.org/abs/2012.06678), enhancing feature learning capabilities. | | `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867) . Not a sequential model. | -| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks | +| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks. Paper Link will follow | | `MambAttention` | A combination between Mamba and Transformers, similar to Jamba by [Lieber et al.](https://arxiv.org/abs/2403.19887). 
Not yet included in the benchmarks | @@ -326,6 +329,51 @@ Here's how you can implement a custom model with Mambular: regressor.fit(X_train, y_train, max_epochs=50) ``` +# Custom Training +If you prefer to setup custom training, preprocessing and evaluation, you can simply use the `mambular.base_models`. +Just be careful that all basemodels expect lists of features as inputs. More precisely as list for numerical features and a list for categorical features. A custom training loop, with random data could look like this. + +```python +import torch +import torch.nn as nn +import torch.optim as optim +from mambular.base_models import Mambular +from mambular.configs import DefaultMambularConfig + +# Dummy data and configuration +cat_feature_info = {"cat1": 5, "cat2": 5} # Example categorical feature information +num_feature_info = {"num1": 1, "num2": 1} # Example numerical feature information +num_classes = 1 +config = DefaultMambularConfig() # Use the desired configuration + +# Initialize model, loss function, and optimizer +model = Mambular(cat_feature_info, num_feature_info, num_classes, config) +criterion = nn.MSELoss() # Use MSE for regression; change as appropriate for your task +optimizer = optim.Adam(model.parameters(), lr=0.001) + +# Example training loop +for epoch in range(10): # Number of epochs + model.train() + optimizer.zero_grad() + + # Dummy Data + num_features = [torch.randn(32, 1) for _ in num_feature_info] + cat_features = [torch.randint(0, 5, (32,)) for _ in cat_feature_info] + labels = torch.randn(32, num_classes) + + # Forward pass + outputs = model(num_features, cat_features) + loss = criterion(outputs, labels) + + # Backward pass and optimization + loss.backward() + optimizer.step() + + # Print loss for monitoring + print(f"Epoch [{epoch+1}/10], Loss: {loss.item():.4f}") + +``` + # 🏷️ Citation If you find this project useful in your research, please consider cite: From ef1166d8aa30c85e4b4bc95fc422f145fb07acb6 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:49:46 +0000 Subject: [PATCH 060/132] LinearBatchEnsemlbe layer as used in TabM paper --- .../layer_utils/batch_ensemble_layer.py | 296 ++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 mambular/arch_utils/layer_utils/batch_ensemble_layer.py diff --git a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py new file mode 100644 index 0000000..ea7330e --- /dev/null +++ b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py @@ -0,0 +1,296 @@ +import torch +import torch.nn as nn +from typing import Literal +import math +from typing import Callable + + +class LinearBatchEnsembleLayer(nn.Module): + """ + A configurable BatchEnsemble layer that supports optional input scaling, output scaling, + and output bias terms as per the 'BatchEnsemble' paper. + It provides initialization options for scaling terms to diversify ensemble members. 
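Looking back at patch 058 above: the TabulaRNN hunk builds `temp_config = replace(config, d_model=config.dim_feedforward)` so the shared normalization factory sizes its layer by `dim_feedforward` instead of `d_model`. A minimal sketch of `dataclasses.replace` with a stand-in two-field config; the real `DefaultTabulaRNNConfig` carries many more fields:

```python
from dataclasses import dataclass, replace

@dataclass
class Cfg:                       # stand-in config, not the real DefaultTabulaRNNConfig
    d_model: int = 64
    dim_feedforward: int = 256

cfg = Cfg()
temp_cfg = replace(cfg, d_model=cfg.dim_feedforward)   # shallow copy with one field overridden
print(temp_cfg.d_model, cfg.d_model)                    # 256 64, the original config is untouched
```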
+ """ + + def __init__( + self, + in_features: int, + out_features: int, + ensemble_size: int, + ensemble_scaling_in: bool = True, + ensemble_scaling_out: bool = True, + ensemble_bias: bool = False, + scaling_init: Literal["ones", "random-signs"] = "ones", + ): + super(LinearBatchEnsembleLayer, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.ensemble_size = ensemble_size + + # Base weight matrix W, shared across ensemble members + self.W = nn.Parameter(torch.randn(out_features, in_features)) + + # Optional scaling factors and shifts for each ensemble member + self.r = ( + nn.Parameter(torch.empty(ensemble_size, in_features)) + if ensemble_scaling_in + else None + ) + self.s = ( + nn.Parameter(torch.empty(ensemble_size, out_features)) + if ensemble_scaling_out + else None + ) + self.bias = ( + nn.Parameter(torch.empty(out_features)) + if not ensemble_bias and out_features > 0 + else nn.Parameter(torch.empty(ensemble_size, out_features)) + if ensemble_bias + else None + ) + + # Initialize parameters + self.reset_parameters(scaling_init) + + def reset_parameters(self, scaling_init: Literal["ones", "random-signs"]): + # Initialize W using a uniform distribution + nn.init.kaiming_uniform_(self.W, a=math.sqrt(5)) + + # Initialize scaling factors r and s based on selected initialization + scaling_init_fn = { + "ones": nn.init.ones_, + "random-signs": lambda x: torch.sign(torch.randn_like(x)), + } + + if self.r is not None: + scaling_init_fn[scaling_init](self.r) + if self.s is not None: + scaling_init_fn[scaling_init](self.s) + + # Initialize bias + if self.bias is not None: + if self.bias.shape == (self.out_features,): + nn.init.uniform_(self.bias, -0.1, 0.1) + else: + nn.init.zeros_(self.bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.dim() == 2: + x = x.unsqueeze(1).expand( + -1, self.ensemble_size, -1 + ) # Shape: (B, n_ensembles, N) + elif x.size(1) != self.ensemble_size: + raise ValueError( + f"Input shape {x.shape} is invalid. Expected shape: (B, n_ensembles, N)" + ) + + # Apply input scaling if enabled + if self.r is not None: + x = x * self.r + + # Linear transformation with W + output = torch.einsum("bki,oi->bko", x, self.W) + + # Apply output scaling if enabled + if self.s is not None: + output = output * self.s + + # Add bias if enabled + if self.bias is not None: + output = output + self.bias + + return output + + +class RNNBatchEnsembleLayer(nn.Module): + def __init__( + self, + input_size: int, + hidden_size: int, + ensemble_size: int, + nonlinearity: Callable = torch.tanh, + dropout: float = 0.0, + ensemble_scaling_in: bool = True, + ensemble_scaling_out: bool = True, + ensemble_bias: bool = False, + scaling_init: Literal["ones", "random-signs"] = "ones", + ): + """ + A batch ensemble RNN layer with optional bidirectionality and shared weights. + + Parameters + ---------- + input_size : int + The number of input features. + hidden_size : int + The number of features in the hidden state. + ensemble_size : int + The number of ensemble members. + nonlinearity : Callable, default=torch.tanh + Activation function to apply after each RNN step. + dropout : float, default=0.0 + Dropout rate applied to the hidden state. + ensemble_scaling_in : bool, default=True + Whether to use input scaling for each ensemble member. + ensemble_scaling_out : bool, default=True + Whether to use output scaling for each ensemble member. + ensemble_bias : bool, default=False + Whether to use a unique bias term for each ensemble member. 
+ """ + super(RNNBatchEnsembleLayer, self).__init__() + self.input_size = input_size + self.ensemble_size = ensemble_size + self.nonlinearity = nonlinearity + self.dropout_layer = nn.Dropout(dropout) + self.bidirectional = False + self.num_directions = 1 + self.hidden_size = hidden_size + + # Shared RNN weight matrices for all ensemble members + self.W_ih = nn.Parameter(torch.empty(hidden_size, input_size)) + self.W_hh = nn.Parameter(torch.empty(hidden_size, hidden_size)) + + # Ensemble-specific scaling factors and bias for each ensemble member + self.r = ( + nn.Parameter(torch.empty(ensemble_size, input_size)) + if ensemble_scaling_in + else None + ) + self.s = ( + nn.Parameter(torch.empty(ensemble_size, hidden_size)) + if ensemble_scaling_out + else None + ) + self.bias = ( + nn.Parameter(torch.zeros(ensemble_size, hidden_size)) + if ensemble_bias + else None + ) + + # Initialize parameters + self.reset_parameters(scaling_init) + + def reset_parameters(self, scaling_init: Literal["ones", "random-signs"]): + # Initialize scaling factors r and s based on selected initialization + scaling_init_fn = { + "ones": nn.init.ones_, + "random-signs": lambda x: torch.sign(torch.randn_like(x)), + } + + if self.r is not None: + scaling_init_fn[scaling_init](self.r) + if self.s is not None: + scaling_init_fn[scaling_init](self.s) + + # Xavier initialization for W_ih and W_hh like a standard RNN + nn.init.xavier_uniform_(self.W_ih) + nn.init.xavier_uniform_(self.W_hh) + + # Initialize bias to zeros if applicable + if self.bias is not None: + nn.init.zeros_(self.bias) + + def forward(self, x: torch.Tensor, hidden: torch.Tensor = None) -> torch.Tensor: + """ + Forward pass for the BatchEnsembleRNNLayer. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, seq_len, input_size). + hidden : torch.Tensor, optional + Hidden state tensor of shape (num_directions, ensemble_size, batch_size, hidden_size), by default None. + + Returns + ------- + torch.Tensor + Output tensor of shape (batch_size, seq_len, ensemble_size, hidden_size * num_directions). + """ + # Check input shape and expand if necessary + if x.dim() == 3: # Case: (B, L, D) - no ensembles + batch_size, seq_len, input_size = x.shape + x = x.unsqueeze(2).expand( + -1, -1, self.ensemble_size, -1 + ) # Shape: (B, L, ensemble_size, D) + elif ( + x.dim() == 4 and x.size(2) == self.ensemble_size + ): # Case: (B, L, ensemble_size, D) + batch_size, seq_len, ensemble_size, _ = x.shape + if ensemble_size != self.ensemble_size: + raise ValueError( + f"Input shape {x.shape} is invalid. Expected shape: (B, S, ensemble_size, N)" + ) + else: + raise ValueError( + f"Input shape {x.shape} is invalid. 
Expected shape: (B, L, D) or (B, L, ensemble_size, D)" + ) + + # Initialize hidden state if not provided + if hidden is None: + hidden = torch.zeros( + self.num_directions, + self.ensemble_size, + batch_size, + self.hidden_size, + device=x.device, + ) + + outputs = [] + + for t in range(seq_len): + hidden_next_directions = [] + + for direction in range(self.num_directions): + # Select forward or backward timestep `t` + + t_index = t if direction == 0 else seq_len - 1 - t + x_t = x[:, t_index, :, :] + + # Apply input scaling if enabled + if self.r is not None: + x_t = x_t * self.r + + # Input and hidden term calculations with shared weights + input_term = torch.einsum("bki,hi->bkh", x_t, self.W_ih) + # Access the hidden state for the current direction, reshape for matrix multiplication + hidden_direction = hidden[direction] # Shape: (E, B, hidden_size) + hidden_direction = hidden_direction.permute( + 1, 0, 2 + ) # Shape: (B, E, hidden_size) + hidden_term = torch.einsum( + "bki,hi->bkh", hidden_direction, self.W_hh + ) # Shape: (B, E, hidden_size) + hidden_next = input_term + hidden_term + + # Apply output scaling, bias, and non-linearity + if self.s is not None: + hidden_next = hidden_next * self.s + if self.bias is not None: + hidden_next = hidden_next + self.bias + + hidden_next = self.nonlinearity(hidden_next) + hidden_next = hidden_next.permute(1, 0, 2) + + hidden_next_directions.append(hidden_next) + + # Stack `hidden_next_directions` along the first dimension to update `hidden` for all directions + hidden = torch.stack( + hidden_next_directions, dim=0 + ) # Shape: (num_directions, ensemble_size, batch_size, hidden_size) + + # Concatenate outputs for both directions along the last dimension if bidirectional + output = torch.cat( + [hn.permute(1, 0, 2) for hn in hidden_next_directions], dim=-1 + ) # Shape: (batch_size, ensemble_size, hidden_size * num_directions) + outputs.append(output) + + # Apply dropout only to the final layer output if dropout is set + if self.dropout_layer is not None: + outputs[-1] = self.dropout_layer(outputs[-1]) + + # Stack outputs for all timesteps + outputs = torch.stack( + outputs, dim=1 + ) # Shape: (batch_size, seq_len, ensemble_size, hidden_size * num_directions) + + return outputs, hidden From 1bb2dc4ca30f6e19c2efc7edae8d373c97ca5d18 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:50:01 +0000 Subject: [PATCH 061/132] only use config in embedding layer as arg --- .../arch_utils/layer_utils/embedding_layer.py | 110 ++++++++---------- 1 file changed, 49 insertions(+), 61 deletions(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 3a5846f..7fbcbdc 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -4,18 +4,7 @@ class EmbeddingLayer(nn.Module): - def __init__( - self, - num_feature_info, - cat_feature_info, - d_model, - embedding_activation=nn.Identity(), - layer_norm_after_embedding=False, - use_cls=False, - cls_position=0, - cat_encoding="int", - embedding_layer="linear", - ): + def __init__(self, num_feature_info, cat_feature_info, config): """ Embedding layer that handles numerical and categorical embeddings. @@ -25,75 +14,65 @@ def __init__( Dictionary where keys are numerical feature names and values are their respective input dimensions. cat_feature_info : dict Dictionary where keys are categorical feature names and values are the number of categories for each feature. 
- d_model : int - Dimensionality of the embeddings. - embedding_activation : nn.Module, optional - Activation function to apply after embedding. Default is `nn.Identity()`. - layer_norm_after_embedding : bool, optional - If True, applies layer normalization after embeddings. Default is `False`. - use_cls : bool, optional - If True, includes a class token in the embeddings. Default is `False`. - cls_position : int, optional - Position to place the class token, either at the start (0) or end (1) of the sequence. Default is `0`. - - Methods - ------- - forward(num_features=None, cat_features=None) - Defines the forward pass of the model. + config : Config + Configuration object containing all required settings. """ super(EmbeddingLayer, self).__init__() - self.d_model = d_model - self.embedding_activation = embedding_activation - self.layer_norm_after_embedding = layer_norm_after_embedding - self.use_cls = use_cls - self.cls_position = cls_position - if embedding_layer == "ndt": - self.num_embeddings = nn.ModuleList( - [ - nn.Sequential( - NeuralEmbeddingTree( - input_dim=1, output_dim=d_model, temperature=0.3 - ), - ) - for feature_name, input_shape in num_feature_info.items() - ] - ) - - else: - self.num_embeddings = nn.ModuleList( - [ - nn.Sequential( - nn.Linear(input_shape, d_model, bias=False), - self.embedding_activation, - ) - for feature_name, input_shape in num_feature_info.items() - ] - ) + self.d_model = config.d_model + self.embedding_activation = getattr( + config, "embedding_activation", nn.Identity() + ) + self.layer_norm_after_embedding = getattr( + config, "layer_norm_after_embedding", False + ) + self.use_cls = getattr(config, "use_cls", False) + self.cls_position = getattr(config, "cls_position", 0) + self.cat_encoding = getattr(config, "cat_encoding", "int") + self.embedding_dropout = ( + nn.Dropout(getattr(config, "embedding_dropout", 0.0)) + if getattr(config, "embedding_dropout", None) is not None + else None + ) + + self.num_embeddings = nn.ModuleList( + [ + nn.Sequential( + nn.Linear(input_shape, self.d_model, bias=False), + self.embedding_activation, + ) + for feature_name, input_shape in num_feature_info.items() + ] + ) + # Initialize categorical embeddings self.cat_embeddings = nn.ModuleList() for feature_name, num_categories in cat_feature_info.items(): - if cat_encoding == "int": + if self.cat_encoding == "int": self.cat_embeddings.append( nn.Sequential( - nn.Embedding(num_categories + 1, d_model), + nn.Embedding(num_categories + 1, self.d_model), self.embedding_activation, ) ) - elif cat_encoding == "one-hot": + elif self.cat_encoding == "one-hot": self.cat_embeddings.append( nn.Sequential( OneHotEncoding(num_categories), - nn.Linear(num_categories, d_model, bias=False), + nn.Linear(num_categories, self.d_model, bias=False), self.embedding_activation, ) ) + # Class token if required if self.use_cls: - self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model)) - if layer_norm_after_embedding: - self.embedding_norm = nn.LayerNorm(d_model) + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.d_model)) + + # Layer normalization if required + if self.layer_norm_after_embedding: + self.embedding_norm = nn.LayerNorm(self.d_model) + # Sequence length self.seq_len = len(self.num_embeddings) + len(self.cat_embeddings) def forward(self, num_features=None, cat_features=None): @@ -117,6 +96,7 @@ def forward(self, num_features=None, cat_features=None): ValueError If no features are provided to the model. 
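After this refactor the `EmbeddingLayer` is driven entirely by `config`, but its output contract stays the same: every numerical and categorical feature becomes one `d_model`-sized token, so J numerical plus K categorical inputs yield a `(batch, J + K, d_model)` sequence (plus one extra position when `use_cls` is enabled). A minimal sketch of that per-feature tokenization, rebuilt with plain `torch` modules rather than the real class:

```python
import torch
import torch.nn as nn

batch, d_model = 4, 16
num_feature_info = {"num1": 1, "num2": 1}    # feature name -> input dimension
cat_feature_info = {"cat1": 5}               # feature name -> number of categories

num_emb = nn.ModuleList(nn.Linear(dim, d_model, bias=False) for dim in num_feature_info.values())
cat_emb = nn.ModuleList(nn.Embedding(n + 1, d_model) for n in cat_feature_info.values())

num_features = [torch.randn(batch, 1) for _ in num_feature_info]
cat_features = [torch.randint(0, 5, (batch,)) for _ in cat_feature_info]

tokens = [emb(x) for emb, x in zip(num_emb, num_features)]   # each token: (batch, d_model)
tokens += [emb(x) for emb, x in zip(cat_emb, cat_features)]
x = torch.stack(tokens, dim=1)
print(x.shape)                                               # torch.Size([4, 3, 16])
```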
""" + # Class token initialization if self.use_cls: batch_size = ( cat_features[0].size(0) @@ -125,6 +105,7 @@ def forward(self, num_features=None, cat_features=None): ) cls_tokens = self.cls_token.expand(batch_size, -1, -1) + # Process categorical embeddings if self.cat_embeddings and cat_features is not None: cat_embeddings = [ emb(cat_features[i]) for i, emb in enumerate(self.cat_embeddings) @@ -136,6 +117,7 @@ def forward(self, num_features=None, cat_features=None): else: cat_embeddings = None + # Process numerical embeddings if self.num_embeddings and num_features is not None: num_embeddings = [ emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) @@ -146,6 +128,7 @@ def forward(self, num_features=None, cat_features=None): else: num_embeddings = None + # Combine categorical and numerical embeddings if cat_embeddings is not None and num_embeddings is not None: x = torch.cat([cat_embeddings, num_embeddings], dim=1) elif cat_embeddings is not None: @@ -155,6 +138,7 @@ def forward(self, num_features=None, cat_features=None): else: raise ValueError("No features provided to the model.") + # Add class token if required if self.use_cls: if self.cls_position == 0: x = torch.cat([cls_tokens, x], dim=1) @@ -165,6 +149,10 @@ def forward(self, num_features=None, cat_features=None): "Invalid cls_position value. It should be either 0 or 1." ) + # Apply dropout to embeddings if specified in config + if self.embedding_dropout is not None: + x = self.embedding_dropout(x) + return x From 5998fd38bafa33a3328d97dfecf48fd2bff84653 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:50:16 +0000 Subject: [PATCH 062/132] allow for None as input --- mambular/arch_utils/get_norm_fn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mambular/arch_utils/get_norm_fn.py b/mambular/arch_utils/get_norm_fn.py index f90352e..d32ff16 100644 --- a/mambular/arch_utils/get_norm_fn.py +++ b/mambular/arch_utils/get_norm_fn.py @@ -45,5 +45,7 @@ def get_normalization_layer(config): return GroupNorm(1, d_model, eps=layer_norm_eps) elif norm_layer == "LearnableLayerScaling": return LearnableLayerScaling(d_model) + elif norm_layer is None: + return None else: raise ValueError(f"Unsupported normalization layer: {norm_layer}") From abf741d6b2af29ff2508f1fbec43a8e7766ca5d7 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:50:30 +0000 Subject: [PATCH 063/132] rename MLp to MLPhead and only use config as input --- mambular/arch_utils/mlp_utils.py | 45 ++++++++++++++------------------ 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/mambular/arch_utils/mlp_utils.py b/mambular/arch_utils/mlp_utils.py index 956a015..7c95cd3 100644 --- a/mambular/arch_utils/mlp_utils.py +++ b/mambular/arch_utils/mlp_utils.py @@ -151,7 +151,7 @@ def forward(self, x): return self.block(x) -class MLP(nn.Module): +class MLPhead(nn.Module): """ A multi-layer perceptron (MLP) for regression tasks, configurable with optional skip connections and batch normalization. @@ -180,34 +180,27 @@ class MLP(nn.Module): The final linear layer of the MLP. 
""" - def __init__( - self, - n_input_units, - hidden_units_list=[64, 32, 32], - n_output_units: int = 1, - dropout_rate: float = 0.1, - use_skip_layers: bool = False, - activation_fn=nn.LeakyReLU(), - use_batch_norm: bool = False, - ): - super(MLP, self).__init__() - self.n_input_units = n_input_units - self.hidden_units_list = hidden_units_list - self.dropout_rate = dropout_rate - self.n_output_units = n_output_units + def __init__(self, input_dim, output_dim, config): + super(MLPhead, self).__init__() + + self.hidden_units_list = getattr(config, "head_layer_sizes", [128, 64]) + self.dropout_rate = getattr(config, "head_dropout", 0.5) + self.skip_layers = getattr(config, "head_skip_layers", False) + self.batch_norm = getattr(config, "head_use_batch_norm", False) + self.activation = getattr(config, "head_activation", nn.ReLU()) layers = [] - input_units = n_input_units + input_units = input_dim - for n_hidden_units in hidden_units_list: - if use_skip_layers and input_units == n_hidden_units: + for n_hidden_units in self.hidden_units_list: + if self.skip_layers and input_units == n_hidden_units: layers.append( Linear_skip_block( input_units, n_hidden_units, - dropout_rate, - activation_fn, - use_batch_norm, + self.dropout_rate, + self.activation, + self.batch_norm, ) ) else: @@ -215,15 +208,15 @@ def __init__( Linear_block( input_units, n_hidden_units, - dropout_rate, - activation_fn, - use_batch_norm, + self.dropout_rate, + self.activation, + self.batch_norm, ) ) input_units = n_hidden_units # Update input_units for the next layer self.hidden_layers = nn.Sequential(*layers) - self.linear_final = nn.Linear(input_units, n_output_units) # Final layer + self.linear_final = nn.Linear(input_units, output_dim) # Final layer def forward(self, x): """ From d0440bb640ef7f2f9c331e6d83af85021f88adc2 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:50:47 +0000 Subject: [PATCH 064/132] use config as input in ConvRNN and introduce batchEnsemble RNN layer --- mambular/arch_utils/rnn_utils.py | 210 ++++++++++++++++++++++--------- 1 file changed, 152 insertions(+), 58 deletions(-) diff --git a/mambular/arch_utils/rnn_utils.py b/mambular/arch_utils/rnn_utils.py index 03b5ab1..705d4a2 100644 --- a/mambular/arch_utils/rnn_utils.py +++ b/mambular/arch_utils/rnn_utils.py @@ -1,25 +1,28 @@ import torch import torch.nn as nn from .lstm_utils import mLSTMblock, sLSTMblock +from .layer_utils.batch_ensemble_layer import RNNBatchEnsembleLayer +from typing import Callable, Literal class ConvRNN(nn.Module): - def __init__( - self, - model_type: str, # 'RNN', 'LSTM', or 'GRU' - input_size: int, # Number of input features (128 in your case) - hidden_size: int, # Number of hidden units in RNN layers - num_layers: int, # Number of RNN layers - bidirectional: bool, # Whether RNN is bidirectional - rnn_dropout: float, # Dropout rate for RNN - bias: bool, # Bias for RNN - conv_bias: bool, # Bias for Conv1d - rnn_activation: str = None, # Only for RNN - d_conv: int = 4, # Kernel size for Conv1d - residuals: bool = False, # Whether to use residual connections - ): + def __init__(self, config): super(ConvRNN, self).__init__() + # Configuration parameters with defaults where needed + self.model_type = getattr( + config, "model_type", "RNN" + ) # 'RNN', 'LSTM', or 'GRU' + self.input_size = getattr(config, "d_model", 128) + self.hidden_size = getattr(config, "dim_feedforward", 128) + self.num_layers = getattr(config, "n_layers", 4) + self.rnn_dropout = getattr(config, "rnn_dropout", 0.0) + self.bias = 
getattr(config, "bias", True) + self.conv_bias = getattr(config, "conv_bias", True) + self.rnn_activation = getattr(config, "rnn_activation", "relu") + self.d_conv = getattr(config, "d_conv", 4) + self.residuals = getattr(config, "residuals", False) + # Choose RNN layer based on model_type rnn_layer = { "RNN": nn.RNN, @@ -27,14 +30,7 @@ def __init__( "GRU": nn.GRU, "mLSTM": mLSTMblock, "sLSTM": sLSTMblock, - }[model_type] - - self.input_size = input_size # Number of input features (128 in your case) - self.hidden_size = hidden_size # Number of hidden units in RNN - self.num_layers = num_layers # Number of RNN layers - self.bidirectional = bidirectional # Whether RNN is bidirectional - self.rnn_type = model_type - self.residuals = residuals + }[self.model_type] # Convolutional layers self.convs = nn.ModuleList() @@ -43,8 +39,8 @@ def __init__( if self.residuals: self.residual_matrix = nn.ParameterList( [ - nn.Parameter(torch.randn(hidden_size, hidden_size)) - for _ in range(num_layers) + nn.Parameter(torch.randn(self.hidden_size, self.hidden_size)) + for _ in range(self.num_layers) ] ) @@ -53,15 +49,13 @@ def __init__( nn.Conv1d( in_channels=self.input_size, out_channels=self.input_size, - kernel_size=d_conv, - padding=d_conv - 1, - bias=conv_bias, + kernel_size=self.d_conv, + padding=self.d_conv - 1, + bias=self.conv_bias, groups=self.input_size, ) ) - self.layernorms_conv.append( - nn.LayerNorm(self.input_size) - ) # LayerNorm for first Conv layer + self.layernorms_conv.append(nn.LayerNorm(self.input_size)) # Subsequent Conv1d layers use hidden_size as input for i in range(self.num_layers - 1): @@ -69,43 +63,30 @@ def __init__( nn.Conv1d( in_channels=self.hidden_size, out_channels=self.hidden_size, - kernel_size=d_conv, - padding=d_conv - 1, - bias=conv_bias, + kernel_size=self.d_conv, + padding=self.d_conv - 1, + bias=self.conv_bias, groups=self.hidden_size, ) ) - self.layernorms_conv.append( - nn.LayerNorm(self.hidden_size) - ) # LayerNorm for Conv layers + self.layernorms_conv.append(nn.LayerNorm(self.hidden_size)) # Initialize the RNN layers self.rnns = nn.ModuleList() self.layernorms_rnn = nn.ModuleList() # LayerNorms for RNN layers for i in range(self.num_layers): - if model_type in ["RNN"]: - rnn = rnn_layer( - input_size=(self.input_size if i == 0 else self.hidden_size), - hidden_size=self.hidden_size, - num_layers=1, - bidirectional=self.bidirectional, - batch_first=True, - dropout=rnn_dropout if i < self.num_layers - 1 else 0, - bias=bias, - nonlinearity=rnn_activation, - ) - else: - rnn = rnn_layer( - input_size=(self.input_size if i == 0 else self.hidden_size), - hidden_size=self.hidden_size, - num_layers=1, - bidirectional=self.bidirectional, - batch_first=True, - dropout=rnn_dropout if i < self.num_layers - 1 else 0, - bias=bias, - ) - self.rnns.append(rnn) + rnn_args = { + "input_size": self.input_size if i == 0 else self.hidden_size, + "hidden_size": self.hidden_size, + "num_layers": 1, + "batch_first": True, + "dropout": self.rnn_dropout if i < self.num_layers - 1 else 0, + "bias": self.bias, + } + if self.model_type == "RNN": + rnn_args["nonlinearity"] = self.rnn_activation + self.rnns.append(rnn_layer(**rnn_args)) self.layernorms_rnn.append(nn.LayerNorm(self.hidden_size)) def forward(self, x): @@ -129,6 +110,7 @@ def forward(self, x): # Loop through the RNN layers and apply 1D convolution before each for i in range(self.num_layers): # Transpose to (batch_size, input_size, seq_length) for Conv1d + x = self.layernorms_conv[i](x) x = x.transpose(1, 2) @@ -151,3 +133,115 @@ 
def forward(self, x): residual = x return x, _ + + +class EnsembleConvRNN(nn.Module): + def __init__( + self, + config, + ): + super(EnsembleConvRNN, self).__init__() + + self.input_size = getattr(config, "d_model", 128) + self.hidden_size = getattr(config, "dim_feedforward", 128) + self.ensemble_size = getattr(config, "ensemble_size", 16) + self.num_layers = getattr(config, "n_layers", 4) + self.rnn_dropout = getattr(config, "rnn_dropout", 0.5) + self.bias = getattr(config, "bias", True) + self.conv_bias = getattr(config, "conv_bias", True) + self.rnn_activation = getattr(config, "rnn_activation", torch.tanh) + self.d_conv = getattr(config, "d_conv", 4) + self.residuals = getattr(config, "residuals", False) + self.ensemble_scaling_in = getattr(config, "ensemble_scaling_in", True) + self.ensemble_scaling_out = getattr(config, "ensemble_scaling_out", True) + self.ensemble_bias = getattr(config, "ensemble_bias", False) + self.scaling_init = getattr(config, "scaling_init", "ones") + + # Convolutional layers + self.convs = nn.ModuleList() + self.layernorms_conv = nn.ModuleList() # LayerNorms for Conv layers + + if self.residuals: + self.residual_matrix = nn.ParameterList( + [ + nn.Parameter(torch.randn(self.hidden_size, self.hidden_size)) + for _ in range(self.num_layers) + ] + ) + + # First Conv1d layer uses input_size + self.conv = nn.Conv1d( + in_channels=self.input_size, + out_channels=self.input_size, + kernel_size=self.d_conv, + padding=self.d_conv - 1, + bias=self.conv_bias, + groups=self.input_size, + ) + + self.layernorms_conv = nn.LayerNorm(self.input_size) + + # Initialize the RNN layers + self.rnns = nn.ModuleList() + self.layernorms_rnn = nn.ModuleList() # LayerNorms for RNN layers + + for i in range(self.num_layers): + rnn = RNNBatchEnsembleLayer( + input_size=(self.input_size if i == 0 else self.hidden_size), + hidden_size=self.hidden_size, + ensemble_size=self.ensemble_size, + ensemble_scaling_in=self.ensemble_scaling_in, + ensemble_scaling_out=self.ensemble_scaling_out, + ensemble_bias=self.ensemble_bias, + dropout=self.rnn_dropout if i < self.num_layers - 1 else 0, + nonlinearity=self.rnn_activation, + scaling_init=self.scaling_init, + ) + + self.rnns.append(rnn) + self.layernorms_rnn.append(nn.LayerNorm(self.hidden_size)) + + def forward(self, x): + """ + Forward pass through Conv-RNN layers. + + Parameters + ----------- + x : torch.Tensor + Input tensor of shape (batch_size, seq_length, input_size). + + Returns + -------- + output : torch.Tensor + Output tensor after passing through Conv-RNN layers. 
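Both `ConvRNN` and the new `EnsembleConvRNN` run a depthwise `Conv1d` over the token sequence before the recurrent layers: `groups` equals the channel count, the kernel is padded by `d_conv - 1`, and the forward pass trims the result back to the original sequence length. A minimal sketch of that pad-and-trim pattern in isolation:

```python
import torch
import torch.nn as nn

B, L, C, d_conv = 2, 10, 16, 4
conv = nn.Conv1d(in_channels=C, out_channels=C, kernel_size=d_conv,
                 padding=d_conv - 1, groups=C)   # depthwise: one filter per channel

x = torch.randn(B, L, C)                         # (batch, seq_len, channels)
y = conv(x.transpose(1, 2))[:, :, :L]            # drop the extra padded steps on the right
y = y.transpose(1, 2)
print(y.shape)                                   # torch.Size([2, 10, 16])
```

Keeping only the first `L` positions means each output step sees the current and the previous `d_conv - 1` inputs, so the convolution does not leak future timesteps into the recurrence.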
+ """ + _, L, _ = x.shape + if self.residuals: + residual = x + + x = self.layernorms_conv(x) + x = x.transpose(1, 2) + + # Apply the 1D convolution + x = self.conv(x)[:, :, :L] + + # Transpose back to (batch_size, seq_length, input_size) + x = x.transpose(1, 2) + + # Loop through the RNN layers and apply 1D convolution before each + for i, layer in enumerate(self.rnns): + # Transpose to (batch_size, input_size, seq_length) for Conv1d + + # Pass through the RNN layer + x, _ = layer(x) + + # Residual connection with learnable matrix + if self.residuals: + if i < self.num_layers and i > 0: + residual_proj = torch.matmul(residual, self.residual_matrix[i]) + x = x + residual_proj + + # Update residual for next layer + residual = x + + return x, _ From 6fb11faa577588a6429758a91d7e9bf2b1f457cd Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:51:00 +0000 Subject: [PATCH 065/132] only use config in TransformerEncoder Layer --- mambular/arch_utils/transformer_utils.py | 25 ++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mambular/arch_utils/transformer_utils.py b/mambular/arch_utils/transformer_utils.py index c4aaf6b..34233d8 100644 --- a/mambular/arch_utils/transformer_utils.py +++ b/mambular/arch_utils/transformer_utils.py @@ -24,23 +24,32 @@ def forward(self, x): class CustomTransformerEncoderLayer(nn.TransformerEncoderLayer): - def __init__(self, *args, activation=F.relu, **kwargs): - super(CustomTransformerEncoderLayer, self).__init__( - *args, activation=activation, **kwargs + def __init__(self, config): + super().__init__( + d_model=getattr(config, "d_model", 128), + nhead=getattr(config, "n_heads", 8), + dim_feedforward=getattr(config, "transformer_dim_feedforward", 2048), + dropout=getattr(config, "attn_dropout", 0.1), + activation=getattr(config, "transformer_activation", F.relu), + layer_norm_eps=getattr(config, "layer_norm_eps", 1e-5), + norm_first=getattr(config, "norm_first", False), ) - self.custom_activation = activation + self.bias = getattr(config, "bias", True) + self.custom_activation = getattr(config, "transformer_activation", F.relu) - # Check if the activation function is an instance of a GLU variant - if activation in [ReGLU, GLU] or isinstance(activation, (ReGLU, GLU)): + # Additional setup based on the activation function + if self.custom_activation in [ReGLU, GLU] or isinstance( + self.custom_activation, (ReGLU, GLU) + ): self.linear1 = nn.Linear( self.linear1.in_features, self.linear1.out_features * 2, - bias=kwargs.get("bias", True), + bias=self.bias, ) self.linear2 = nn.Linear( self.linear2.in_features, self.linear2.out_features, - bias=kwargs.get("bias", True), + bias=self.bias, ) def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=False): From fd740371bbcaab5b43a52b35a2af8ed6b021e586 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:51:19 +0000 Subject: [PATCH 066/132] include pooling and init pooling in basemodel class --- mambular/base_models/basemodel.py | 83 +++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/mambular/base_models/basemodel.py b/mambular/base_models/basemodel.py index 28fb6be..b18aa3a 100644 --- a/mambular/base_models/basemodel.py +++ b/mambular/base_models/basemodel.py @@ -146,3 +146,86 @@ def print_summary(self): print("\nParameter counts by layer:") for name, count in self.parameter_count().items(): print(f" {name}: {count}") + + def initialize_pooling_layers(self, config, n_inputs): + """ + Initializes the layers needed for 
learnable pooling methods based on self.pooling_method. + """ + if self.pooling_method == "learned_flatten": + # Flattening + Linear layer + self.learned_flatten_pooling = nn.Linear( + n_inputs * config.dim_feedforward, config.dim_feedforward + ) + + elif self.pooling_method == "attention": + # Attention-based pooling with learnable attention weights + self.attention_weights = nn.Parameter(torch.randn(config.dim_feedforward)) + + elif self.pooling_method == "gated": + # Gated pooling with a learned gating layer + self.gate_layer = nn.Linear(config.dim_feedforward, config.dim_feedforward) + + elif self.pooling_method == "rnn": + # RNN-based pooling: Use a small RNN (e.g., LSTM) + self.pooling_rnn = nn.LSTM( + input_size=config.dim_feedforward, + hidden_size=config.dim_feedforward, + num_layers=1, + batch_first=True, + bidirectional=False, + ) + + elif self.pooling_method == "conv": + # Conv1D-based pooling with global max pooling + self.conv1d_pooling = nn.Conv1d( + in_channels=config.dim_feedforward, + out_channels=config.dim_feedforward, + kernel_size=3, # or a configurable kernel size + padding=1, # ensures output has the same sequence length + ) + + def pool_sequence(self, out): + """ + Pools the sequence dimension based on self.pooling_method. + """ + + if self.pooling_method == "avg": + return out.mean( + dim=1 + ) # Shape: (batch_size, ensemble_size, hidden_size) or (batch_size, hidden_size) + elif self.pooling_method == "max": + return out.max(dim=1)[0] + elif self.pooling_method == "sum": + return out.sum(dim=1) + elif self.pooling_method == "last": + return out[:, -1, :] + elif self.pooling_method == "cls": + return out[:, 0, :] + elif self.pooling_method == "learned_flatten": + # Flatten sequence and apply a learned linear layer + batch_size, seq_len, hidden_size = out.shape + out = out.reshape( + batch_size, -1 + ) # Shape: (batch_size, seq_len * hidden_size) + return self.learned_flatten_pooling(out) # Shape: (batch_size, hidden_size) + elif self.pooling_method == "attention": + # Attention-based pooling + attention_scores = torch.einsum( + "bsh,h->bs", out, self.attention_weights + ) # Shape: (batch_size, seq_len) + attention_weights = torch.softmax(attention_scores, dim=1).unsqueeze( + -1 + ) # Shape: (batch_size, seq_len, 1) + out = (out * attention_weights).sum( + dim=1 + ) # Weighted sum across the sequence, Shape: (batch_size, hidden_size) + return out + elif self.pooling_method == "gated": + # Gated pooling + gates = torch.sigmoid( + self.gate_layer(out) + ) # Shape: (batch_size, seq_len, hidden_size) + out = (out * gates).sum(dim=1) # Shape: (batch_size, hidden_size) + return out + else: + raise ValueError(f"Invalid pooling method: {self.pooling_method}") From 3212cc5cef0e91c09be56603194c70c5ab99c8a7 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:51:35 +0000 Subject: [PATCH 067/132] new arch from utils -> only config as arg --- mambular/base_models/ft_transformer.py | 133 ++++++++----------------- 1 file changed, 42 insertions(+), 91 deletions(-) diff --git a/mambular/base_models/ft_transformer.py b/mambular/base_models/ft_transformer.py index c349e02..7019f38 100644 --- a/mambular/base_models/ft_transformer.py +++ b/mambular/base_models/ft_transformer.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from ..arch_utils.mlp_utils import MLP +from ..arch_utils.mlp_utils import MLPhead from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.transformer_utils 
import CustomTransformerEncoderLayer @@ -10,53 +10,45 @@ class FTTransformer(BaseModel): """ - A PyTorch model for tasks utilizing the Transformer architecture and various normalization techniques. + A Feature Transformer model for tabular data with categorical and numerical features, using embedding, transformer + encoding, and pooling to produce final predictions. Parameters ---------- cat_feature_info : dict - Dictionary containing information about categorical features. + Dictionary containing information about categorical features, including their names and dimensions. num_feature_info : dict - Dictionary containing information about numerical features. + Dictionary containing information about numerical features, including their names and dimensions. num_classes : int, optional - Number of output classes (default is 1). + The number of output classes or target dimensions for regression, by default 1. config : DefaultFTTransformerConfig, optional - Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + Configuration object containing model hyperparameters such as dropout rates, hidden layer sizes, + transformer settings, and other architectural configurations, by default DefaultFTTransformerConfig(). **kwargs : dict - Additional keyword arguments. + Additional keyword arguments for the BaseModel class. Attributes ---------- - lr : float - Learning rate. - lr_patience : int - Patience for learning rate scheduler. - weight_decay : float - Weight decay for optimizer. - lr_factor : float - Factor by which the learning rate will be reduced. pooling_method : str - Method to pool the features. + The pooling method to aggregate features after transformer encoding. cat_feature_info : dict - Dictionary containing information about categorical features. + Stores categorical feature information. num_feature_info : dict - Dictionary containing information about numerical features. - embedding_activation : callable - Activation function for embeddings. - encoder: callable - stack of N encoder layers + Stores numerical feature information. + embedding_layer : EmbeddingLayer + Layer for embedding categorical and numerical features. norm_f : nn.Module - Normalization layer. - num_embeddings : nn.ModuleList - Module list for numerical feature embeddings. - cat_embeddings : nn.ModuleList - Module list for categorical feature embeddings. - tabular_head : MLP - Multi-layer perceptron head for tabular data. - cls_token : nn.Parameter - Class token parameter. - embedding_norm : nn.Module, optional - Layer normalization applied after embedding if specified. + Normalization layer for the transformer output. + encoder : nn.TransformerEncoder + Transformer encoder for sequential processing of embedded features. + tabular_head : MLPhead + MLPhead layer to produce the final prediction based on the output of the transformer encoder. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding, transformer encoding, pooling, and prediction steps. 
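The `pool_sequence` helper added to `BaseModel` in patch 066 above collapses the `(batch, seq_len, hidden)` token sequence to `(batch, hidden)`, and `FTTransformer` and the other sequence models now call it instead of inlining their own pooling branches. As a concrete example, the attention branch scores every token with a learnable vector and takes the softmax-weighted sum, which the following standalone sketch reproduces with random tensors:

```python
import torch

batch, seq_len, hidden = 4, 6, 32
out = torch.randn(batch, seq_len, hidden)                      # encoder output
attention_weights = torch.randn(hidden, requires_grad=True)    # learnable score vector, as in BaseModel

scores = torch.einsum("bsh,h->bs", out, attention_weights)     # one scalar score per token
weights = torch.softmax(scores, dim=1).unsqueeze(-1)           # (batch, seq_len, 1)
pooled = (out * weights).sum(dim=1)                            # weighted sum over the sequence
print(pooled.shape)                                            # torch.Size([4, 32])
```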
+ """ def __init__( @@ -70,71 +62,39 @@ def __init__( super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - encoder_layer = CustomTransformerEncoderLayer( - d_model=self.hparams.get("d_model", config.d_model), - nhead=self.hparams.get("n_heads", config.n_heads), - batch_first=True, - dim_feedforward=self.hparams.get( - "transformer_dim_feedforward", config.transformer_dim_feedforward - ), - dropout=self.hparams.get("attn_dropout", config.attn_dropout), - activation=self.hparams.get( - "transformer_activation", config.transformer_activation - ), - layer_norm_eps=self.hparams.get("layer_norm_eps", config.layer_norm_eps), - norm_first=self.hparams.get("norm_first", config.norm_first), - bias=self.hparams.get("bias", config.bias), + # embedding layer + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, ) + # transformer encoder self.norm_f = get_normalization_layer(config) - + encoder_layer = CustomTransformerEncoderLayer(config=config) self.encoder = nn.TransformerEncoder( encoder_layer, num_layers=self.hparams.get("n_layers", config.n_layers), norm=self.norm_f, ) - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding", config.layer_norm_after_embedding - ), - use_cls=True, - cls_position=0, - cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), - ) - + # tabular head head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLP( - self.hparams.get("d_model", config.d_model), - hidden_units_list=self.hparams.get( - "head_layer_sizes", config.head_layer_sizes - ), - dropout_rate=self.hparams.get("head_dropout", config.head_dropout), - use_skip_layers=self.hparams.get( - "head_skip_layers", config.head_skip_layers - ), - activation_fn=head_activation, - use_batch_norm=self.hparams.get( - "head_use_batch_norm", config.head_use_batch_norm - ), - n_output_units=num_classes, + self.tabular_head = MLPhead( + input_dim=self.hparams.get("d_model", config.d_model), + config=config, + output_dim=num_classes, ) + # pooling + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + def forward(self, num_features, cat_features): """ Defines the forward pass of the model. 
@@ -155,16 +115,7 @@ def forward(self, num_features, cat_features): x = self.encoder(x) - if self.pooling_method == "avg": - x = torch.mean(x, dim=1) - elif self.pooling_method == "max": - x, _ = torch.max(x, dim=1) - elif self.pooling_method == "sum": - x = torch.sum(x, dim=1) - elif self.pooling_method == "cls": - x = x[:, 0] - else: - raise ValueError(f"Invalid pooling method: {self.pooling_method}") + x = self.pool_sequence(x) if self.norm_f is not None: x = self.norm_f(x) From 230b2767081b8c543900c2e0a4adb8cef214ea5e Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:52:02 +0000 Subject: [PATCH 068/132] adjust all models to new embeddinglayer and new layer utils --- mambular/base_models/mambatab.py | 69 +++++++++---- mambular/base_models/mambattn.py | 124 ++++++++--------------- mambular/base_models/mambular.py | 131 ++++++++----------------- mambular/base_models/mlp.py | 83 +++++++++------- mambular/base_models/ndtf.py | 63 ++++++++---- mambular/base_models/node.py | 83 +++++----------- mambular/base_models/resnet.py | 127 +++++++++++------------- mambular/base_models/tabtransformer.py | 86 ++++------------ mambular/base_models/tabularnn.py | 60 +++-------- 9 files changed, 341 insertions(+), 485 deletions(-) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index 2405adc..3e1da9e 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn from ..arch_utils.mamba_utils.mamba_arch import Mamba -from ..arch_utils.mlp_utils import MLP +from ..arch_utils.mlp_utils import MLPhead from ..arch_utils.layer_utils.normalization_layers import ( LayerNorm, ) @@ -11,6 +11,51 @@ class MambaTab(BaseModel): + """ + A MambaTab model for tabular data processing, integrating feature embeddings, normalization, and a configurable + architecture for flexible deployment of Mamba-based feature transformation layers. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. + config : DefaultMambaTabConfig, optional + Configuration object with model hyperparameters such as dropout rates, hidden layer sizes, Mamba version, and + other architectural configurations, by default DefaultMambaTabConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + initial_layer : nn.Linear + Linear layer for the initial transformation of concatenated feature embeddings. + norm_f : LayerNorm + Layer normalization applied after the initial transformation. + embedding_activation : callable + Activation function applied to the embedded features. + axis : int + Axis used to adjust the shape of features during transformation. + tabular_head : MLPhead + MLPhead layer to produce the final prediction based on transformed features. + mamba : Mamba or MambaOriginal + Mamba-based feature transformation layer based on the version specified in config. 
+ + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including feature concatenation, initial transformation, + Mamba processing, and prediction steps. + + """ + def __init__( self, cat_feature_info, @@ -28,10 +73,6 @@ def __init__( for feature_name, input_shape in cat_feature_info.items(): input_dim += 1 - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -46,20 +87,10 @@ def __init__( head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLP( - self.hparams.get("d_model", config.d_model), - hidden_units_list=self.hparams.get( - "head_layer_sizes", config.head_layer_sizes - ), - dropout_rate=self.hparams.get("head_dropout", config.head_dropout), - use_skip_layers=self.hparams.get( - "head_skip_layers", config.head_skip_layers - ), - activation_fn=head_activation, - use_batch_norm=self.hparams.get( - "head_use_batch_norm", config.head_use_batch_norm - ), - n_output_units=num_classes, + self.tabular_head = MLPhead( + input_dim=self.hparams.get("d_model", config.d_model), + config=config, + output_dim=num_classes, ) if config.mamba_version == "mamba-torch": diff --git a/mambular/base_models/mambattn.py b/mambular/base_models/mambattn.py index 1241ae6..86f1231 100644 --- a/mambular/base_models/mambattn.py +++ b/mambular/base_models/mambattn.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn from ..arch_utils.mamba_utils.mambattn_arch import MambAttn -from ..arch_utils.mlp_utils import MLP +from ..arch_utils.mlp_utils import MLPhead from ..arch_utils.get_norm_fn import get_normalization_layer from ..configs.mambattention_config import DefaultMambAttentionConfig from .basemodel import BaseModel @@ -10,53 +10,46 @@ class MambAttention(BaseModel): """ - A PyTorch model for tasks utilizing the Mamba architecture and various normalization techniques. + A MambAttention model for tabular data, integrating feature embeddings, attention-based Mamba transformations, and + a customizable architecture for handling categorical and numerical features. Parameters ---------- cat_feature_info : dict - Dictionary containing information about categorical features. + Dictionary containing information about categorical features, including their names and dimensions. num_feature_info : dict - Dictionary containing information about numerical features. + Dictionary containing information about numerical features, including their names and dimensions. num_classes : int, optional - Number of output classes (default is 1). - config : DefaultMambularConfig, optional - Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + The number of output classes or target dimensions for regression, by default 1. + config : DefaultMambAttentionConfig, optional + Configuration object with model hyperparameters such as dropout rates, head layer sizes, attention settings, + and other architectural configurations, by default DefaultMambAttentionConfig(). **kwargs : dict - Additional keyword arguments. + Additional keyword arguments for the BaseModel class. Attributes ---------- - lr : float - Learning rate. - lr_patience : int - Patience for learning rate scheduler. 
- weight_decay : float - Weight decay for optimizer. - lr_factor : float - Factor by which the learning rate will be reduced. pooling_method : str - Method to pool the features. - cat_feature_info : dict - Dictionary containing information about categorical features. - num_feature_info : dict - Dictionary containing information about numerical features. - embedding_activation : callable - Activation function for embeddings. - mamba : Mamba - Mamba architecture component. + Pooling method to aggregate features after the Mamba attention layer. + shuffle_embeddings : bool + Flag indicating if embeddings should be shuffled, as specified in the configuration. + mamba : MambAttn + Mamba attention layer to process embedded features. norm_f : nn.Module - Normalization layer. - num_embeddings : nn.ModuleList - Module list for numerical feature embeddings. - cat_embeddings : nn.ModuleList - Module list for categorical feature embeddings. - tabular_head : MLP - Multi-layer perceptron head for tabular data. - cls_token : nn.Parameter - Class token parameter. - embedding_norm : nn.Module, optional - Layer normalization applied after embedding if specified. + Normalization layer for the processed features. + embedding_layer : EmbeddingLayer + Layer for embedding categorical and numerical features. + tabular_head : MLPhead + MLPhead layer to produce the final prediction based on the output of the Mamba attention layer. + perm : torch.Tensor, optional + Permutation tensor used for shuffling embeddings, if enabled. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding, Mamba attention transformation, pooling, + and prediction steps. + """ def __init__( @@ -70,62 +63,36 @@ def __init__( super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.shuffle_embeddings = self.hparams.get( "shuffle_embeddings", config.shuffle_embeddings ) - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info self.mamba = MambAttn(config) - norm_layer = self.hparams.get("norm", config.norm) self.norm_f = get_normalization_layer(config) + # embedding layer self.embedding_layer = EmbeddingLayer( num_feature_info=num_feature_info, cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding", config.layer_norm_after_embedding - ), - use_cls=False, - cls_position=-1, - cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), + config=config, ) head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLP( - self.hparams.get("d_model", config.d_model), - hidden_units_list=self.hparams.get( - "head_layer_sizes", config.head_layer_sizes - ), - dropout_rate=self.hparams.get("head_dropout", config.head_dropout), - use_skip_layers=self.hparams.get( - "head_skip_layers", config.head_skip_layers - ), - activation_fn=head_activation, - use_batch_norm=self.hparams.get( - "head_use_batch_norm", 
config.head_use_batch_norm - ), - n_output_units=num_classes, + self.tabular_head = MLPhead( + input_dim=self.hparams.get("d_model", config.d_model), + config=config, + output_dim=num_classes, ) - if self.pooling_method == "cls": - self.use_cls = True - else: - self.use_cls = self.hparams.get("use_cls", config.use_cls) - if self.shuffle_embeddings: self.perm = torch.randperm(self.embedding_layer.seq_len) + # pooling + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + def forward(self, num_features, cat_features): """ Defines the forward pass of the model. @@ -149,18 +116,7 @@ def forward(self, num_features, cat_features): x = self.mamba(x) - if self.pooling_method == "avg": - x = torch.mean(x, dim=1) - elif self.pooling_method == "max": - x, _ = torch.max(x, dim=1) - elif self.pooling_method == "sum": - x = torch.sum(x, dim=1) - elif self.pooling_method == "cls_token": - x = x[:, -1] - elif self.pooling_method == "last": - x = x[:, -1] - else: - raise ValueError(f"Invalid pooling method: {self.pooling_method}") + x = self.pool_sequence(x) x = self.norm_f(x) preds = self.tabular_head(x) diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index 246ef1f..04a6e2b 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -1,6 +1,6 @@ import torch from ..arch_utils.mamba_utils.mamba_arch import Mamba -from ..arch_utils.mlp_utils import MLP +from ..arch_utils.mlp_utils import MLPhead from ..configs.mambular_config import DefaultMambularConfig from .basemodel import BaseModel from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer @@ -10,53 +10,46 @@ class Mambular(BaseModel): """ - A PyTorch model for tasks utilizing the Mamba architecture and various normalization techniques. + A Mambular model for tabular data, integrating feature embeddings, Mamba transformations, and a configurable architecture + for processing categorical and numerical features with pooling and normalization. Parameters ---------- cat_feature_info : dict - Dictionary containing information about categorical features. + Dictionary containing information about categorical features, including their names and dimensions. num_feature_info : dict - Dictionary containing information about numerical features. + Dictionary containing information about numerical features, including their names and dimensions. num_classes : int, optional - Number of output classes (default is 1). + The number of output classes or target dimensions for regression, by default 1. config : DefaultMambularConfig, optional - Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + Configuration object with model hyperparameters such as dropout rates, head layer sizes, Mamba version, and + other architectural configurations, by default DefaultMambularConfig(). **kwargs : dict - Additional keyword arguments. + Additional keyword arguments for the BaseModel class. Attributes ---------- - lr : float - Learning rate. - lr_patience : int - Patience for learning rate scheduler. - weight_decay : float - Weight decay for optimizer. - lr_factor : float - Factor by which the learning rate will be reduced. pooling_method : str - Method to pool the features. - cat_feature_info : dict - Dictionary containing information about categorical features. - num_feature_info : dict - Dictionary containing information about numerical features. 
- embedding_activation : callable - Activation function for embeddings. - mamba : Mamba - Mamba architecture component. + Pooling method to aggregate features after the Mamba layer. + shuffle_embeddings : bool + Flag indicating if embeddings should be shuffled, as specified in the configuration. + embedding_layer : EmbeddingLayer + Layer for embedding categorical and numerical features. + mamba : Mamba or MambaOriginal + Mamba-based transformation layer based on the version specified in config. norm_f : nn.Module - Normalization layer. - num_embeddings : nn.ModuleList - Module list for numerical feature embeddings. - cat_embeddings : nn.ModuleList - Module list for categorical feature embeddings. + Normalization layer for the processed features. tabular_head : MLP - Multi-layer perceptron head for tabular data. - cls_token : nn.Parameter - Class token parameter. - embedding_norm : nn.Module, optional - Layer normalization applied after embedding if specified. + MLP layer to produce the final prediction based on the output of the Mamba layer. + perm : torch.Tensor, optional + Permutation tensor used for shuffling embeddings, if enabled. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding, Mamba transformation, pooling, + and prediction steps. + """ def __init__( @@ -70,16 +63,17 @@ def __init__( super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.shuffle_embeddings = self.hparams.get( "shuffle_embeddings", config.shuffle_embeddings ) - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info + + # embedding layer + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) if config.mamba_version == "mamba-torch": self.mamba = Mamba(config) @@ -87,47 +81,19 @@ def __init__( self.mamba = MambaOriginal(config) self.norm_f = get_normalization_layer(config) - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding", config.layer_norm_after_embedding - ), - use_cls=False, - cls_position=-1, - cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), - ) - - head_activation = self.hparams.get("head_activation", config.head_activation) - - self.tabular_head = MLP( - self.hparams.get("d_model", config.d_model), - hidden_units_list=self.hparams.get( - "head_layer_sizes", config.head_layer_sizes - ), - dropout_rate=self.hparams.get("head_dropout", config.head_dropout), - use_skip_layers=self.hparams.get( - "head_skip_layers", config.head_skip_layers - ), - activation_fn=head_activation, - use_batch_norm=self.hparams.get( - "head_use_batch_norm", config.head_use_batch_norm - ), - n_output_units=num_classes, + self.tabular_head = MLPhead( + input_dim=self.hparams.get("d_model", config.d_model), + config=config, + output_dim=num_classes, ) - if 
self.pooling_method == "cls": - self.use_cls = True - else: - self.use_cls = self.hparams.get("use_cls", config.use_cls) - if self.shuffle_embeddings: self.perm = torch.randperm(self.embedding_layer.seq_len) + # pooling + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + def forward(self, num_features, cat_features): """ Defines the forward pass of the model. @@ -151,18 +117,7 @@ def forward(self, num_features, cat_features): x = self.mamba(x) - if self.pooling_method == "avg": - x = torch.mean(x, dim=1) - elif self.pooling_method == "max": - x, _ = torch.max(x, dim=1) - elif self.pooling_method == "sum": - x = torch.sum(x, dim=1) - elif self.pooling_method == "cls": - x = x[:, -1] - elif self.pooling_method == "last": - x = x[:, -1] - else: - raise ValueError(f"Invalid pooling method: {self.pooling_method}") + x = self.pool_sequence(x) x = self.norm_f(x) preds = self.tabular_head(x) diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index 4aebee5..ec64d3d 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -7,6 +7,55 @@ class MLP(BaseModel): + """ + A multi-layer perceptron (MLP) model for tabular data processing, with options for embedding, normalization, + skip connections, and customizable activation functions. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. + config : DefaultMLPConfig, optional + Configuration object with model hyperparameters such as layer sizes, dropout rates, activation functions, + embedding settings, and normalization options, by default DefaultMLPConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + layer_sizes : list of int + List specifying the number of units in each layer of the MLP. + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + layers : nn.ModuleList + List containing the layers of the MLP, including linear layers, normalization layers, and activations. + skip_connections : bool + Flag indicating whether skip connections are enabled between layers. + use_glu : bool + Flag indicating if gated linear units (GLU) should be used as the activation function. + activation : callable + Activation function applied between layers. + use_embeddings : bool + Flag indicating if embeddings should be used for categorical and numerical features. + embedding_layer : EmbeddingLayer, optional + Embedding layer for features, used if `use_embeddings` is enabled. + norm_f : nn.Module, optional + Normalization layer applied to the output of the first layer, if specified in the configuration. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding (if enabled), linear transformations, + activation, normalization, and prediction steps. + + """ + def __init__( self, cat_feature_info, @@ -15,28 +64,9 @@ def __init__( config: DefaultMLPConfig = DefaultMLPConfig(), **kwargs, ): - """ - Initializes the MLP model with the given configuration. 
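# Note: pool_sequence and initialize_pooling_layers are not defined in this patch (they are
# presumably provided by BaseModel). A minimal sketch, mirroring the pooling branches
# removed above (avg, max, sum, cls/cls_token at the last position, last):
import torch

def pool_sequence_sketch(x: torch.Tensor, method: str = "avg") -> torch.Tensor:
    # x has shape (batch_size, sequence_length, d_model)
    if method == "avg":
        return x.mean(dim=1)
    if method == "max":
        return x.max(dim=1).values
    if method == "sum":
        return x.sum(dim=1)
    if method in ("cls", "cls_token", "last"):
        return x[:, -1]
    raise ValueError(f"Invalid pooling method: {method}")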
- - Parameters - ---------- - cat_feature_info : Any - Information about categorical features. - num_feature_info : Any - Information about numerical features. - - num_classes : int, optional - Number of output classes, by default 1. - config : DefaultMLPConfig, optional - Configuration dataclass containing hyperparameters, by default DefaultMLPConfig(). - """ super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.layer_sizes = self.hparams.get("layer_sizes", self.layer_sizes) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -57,6 +87,7 @@ def __init__( input_dim += 1 if self.use_embeddings: + self.embedding_layer = EmbeddingLayer(config) input_dim = ( len(num_feature_info) * config.d_model + len(cat_feature_info) * config.d_model @@ -96,20 +127,6 @@ def __init__( # Output layer self.layers.append(nn.Linear(self.layer_sizes[-1], num_classes)) - if self.use_embeddings: - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding" - ), - use_cls=False, - ) - def forward(self, num_features, cat_features) -> torch.Tensor: """ Forward pass of the MLP model. diff --git a/mambular/base_models/ndtf.py b/mambular/base_models/ndtf.py index fca2165..44e3498 100644 --- a/mambular/base_models/ndtf.py +++ b/mambular/base_models/ndtf.py @@ -7,6 +7,50 @@ class NDTF(BaseModel): + """ + A Neural Decision Tree Forest (NDTF) model for tabular data, composed of an ensemble of neural decision trees + with convolutional feature interactions, capable of producing predictions and penalty-based regularization. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. + config : DefaultNDTFConfig, optional + Configuration object containing model hyperparameters such as the number of ensembles, tree depth, penalty factor, + sampling settings, and temperature, by default DefaultNDTFConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + penalty_factor : float + Scaling factor for the penalty applied during training, specified in the config. + input_dimensions : list of int + List of input dimensions for each tree in the ensemble, with random sampling. + trees : nn.ModuleList + List of neural decision trees used in the ensemble. + conv_layer : nn.Conv1d + Convolutional layer for feature interactions before passing inputs to trees. + tree_weights : nn.Parameter + Learnable parameter to weight each tree's output in the ensemble. 
+ + Methods + ------- + forward(num_features, cat_features) -> torch.Tensor + Perform a forward pass through the model, producing predictions based on an ensemble of neural decision trees. + penalty_forward(num_features, cat_features) -> tuple of torch.Tensor + Perform a forward pass with penalty regularization, returning predictions and the calculated penalty term. + + """ + def __init__( self, cat_feature_info, @@ -15,28 +59,9 @@ def __init__( config: DefaultNDTFConfig = DefaultNDTFConfig(), **kwargs, ): - """ - Initializes the NDTF model with the given configuration. - - Parameters - ---------- - cat_feature_info : Any - Information about categorical features. - num_feature_info : Any - Information about numerical features. - - num_classes : int, optional - Number of output classes, by default 1. - config : DefaultNDTFConfig, optional - Configuration dataclass containing hyperparameters, by default DefaultNDTFConfig(). - """ super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info self.penalty_factor = config.penalty_factor diff --git a/mambular/base_models/node.py b/mambular/base_models/node.py index e61a817..e49d7e0 100644 --- a/mambular/base_models/node.py +++ b/mambular/base_models/node.py @@ -3,59 +3,51 @@ import torch from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.node_utils import DenseBlock -from ..arch_utils.mlp_utils import MLP +from ..arch_utils.mlp_utils import MLPhead class NODE(BaseModel): """ - Neural Oblivious Decision Ensemble (NODE) Model. Slightly different with a MLP as a tabular task specific head. - - NODE is a neural decision tree model that processes both categorical and numerical features. - This class combines embedding layers, a dense decision tree block, and an MLP head for tabular - data prediction tasks. + A Neural Oblivious Decision Ensemble (NODE) model for tabular data, integrating feature embeddings, dense blocks, + and customizable heads for predictions. Parameters ---------- cat_feature_info : dict - Dictionary mapping categorical feature names to their input shapes. + Dictionary containing information about categorical features, including their names and dimensions. num_feature_info : dict - Dictionary mapping numerical feature names to their input shapes. + Dictionary containing information about numerical features, including their names and dimensions. num_classes : int, optional - Number of output classes. Default is 1. + The number of output classes or target dimensions for regression, by default 1. config : DefaultNODEConfig, optional - Configuration object that holds model hyperparameters. Default is `DefaultNODEConfig`. + Configuration object containing model hyperparameters such as the number of dense layers, layer dimensions, + tree depth, embedding settings, and head layer configurations, by default DefaultNODEConfig(). **kwargs : dict - Additional arguments for the base model. + Additional keyword arguments for the BaseModel class. Attributes ---------- - lr : float - Learning rate for the optimizer. - lr_patience : int - Number of epochs without improvement before reducing the learning rate. 
- weight_decay : float - Weight decay factor for regularization. - lr_factor : float - Factor by which to reduce the learning rate. cat_feature_info : dict - Information about categorical features. + Stores categorical feature information. num_feature_info : dict - Information about numerical features. + Stores numerical feature information. use_embeddings : bool - Whether to use embeddings for categorical and numerical features. + Flag indicating if embeddings should be used for categorical and numerical features. embedding_layer : EmbeddingLayer, optional - Embedding layer for feature transformation. + Embedding layer for features, used if `use_embeddings` is enabled. d_out : int - Output dimensionality. + The output dimension, usually set to `num_classes`. block : DenseBlock - DenseBlock layer that implements the decision tree ensemble. - tabular_head : MLP - MLP layer that serves as the output head of the model. + Dense block layer for feature transformations based on the NODE approach. + tabular_head : MLPhead + MLPhead layer to produce the final prediction based on the output of the dense block. Methods ------- forward(num_features, cat_features) - Performs the forward pass, processing numerical and categorical features to produce predictions. + Perform a forward pass through the model, including embedding (if enabled), dense transformations, + and prediction steps. + """ def __init__( @@ -69,10 +61,6 @@ def __init__( super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info self.use_embeddings = self.hparams.get("use_embeddings", config.use_embeddings) @@ -89,18 +77,7 @@ def __init__( + len(cat_feature_info) * config.d_model ) - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding" - ), - use_cls=False, - ) + self.embedding_layer = EmbeddingLayer(config) self.d_out = num_classes self.block = DenseBlock( @@ -114,20 +91,10 @@ def __init__( head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLP( - config.num_layers * config.layer_dim, - hidden_units_list=self.hparams.get( - "head_layer_sizes", config.head_layer_sizes - ), - dropout_rate=self.hparams.get("head_dropout", config.head_dropout), - use_skip_layers=self.hparams.get( - "head_skip_layers", config.head_skip_layers - ), - activation_fn=head_activation, - use_batch_norm=self.hparams.get( - "head_use_batch_norm", config.head_use_batch_norm - ), - n_output_units=num_classes, + self.tabular_head = MLPhead( + input_dim=config.num_layers * config.layer_dim, + config=config, + output_dim=num_classes, ) def forward(self, num_features, cat_features): diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index ec62bc2..c08ab6f 100644 --- a/mambular/base_models/resnet.py +++ b/mambular/base_models/resnet.py @@ -3,19 +3,61 @@ from typing import Any from ..configs.resnet_config import DefaultResNetConfig 
from .basemodel import BaseModel -from ..arch_utils.layer_utils.normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) +from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.resnet_utils import ResidualBlock from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer class ResNet(BaseModel): + """ + A ResNet model for tabular data, combining feature embeddings, residual blocks, and customizable architecture + for processing categorical and numerical features. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. + config : DefaultResNetConfig, optional + Configuration object containing model hyperparameters such as layer sizes, number of residual blocks, + dropout rates, activation functions, and normalization settings, by default DefaultResNetConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + layer_sizes : list of int + List specifying the number of units in each layer of the ResNet. + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + activation : callable + Activation function used in the residual blocks. + use_embeddings : bool + Flag indicating if embeddings should be used for categorical and numerical features. + embedding_layer : EmbeddingLayer, optional + Embedding layer for features, used if `use_embeddings` is enabled. + initial_layer : nn.Linear + Initial linear layer to project input features into the model's hidden dimension. + blocks : nn.ModuleList + List of residual blocks to process the hidden representations. + output_layer : nn.Linear + Output layer that produces the final prediction. + norm_f : nn.Module, optional + Normalization layer applied in each residual block, if specified in the configuration. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding (if enabled), residual blocks, + and prediction steps. + + """ + def __init__( self, cat_feature_info, @@ -24,27 +66,9 @@ def __init__( config: DefaultResNetConfig = DefaultResNetConfig(), **kwargs, ): - """ - ResNet model for structured data. - - Parameters - ---------- - cat_feature_info : Any - Information about categorical features. - num_feature_info : Any - Information about numerical features. - num_classes : int, optional - Number of output classes, by default 1. - config : DefaultResNetConfig, optional - Configuration dataclass containing hyperparameters, by default DefaultResNetConfig(). 
- """ super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.layer_sizes = self.hparams.get("layer_sizes", self.layer_sizes) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -62,22 +86,12 @@ def __init__( len(num_feature_info) * config.d_model + len(cat_feature_info) * config.d_model ) - - norm_layer = self.hparams.get("norm", config.norm) - if norm_layer == "RMSNorm": - self.norm_f = RMSNorm - elif norm_layer == "LayerNorm": - self.norm_f = LayerNorm - elif norm_layer == "BatchNorm": - self.norm_f = BatchNorm - elif norm_layer == "InstanceNorm": - self.norm_f = InstanceNorm - elif norm_layer == "GroupNorm": - self.norm_f = GroupNorm - elif norm_layer == "LearnableLayerScaling": - self.norm_f = LearnableLayerScaling - else: - self.norm_f = None + # embedding layer + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) self.initial_layer = nn.Linear(input_dim, self.layer_sizes[0]) @@ -100,36 +114,9 @@ def __init__( self.output_layer = nn.Linear(self.layer_sizes[-1], num_classes) - if self.use_embeddings: - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding" - ), - use_cls=False, - ) + self.norm_f = get_normalization_layer(config) def forward(self, num_features, cat_features): - """ - Forward pass of the ResNet model. - - Parameters - ---------- - num_features : torch.Tensor - Tensor of numerical features. - cat_features : torch.Tensor, optional - Tensor of categorical features. - - Returns - ------- - torch.Tensor - Output tensor. - """ if self.use_embeddings: x = self.embedding_layer(num_features, cat_features) B, S, D = x.shape diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index 48fd8ed..acb71b7 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from ..arch_utils.mlp_utils import MLP +from ..arch_utils.mlp_utils import MLPhead from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..configs.tabtransformer_config import DefaultTabTransformerConfig @@ -52,7 +52,7 @@ class TabTransformer(BaseModel): Module list for numerical feature embeddings. cat_embeddings : nn.ModuleList Module list for categorical feature embeddings. - tabular_head : MLP + tabular_head : MLPhead Multi-layer perceptron head for tabular data. cls_token : nn.Parameter Class token parameter. @@ -75,56 +75,24 @@ def __init__( "You are trying to fit a TabTransformer with no categorical features. Try using a different model that is better suited for tasks without categorical features." 
) - layer_norm_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - layer_norm_dim += input_shape - - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - encoder_layer = CustomTransformerEncoderLayer( - d_model=self.hparams.get("d_model", config.d_model), - nhead=self.hparams.get("n_heads", config.n_heads), - batch_first=True, - dim_feedforward=self.hparams.get( - "transformer_dim_feedforward", config.transformer_dim_feedforward - ), - dropout=self.hparams.get("attn_dropout", config.attn_dropout), - activation=self.hparams.get( - "transformer_activation", config.transformer_activation - ), - layer_norm_eps=self.hparams.get("layer_norm_eps", config.layer_norm_eps), - norm_first=self.hparams.get("norm_first", config.norm_first), - bias=self.hparams.get("bias", config.bias), + # embedding layer + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, ) + # transformer encoder self.norm_f = get_normalization_layer(config) - - self.norm_embedding = LayerNorm(self.hparams.get("d_model", config.d_model)) + encoder_layer = CustomTransformerEncoderLayer(config=config) self.encoder = nn.TransformerEncoder( encoder_layer, num_layers=self.hparams.get("n_layers", config.n_layers), - norm=self.norm_embedding, - ) - - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding", config.layer_norm_after_embedding - ), - use_cls=True, - cls_position=0, - cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), + norm=self.norm_f, ) head_activation = self.hparams.get("head_activation", config.head_activation) @@ -133,26 +101,21 @@ def __init__( for feature_name, input_shape in num_feature_info.items(): mlp_input_dim += input_shape mlp_input_dim += config.d_model - self.tabular_head = MLP( - self.hparams.get("d_model", mlp_input_dim), - hidden_units_list=self.hparams.get( - "head_layer_sizes", config.head_layer_sizes - ), - dropout_rate=self.hparams.get("head_dropout", config.head_dropout), - use_skip_layers=self.hparams.get( - "head_skip_layers", config.head_skip_layers - ), - activation_fn=head_activation, - use_batch_norm=self.hparams.get( - "head_use_batch_norm", config.head_use_batch_norm - ), - n_output_units=num_classes, + + self.tabular_head = MLPhead( + input_dim=mlp_input_dim, + config=config, + output_dim=num_classes, ) self.cls_token = nn.Parameter( torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) ) + # pooling + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + def forward(self, num_features, cat_features): """ Defines the forward pass of the model. 
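# Note: get_normalization_layer (mambular.arch_utils.get_norm_fn) is used here and in the
# ResNet refactor above, but its implementation is not shown in this patch. A rough sketch
# of what such a factory is assumed to do, based on the string-to-class branches it replaces;
# RMSNorm, InstanceNorm, GroupNorm and LearnableLayerScaling are the project's own classes in
# arch_utils.layer_utils.normalization_layers and are only hinted at here:
import torch.nn as nn

def get_normalization_layer_sketch(config):
    norm = getattr(config, "norm", None)
    if norm == "LayerNorm":
        return nn.LayerNorm(config.d_model)
    if norm == "BatchNorm":
        return nn.BatchNorm1d(config.d_model)
    # "RMSNorm", "InstanceNorm", "GroupNorm", "LearnableLayerScaling" would resolve to the
    # corresponding custom implementations; an unknown or None setting falls through.
    return None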
@@ -176,16 +139,7 @@ def forward(self, num_features, cat_features): x = self.encoder(cat_embeddings) - if self.pooling_method == "avg": - x = torch.mean(x, dim=1) - elif self.pooling_method == "max": - x, _ = torch.max(x, dim=1) - elif self.pooling_method == "sum": - x = torch.sum(x, dim=1) - elif self.pooling_method == "cls": - x = x[:, 0] - else: - raise ValueError(f"Invalid pooling method: {self.pooling_method}") + x = self.pool_sequence(x) x = torch.cat((x, num_embeddings), axis=1) preds = self.tabular_head(x) diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index b15c580..56b3b1c 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from ..arch_utils.mlp_utils import MLP +from ..arch_utils.mlp_utils import MLPhead from ..configs.tabularnn_config import DefaultTabulaRNNConfig from .basemodel import BaseModel from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer @@ -29,51 +29,20 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.rnn = ConvRNN( - model_type=self.hparams.get("model_type", config.model_type), - input_size=self.hparams.get("d_model", config.d_model), - hidden_size=self.hparams.get("dim_feedforward", config.dim_feedforward), - num_layers=self.hparams.get("n_layers", config.n_layers), - bidirectional=self.hparams.get("bidirectional", config.bidirectional), - rnn_dropout=self.hparams.get("rnn_dropout", config.rnn_dropout), - bias=self.hparams.get("bias", config.bias), - conv_bias=self.hparams.get("conv_bias", config.conv_bias), - rnn_activation=self.hparams.get("rnn_activation", config.rnn_activation), - d_conv=self.hparams.get("d_conv", config.d_conv), - residuals=self.hparams.get("residuals", config.residuals), - ) + self.rnn = ConvRNN(config) self.embedding_layer = EmbeddingLayer( num_feature_info=num_feature_info, cat_feature_info=cat_feature_info, - d_model=self.hparams.get("d_model", config.d_model), - embedding_activation=self.hparams.get( - "embedding_activation", config.embedding_activation - ), - layer_norm_after_embedding=self.hparams.get( - "layer_norm_after_embedding", config.layer_norm_after_embedding - ), - use_cls=False, - cls_position=-1, - cat_encoding=self.hparams.get("cat_encoding", config.cat_encoding), + config=config, ) head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLP( - self.hparams.get("dim_feedforward", config.dim_feedforward), - hidden_units_list=self.hparams.get( - "head_layer_sizes", config.head_layer_sizes - ), - dropout_rate=self.hparams.get("head_dropout", config.head_dropout), - use_skip_layers=self.hparams.get( - "head_skip_layers", config.head_skip_layers - ), - activation_fn=head_activation, - use_batch_norm=self.hparams.get( - "head_use_batch_norm", config.head_use_batch_norm - ), - n_output_units=num_classes, + self.tabular_head = MLPhead( + input_dim=self.hparams.get("dim_feedforward", config.dim_feedforward), + config=config, + output_dim=num_classes, ) self.linear = nn.Linear( @@ -84,6 +53,10 @@ def __init__( temp_config = replace(config, d_model=config.dim_feedforward) self.norm_f = get_normalization_layer(temp_config) + # pooling + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + def forward(self, num_features, cat_features): """ Defines the forward pass of the model. 
@@ -106,16 +79,7 @@ def forward(self, num_features, cat_features): out, _ = self.rnn(x) z = self.linear(torch.mean(x, dim=1)) - if self.pooling_method == "avg": - x = torch.mean(out, dim=1) - elif self.pooling_method == "max": - x, _ = torch.max(out, dim=1) - elif self.pooling_method == "sum": - x = torch.sum(out, dim=1) - elif self.pooling_method == "last": - x = x[:, -1, :] - else: - raise ValueError(f"Invalid pooling method: {self.pooling_method}") + x = self.pool_sequence(out) x = x + z if self.norm_f is not None: x = self.norm_f(x) From d80d16d74214ca37668eb8de1094d3d82cbce0f9 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:52:12 +0000 Subject: [PATCH 069/132] include TabM as introduce in paper --- mambular/base_models/tabm.py | 232 +++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 mambular/base_models/tabm.py diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py new file mode 100644 index 0000000..1c86a5e --- /dev/null +++ b/mambular/base_models/tabm.py @@ -0,0 +1,232 @@ +import torch +import torch.nn as nn +from ..configs.tabm_config import DefaultTabMConfig +from .basemodel import BaseModel +from ..arch_utils.get_norm_fn import get_normalization_layer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.layer_utils.batch_ensemble_layer import LinearBatchEnsembleLayer + + +class TabM(BaseModel): + """ + A TabM model for tabular data, integrating feature embeddings, batch ensemble layers, and configurable + architecture for processing categorical and numerical features with options for skip connections, GLU activation, + and dropout. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. + config : DefaultTabMConfig, optional + Configuration object containing model hyperparameters such as layer sizes, dropout rates, batch ensemble + settings, activation functions, and normalization settings, by default DefaultTabMConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + layer_sizes : list of int + List specifying the number of units in each layer of the TabM model. + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + config : DefaultTabMConfig + Stores the configuration for the TabM model. + layers : nn.ModuleList + List containing the layers of the TabM model, including LinearBatchEnsembleLayer, normalization, activation, + and dropout layers. + skip_connections : bool + Flag indicating whether skip connections are enabled between layers. + use_glu : bool + Flag indicating if gated linear units (GLU) should be used as the activation function. + activation : callable + Activation function applied between layers. + use_embeddings : bool + Flag indicating if embeddings should be used for categorical and numerical features. + embedding_layer : EmbeddingLayer, optional + Embedding layer for features, used if `use_embeddings` is enabled. + norm_f : nn.Module, optional + Normalization layer applied in each batch ensemble layer, if specified in the configuration. 
+ final_layer : nn.Linear, optional + Final linear layer applied when ensemble outputs are not averaged. + + Methods + ------- + forward(num_features, cat_features) -> torch.Tensor + Perform a forward pass through the model, including embedding (if enabled), batch ensemble layers, + optional skip connections, and prediction steps. + + """ + + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes: int = 1, + config: DefaultTabMConfig = DefaultTabMConfig(), + **kwargs, + ): + super().__init__(**kwargs) + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + self.layer_sizes = self.hparams.get("layer_sizes", config.layer_sizes) + self.cat_feature_info = cat_feature_info + self.num_feature_info = num_feature_info + self.config = config + + # Initialize layers + self.layers = nn.ModuleList() + self.skip_connections = self.hparams.get( + "skip_connections", config.skip_connections + ) + self.use_glu = self.hparams.get("use_glu", config.use_glu) + self.activation = self.hparams.get("activation", config.activation) + self.use_embeddings = self.hparams.get("use_embeddings", config.use_embeddings) + + # Embedding layer + if self.use_embeddings: + self.embedding_layer = EmbeddingLayer(config) + if self.hparams.get("average_embeddings", config.average_embeddings): + input_dim = self.hparams.get("d_model", config.d_model) + else: + input_dim = ( + len(num_feature_info) + len(cat_feature_info) + ) * config.d_model + print(input_dim) + + else: + # Calculate input dimension + input_dim = sum(input_shape for input_shape in num_feature_info.values()) + input_dim += len(cat_feature_info) + + # Input layer with batch ensembling + self.layers.append( + LinearBatchEnsembleLayer( + in_features=input_dim, + out_features=self.layer_sizes[0], + ensemble_size=config.ensemble_size, + ensemble_scaling_in=config.ensemble_scaling_in, + ensemble_scaling_out=config.ensemble_scaling_out, + ensemble_bias=config.ensemble_bias, + scaling_init=config.scaling_init, + ) + ) + if config.batch_norm: + self.layers.append(nn.BatchNorm1d(self.layer_sizes[0])) + + self.norm_f = get_normalization_layer(config) + if self.norm_f is not None: + self.layers.append(self.norm_f(self.layer_sizes[0])) + + # Optional activation and dropout + if config.use_glu: + self.layers.append(nn.GLU()) + else: + self.layers.append(self.activation) + if config.dropout > 0.0: + self.layers.append(nn.Dropout(config.dropout)) + + # Hidden layers with batch ensembling + for i in range(1, len(self.layer_sizes)): + self.layers.append( + LinearBatchEnsembleLayer( + in_features=self.layer_sizes[i - 1], + out_features=self.layer_sizes[i], + ensemble_size=config.ensemble_size, + ensemble_scaling_in=config.ensemble_scaling_in, + ensemble_scaling_out=config.ensemble_scaling_out, + ensemble_bias=config.ensemble_bias, + scaling_init=config.scaling_init, + ) + ) + if config.batch_norm: + self.layers.append(nn.BatchNorm1d(self.layer_sizes[i])) + if config.layer_norm: + self.layers.append(nn.LayerNorm(self.layer_sizes[i])) + + if config.use_glu: + self.layers.append(nn.GLU()) + else: + self.layers.append(self.activation) + if config.dropout > 0.0: + self.layers.append(nn.Dropout(config.dropout)) + + # Output layer + self.layers.append( + LinearBatchEnsembleLayer( + in_features=self.layer_sizes[-1], + out_features=num_classes, + ensemble_size=config.ensemble_size, + ensemble_scaling_in=config.ensemble_scaling_in, + ensemble_scaling_out=config.ensemble_scaling_out, + ensemble_bias=config.ensemble_bias, + 
scaling_init=config.scaling_init, + ) + ) + + if not self.hparams.get("average_ensembles", True): + self.final_layer = nn.Linear( + self.layer_sizes[-1] * config.ensemble_size, num_classes + ) + + def forward(self, num_features, cat_features) -> torch.Tensor: + """ + Forward pass of the TabM model with batch ensembling. + + Parameters + ---------- + num_features : torch.Tensor + Numerical features tensor. + cat_features : torch.Tensor + Categorical features tensor. + + Returns + ------- + torch.Tensor + Output tensor. + """ + # Handle embeddings if used + if self.use_embeddings: + x = self.embedding_layer(num_features, cat_features) + # Option 1: Average over feature dimension (N) + if self.hparams.get("average_embeddings", self.config.average_embeddings): + x = x.mean(dim=1) # Shape: (B, D) + # Option 2: Flatten feature and embedding dimensions + else: + B, N, D = x.shape + x = x.reshape(B, N * D) # Shape: (B, N * D) + + else: + x = num_features + cat_features + x = torch.cat(x, dim=1) + + # Process through layers with optional skip connections + for i in range(len(self.layers) - 1): + if isinstance(self.layers[i], LinearBatchEnsembleLayer): + out = self.layers[i](x) + # `out` shape is expected to be (batch_size, ensemble_size, out_features) + if self.skip_connections and x.shape == out.shape: + x = x + out + else: + x = out + else: + x = self.layers[i](x) + + # Final ensemble output from the last ConfigurableBatchEnsembleLayer + x = self.layers[-1](x) # Shape (batch_size, ensemble_size, num_classes) + + # Option 1: Averaging across ensemble outputs + if self.hparams.get("average_ensembles", True): + x = x.mean(dim=1) # Shape (batch_size, num_classes) + + # Option 2: Adding a final layer to map to `num_classes` + else: + x = x.view(x.size(0), -1) # Flatten ensemble dimension if not averaging + x = self.final_layer(x) # Shape (batch_size, num_classes) + + return x From a57016688b91b22e428959f50cfd6e4b17559a48 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:52:27 +0000 Subject: [PATCH 070/132] batch Ensemble RNN -> todo bidirectional --- mambular/base_models/batch_tabrnn.py | 148 +++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 mambular/base_models/batch_tabrnn.py diff --git a/mambular/base_models/batch_tabrnn.py b/mambular/base_models/batch_tabrnn.py new file mode 100644 index 0000000..ad49ecb --- /dev/null +++ b/mambular/base_models/batch_tabrnn.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn +from ..arch_utils.mlp_utils import MLPhead +from ..configs.batchtabrnn_config import DefaultBatchTabRNNConfig +from .basemodel import BaseModel +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.rnn_utils import EnsembleConvRNN +from ..arch_utils.get_norm_fn import get_normalization_layer +from dataclasses import replace + + +class BatchTabRNN(BaseModel): + """ + A batch ensemble model combining RNN and tabular data handling for multivariate time series or sequential tabular data. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. 
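# Toy illustration of the two ensemble-aggregation paths in TabM.forward above, assuming the
# last LinearBatchEnsembleLayer emits (batch_size, ensemble_size, num_classes); only dummy
# tensors are used, no project code.
import torch
import torch.nn as nn

B, E, C = 8, 32, 1                        # batch size, ensemble_size, num_classes
out = torch.randn(B, E, C)                # output of the final batch-ensemble layer
avg = out.mean(dim=1)                     # average_ensembles=True  -> (B, C)
flat = out.view(B, -1)                    # average_ensembles=False -> (B, E * C)
head = nn.Linear(E * C, C)                # a flattening head must accept E * C inputs
assert avg.shape == (B, C) and head(flat).shape == (B, C)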
+ config : DefaultBatchTabRNNConfig, optional + Configuration object containing model hyperparameters such as dropout rates, hidden layer sizes, ensemble settings, + and other architectural configurations, by default DefaultBatchTabRNNConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + pooling_method : str + The pooling method to aggregate sequence or ensemble features, specified in config. + ensemble_first : bool + Flag indicating if ensembles should be processed before pooling over the sequence. + embedding_layer : EmbeddingLayer + Layer for embedding categorical and numerical features. + rnn : EnsembleConvRNN + Ensemble RNN layer for processing sequential data. + tabular_head : MLPhead + MLPhead layer to produce the final prediction based on the output of the RNN and pooling layers. + linear : nn.Linear + Linear transformation layer for projecting features into a different dimension. + norm_f : nn.Module + Normalization layer. + ensemble_linear : nn.Linear, optional + Linear layer to learn a weighted combination of ensemble outputs, if configured. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding, RNN, pooling, and prediction steps. + + """ + + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes=1, + config: DefaultBatchTabRNNConfig = DefaultBatchTabRNNConfig(), + **kwargs, + ): + super().__init__(**kwargs) + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + self.cat_feature_info = cat_feature_info + self.num_feature_info = num_feature_info + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) + self.ensemble_first = self.hparams.get("ensemble_first", config.ensemble_first) + + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) + self.rnn = EnsembleConvRNN(config=config) + + head_activation = self.hparams.get("head_activation", config.head_activation) + + self.tabular_head = MLPhead( + input_dim=self.hparams.get("dim_feedforward", config.dim_feedforward), + config=config, + output_dim=num_classes, + ) + + self.linear = nn.Linear( + self.hparams.get("d_model", config.d_model), + self.hparams.get("dim_feedforward", config.dim_feedforward), + ) + + temp_config = replace(config, d_model=config.dim_feedforward) + self.norm_f = get_normalization_layer(temp_config) + + if not self.hparams.get("average_ensembles", True): + self.ensemble_linear = nn.Linear(config.ensemble_size, 1) + + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + + def forward(self, num_features, cat_features): + x = self.embedding_layer(num_features, cat_features) + + # RNN forward pass + out, _ = self.rnn( + x + ) # Shape: (batch_size, sequence_length, ensemble_size, hidden_size) + + if self.ensemble_first: + # Combine or average over ensembles first, then pool over the sequence + if self.hparams.get("average_ensembles", True): + # Simple average over ensembles + out = out.mean( + dim=2 + ) # Shape: (batch_size, sequence_length, hidden_size) + + else: + # Apply the learned linear combination over ensembles + out = self.ensemble_linear(out.permute(0, 1, 3, 2)).squeeze( + -1 + ) # Shape: (batch_size, sequence_length, 
hidden_size) + + # Now pool over the sequence + out = self.pool_sequence(out) # Shape: (batch_size, hidden_size) + + else: + # Pool over the sequence first, then combine or average over ensembles + out = self.pool_sequence( + out + ) # Shape: (batch_size, ensemble_size, hidden_size) + + if self.hparams.get("average_ensembles", True): + # Simple average over ensembles + out = out.mean(dim=1) # Shape: (batch_size, hidden_size) + + else: + # Apply the learned linear combination over ensembles + out = self.ensemble_linear(out.permute(0, 2, 1)).squeeze( + -1 + ) # Shape: (batch_size, hidden_size) + + # Final prediction head + preds = self.tabular_head(out) + return preds From 1ba10646787cffe27b3fef32205bce3cbfb92a80 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:53:17 +0000 Subject: [PATCH 071/132] include tabm and batchtabrnn configs --- mambular/configs/__init__.py | 4 ++ mambular/configs/batchtabrnn_config.py | 95 ++++++++++++++++++++++++++ mambular/configs/tabm_config.py | 87 +++++++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 mambular/configs/batchtabrnn_config.py create mode 100644 mambular/configs/tabm_config.py diff --git a/mambular/configs/__init__.py b/mambular/configs/__init__.py index 45dc765..7935925 100644 --- a/mambular/configs/__init__.py +++ b/mambular/configs/__init__.py @@ -8,6 +8,8 @@ from .mambattention_config import DefaultMambAttentionConfig from .ndtf_config import DefaultNDTFConfig from .node_config import DefaultNODEConfig +from .tabm_config import DefaultTabMConfig +from .batchtabrnn_config import DefaultBatchTabRNNConfig __all__ = [ @@ -21,4 +23,6 @@ "DefaultMambAttentionConfig", "DefaultNDTFConfig", "DefaultNODEConfig", + "DefaultTabMConfig", + "DefaultBatchTabRNNConfig", ] diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/batchtabrnn_config.py new file mode 100644 index 0000000..12b8aa5 --- /dev/null +++ b/mambular/configs/batchtabrnn_config.py @@ -0,0 +1,95 @@ +from dataclasses import dataclass +import torch.nn as nn +from typing import Literal + + +@dataclass +class DefaultBatchTabRNNConfig: + """ + Configuration class for the default TabulaRNN model with predefined hyperparameters. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + model_type : str, default="RNN" + type of model, one of "RNN", "LSTM", "GRU", "mLSTM", "sLSTM" + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the transformer. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the transformer. + embedding_activation : callable, default=nn.Identity() + Activation function for numerical embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. 
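# Sketch of the dataclasses.replace pattern used in BatchTabRNN above: the normalization
# layer sees the RNN output, which is dim_feedforward wide rather than d_model wide, so a
# modified copy of the config is handed to get_normalization_layer. Default values are taken
# from the DefaultBatchTabRNNConfig dataclass defined in this file.
from dataclasses import replace
from mambular.configs import DefaultBatchTabRNNConfig

cfg = DefaultBatchTabRNNConfig()                        # d_model=128, dim_feedforward=256
norm_cfg = replace(cfg, d_model=cfg.dim_feedforward)    # copy with d_model=256
assert cfg.d_model == 128 and norm_cfg.d_model == 256   # the original config is untouched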
+ layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="cls" + Pooling method to be used ('cls', 'avg', etc.). + norm_first : bool, default=False + Whether to apply normalization before other operations in each transformer block. + bias : bool, default=True + Whether to use bias in the linear layers. + rnn_activation : callable, default=nn.SELU() + Activation function for the transformer layers. + bidirectional : bool, default=False. + Whether to process data bidirectionally + cat_encoding : str, default="int" + Encoding method for categorical features. + """ + + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-05 + lr_factor: float = 0.1 + d_model: int = 128 + n_layers: int = 4 + rnn_dropout: float = 0.5 + norm: str = "RMSNorm" + activation: callable = nn.SELU() + embedding_activation: callable = nn.Identity() + embedding_dropout: float = None + head_layer_sizes: list = () + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = False + layer_norm_after_embedding: bool = False + pooling_method: str = "avg" + norm_first: bool = False + bias: bool = True + rnn_activation: callable = nn.ReLU() + layer_norm_eps: float = 1e-05 + dim_feedforward: int = 256 + numerical_embedding: str = "ple" + cat_encoding: str = "int" + d_conv: int = 4 + conv_bias: bool = True + residuals: bool = False + + # Batch ensembling specific configurations + ensemble_size: int = 16 + ensemble_scaling_in: bool = True + ensemble_scaling_out: bool = True + ensemble_bias: bool = True + scaling_init: Literal["ones", "random-signs"] = "ones" + average_ensembles: bool = True + ensemble_first: bool = True diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py new file mode 100644 index 0000000..7d55360 --- /dev/null +++ b/mambular/configs/tabm_config.py @@ -0,0 +1,87 @@ +from dataclasses import dataclass +import torch.nn as nn +from typing import Literal + + +@dataclass +class DefaultTabMConfig: + """ + Configuration class for the TabM model with batch ensembling and predefined hyperparameters. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + layer_sizes : list, default=(256, 128, 32) + Sizes of the layers in the model. + activation : callable, default=nn.SELU() + Activation function for the model layers. + skip_layers : bool, default=False + Whether to skip layers in the model. + dropout : float, default=0.5 + Dropout rate for regularization. + norm : str, default=None + Normalization method to be used, if any. + use_glu : bool, default=False + Whether to use Gated Linear Units (GLU) in the model. + skip_connections : bool, default=False + Whether to use skip connections in the model. + batch_norm : bool, default=False + Whether to use batch normalization in the model layers. + layer_norm : bool, default=False + Whether to use layer normalization in the model layers. + use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. 
+ layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + d_model : int, default=32 + Dimensionality of the embeddings. + ensemble_size : int, default=4 + Number of ensemble members for batch ensembling. + ensemble_scaling_in : bool, default=True + Whether to use input scaling for each ensemble member. + ensemble_scaling_out : bool, default=True + Whether to use output scaling for each ensemble member. + ensemble_bias : bool, default=False + Whether to use a unique bias term for each ensemble member. + scaling_init : Literal['ones', 'random-signs'], default='ones' + Initialization method for scaling weights. + average_ensembles : bool, default=True + Whether to average the outputs of the ensembles. + """ + + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + layer_sizes: list = (256, 256, 32) + activation: callable = nn.SELU() + skip_layers: bool = False + dropout: float = 0.5 + norm: str = None + use_glu: bool = False + skip_connections: bool = False + batch_norm: bool = False + layer_norm: bool = False + layer_norm_eps: float = 1e-05 + use_embeddings: bool = False + average_embeddings: bool = True + embedding_activation: callable = nn.Identity() + layer_norm_after_embedding: bool = False + d_model: int = 64 + + # Batch ensembling specific configurations + ensemble_size: int = 32 + ensemble_scaling_in: bool = True + ensemble_scaling_out: bool = True + ensemble_bias: bool = True + scaling_init: Literal["ones", "random-signs"] = "ones" + average_ensembles: bool = True From 51d71d19f63f4726de370516430e686defdf38c3 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:53:33 +0000 Subject: [PATCH 072/132] delete bidirectional from config --- mambular/configs/tabularnn_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mambular/configs/tabularnn_config.py b/mambular/configs/tabularnn_config.py index cedc885..eb15d55 100644 --- a/mambular/configs/tabularnn_config.py +++ b/mambular/configs/tabularnn_config.py @@ -79,7 +79,6 @@ class DefaultTabulaRNNConfig: layer_norm_eps: float = 1e-05 dim_feedforward: int = 256 numerical_embedding: str = "ple" - bidirectional: bool = False cat_encoding: str = "int" d_conv: int = 4 conv_bias: bool = True From 10dca1f863697d7b7c36e5d8f6b33e92055b0ab6 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:53:43 +0000 Subject: [PATCH 073/132] add layer_norm_eps to config --- mambular/configs/mlp_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index adaef3c..9a99a12 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -58,6 +58,7 @@ class DefaultMLPConfig: skip_connections: bool = False batch_norm: bool = False layer_norm: bool = False + layer_norm_eps: float = 1e-05 use_embeddings: bool = False embedding_activation: callable = nn.Identity() layer_norm_after_embedding: bool = False From b33b2aa47d4f8a8423fc1999d84e8585eedc8ff4 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:54:04 +0000 Subject: [PATCH 074/132] include batchtabrnn for reg/class/lss --- mambular/models/batchtabrnn.py | 260 +++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 mambular/models/batchtabrnn.py diff --git a/mambular/models/batchtabrnn.py b/mambular/models/batchtabrnn.py new file mode 100644 index 0000000..2f3b711 --- /dev/null +++ b/mambular/models/batchtabrnn.py @@ -0,0 +1,260 @@ +from .sklearn_base_regressor 
import SklearnBaseRegressor +from .sklearn_base_classifier import SklearnBaseClassifier +from .sklearn_base_lss import SklearnBaseLSS + +from ..base_models.batch_tabrnn import BatchTabRNN +from ..configs.batchtabrnn_config import DefaultBatchTabRNNConfig + + +class BatchTabRNNRegressor(SklearnBaseRegressor): + """ + RNN regressor. This class extends the SklearnBaseRegressor class and uses the BatchTabRNN model + with the default BatchTabRNN configuration. + + The accepted arguments to the BatchTabRNNRegressor class include both the attributes in the DefaultBatchTabRNNConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + model_type : str, default="RNN" + type of model, one of "RNN", "LSTM", "GRU" + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the transformer. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the transformer. + embedding_activation : callable, default=nn.Identity() + Activation function for numerical embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="cls" + Pooling method to be used ('cls', 'avg', etc.). + norm_first : bool, default=False + Whether to apply normalization before other operations in each transformer block. + bias : bool, default=True + Whether to use bias in the linear layers. + rnn_activation : callable, default=nn.SELU() + Activation function for the transformer layers. + bidirectional : bool, default=False. + Whether to process data bidirectionally + cat_encoding : str, default="int" + Encoding method for categorical features. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. 
If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + """ + + def __init__(self, **kwargs): + super().__init__(model=BatchTabRNN, config=DefaultBatchTabRNNConfig, **kwargs) + + +class BatchTabRNNClassifier(SklearnBaseClassifier): + """ + RNN classifier. This class extends the SklearnBaseClassifier class and uses the BatchTabRNN model + with the default BatchTabRNN configuration. + + The accepted arguments to the BatchTabRNNClassifier class include both the attributes in the DefaultBatchTabRNNConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + model_type : str, default="RNN" + type of model, one of "RNN", "LSTM", "GRU" + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the transformer. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the transformer. + embedding_activation : callable, default=nn.Identity() + Activation function for numerical embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="cls" + Pooling method to be used ('cls', 'avg', etc.). + norm_first : bool, default=False + Whether to apply normalization before other operations in each transformer block. + bias : bool, default=True + Whether to use bias in the linear layers. + rnn_activation : callable, default=nn.SELU() + Activation function for the transformer layers. + bidirectional : bool, default=False. + Whether to process data bidirectionally + cat_encoding : str, default="int" + Encoding method for categorical features. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. 
+ binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + """ + + def __init__(self, **kwargs): + super().__init__(model=BatchTabRNN, config=DefaultBatchTabRNNConfig, **kwargs) + + +class BatchTabRNNLSS(SklearnBaseLSS): + """ + RNN LSS. This class extends the SklearnBaseLSS class and uses the BatchTabRNN model + with the default BatchTabRNN configuration. + + The accepted arguments to the BatchTabRNNLSS class include both the attributes in the DefaultBatchTabRNNConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + model_type : str, default="RNN" + type of model, one of "RNN", "LSTM", "GRU" + family : str, default=None + Distributional family to be used for the model. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=64 + Dimensionality of the model. + n_layers : int, default=8 + Number of layers in the transformer. + norm : str, default="RMSNorm" + Normalization method to be used. + activation : callable, default=nn.SELU() + Activation function for the transformer. + embedding_activation : callable, default=nn.Identity() + Activation function for numerical embeddings. + head_layer_sizes : list, default=(128, 64, 32) + Sizes of the layers in the head of the model. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to skip layers in the head. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + pooling_method : str, default="cls" + Pooling method to be used ('cls', 'avg', etc.). + norm_first : bool, default=False + Whether to apply normalization before other operations in each transformer block. + bias : bool, default=True + Whether to use bias in the linear layers. + rnn_activation : callable, default=nn.SELU() + Activation function for the transformer layers. + bidirectional : bool, default=False. + Whether to process data bidirectionally + cat_encoding : str, default="int" + Encoding method for categorical features. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. 
Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + """ + + def __init__(self, **kwargs): + super().__init__(model=BatchTabRNN, config=DefaultBatchTabRNNConfig, **kwargs) From 38edb67cb33d859016121787f6bf0e3fb8b9d8cb Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:54:12 +0000 Subject: [PATCH 075/132] new model --- mambular/models/tabm.py | 283 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 mambular/models/tabm.py diff --git a/mambular/models/tabm.py b/mambular/models/tabm.py new file mode 100644 index 0000000..d8fb1bc --- /dev/null +++ b/mambular/models/tabm.py @@ -0,0 +1,283 @@ +from .sklearn_base_regressor import SklearnBaseRegressor +from .sklearn_base_classifier import SklearnBaseClassifier +from .sklearn_base_lss import SklearnBaseLSS +from ..base_models.tabm import TabM +from ..configs.tabm_config import DefaultTabMConfig + + +class TabMRegressor(SklearnBaseRegressor): + """ + Multi-Layer Perceptron regressor. This class extends the SklearnBaseRegressor class and uses the TabM model + with the default TabM configuration. + + The accepted arguments to the TabMRegressor class include both the attributes in the DefaultTabMConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + layer_sizes : list, default=(128, 128, 32) + Sizes of the layers in the TabM. + activation : callable, default=nn.SELU() + Activation function for the TabM layers. + skip_layers : bool, default=False + Whether to skip layers in the TabM. + dropout : float, default=0.5 + Dropout rate for regularization. + norm : str, default=None + Normalization method to be used, if any. + use_glu : bool, default=False + Whether to use Gated Linear Units (GLU) in the TabM. + skip_connections : bool, default=False + Whether to use skip connections in the TabM. + batch_norm : bool, default=False + Whether to use batch normalization in the TabM layers. + layer_norm : bool, default=False + Whether to use layer normalization in the TabM layers. 
+ use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + d_model : int, default=32 + Dimensionality of the embeddings. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the TabMRegressor class are the same as the attributes in the DefaultTabMConfig dataclass. + - TabMRegressor uses SklearnBaseRegressor as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseRegressor : The parent class for TabMRegressor. + + Examples + -------- + >>> from mambular.models import TabMRegressor + >>> model = TabMRegressor(layer_sizes=[128, 128, 64], activation=nn.ReLU()) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=TabM, config=DefaultTabMConfig, **kwargs) + + +class TabMClassifier(SklearnBaseClassifier): + """ + Multi-Layer Perceptron classifier. This class extends the SklearnBaseClassifier class and uses the TabM model + with the default TabM configuration. + + The accepted arguments to the TabMClassifier class include both the attributes in the DefaultTabMConfig dataclass + and the parameters for the Preprocessor class. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + layer_sizes : list, default=(128, 128, 32) + Sizes of the layers in the TabM. + activation : callable, default=nn.SELU() + Activation function for the TabM layers. 
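Beyond the options documented in this list, DefaultTabMConfig also carries the batch-ensembling knobs that the TabM base model reads (ensemble_size, model_type with "mini"/"full", average_ensembles); they can be overridden like any other config attribute, as in this hedged sketch with illustrative values (see also the constructor example further below):

    from mambular.models import TabMClassifier

    # per the tabm.py changes later in this series, "mini" drops the
    # per-member scaling in the deeper layers; average_ensembles=True
    # averages the member predictions into a single output
    clf = TabMClassifier(ensemble_size=8, model_type="mini", average_ensembles=True)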
+ skip_layers : bool, default=False + Whether to skip layers in the TabM. + dropout : float, default=0.5 + Dropout rate for regularization. + norm : str, default=None + Normalization method to be used, if any. + use_glu : bool, default=False + Whether to use Gated Linear Units (GLU) in the TabM. + skip_connections : bool, default=False + Whether to use skip connections in the TabM. + batch_norm : bool, default=False + Whether to use batch normalization in the TabM layers. + layer_norm : bool, default=False + Whether to use layer normalization in the TabM layers. + use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + d_model : int, default=32 + Dimensionality of the embeddings. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the TabMClassifier class are the same as the attributes in the DefaultTabMConfig dataclass. + - TabMClassifier uses SklearnBaseClassifieras the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseClassifier : The parent class for TabMClassifier. + + Examples + -------- + >>> from mambular.models import TabMClassifier + >>> model = TabMClassifier(layer_sizes=[128, 128, 64], activation=nn.ReLU()) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=TabM, config=DefaultTabMConfig, **kwargs) + + +class TabMLSS(SklearnBaseLSS): + """ + Multi-Layer Perceptron for distributional regression. This class extends the SklearnBaseLSS class and uses the TabM model + with the default TabM configuration. + + The accepted arguments to the TabMLSS class include both the attributes in the DefaultTabMConfig dataclass + and the parameters for the Preprocessor class. 
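As the paragraph above notes for all three TabM wrappers, DefaultTabMConfig attributes and Preprocessor arguments can be mixed freely in the constructor. A small sketch, with values chosen purely for illustration:

    from mambular.models import TabMLSS

    # layer_sizes and dropout are DefaultTabMConfig attributes,
    # numerical_preprocessing is forwarded to the Preprocessor
    model = TabMLSS(
        layer_sizes=[256, 128, 32],
        dropout=0.3,
        numerical_preprocessing="standardization",
    )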
+ + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which learning rate will be reduced. + family : str, default=None + Distributional family to be used for the model. + weight_decay : float, default=1e-06 + Weight decay (L2 penalty) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + layer_sizes : list, default=(128, 128, 32) + Sizes of the layers in the TabM. + activation : callable, default=nn.SELU() + Activation function for the TabM layers. + skip_layers : bool, default=False + Whether to skip layers in the TabM. + dropout : float, default=0.5 + Dropout rate for regularization. + norm : str, default=None + Normalization method to be used, if any. + use_glu : bool, default=False + Whether to use Gated Linear Units (GLU) in the TabM. + skip_connections : bool, default=False + Whether to use skip connections in the TabM. + batch_norm : bool, default=False + Whether to use batch normalization in the TabM layers. + layer_norm : bool, default=False + Whether to use layer normalization in the TabM layers. + use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + d_model : int, default=32 + Dimensionality of the embeddings. + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task : str, default="regression" + Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning as ple. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the TabMLSS class are the same as the attributes in the DefaultTabMConfig dataclass. + - TabMLSS uses SklearnBaseLSS as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. 
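Each wrapper in this module is a thin shim over the sklearn-style base classes that only binds a base model to its default config. The same pattern would carry over to a custom model; MyTabularModel and DefaultMyTabularConfig below are hypothetical names, not part of the library:

    from mambular.models.sklearn_base_lss import SklearnBaseLSS

    class MyTabularLSS(SklearnBaseLSS):
        """Hypothetical wrapper, mirroring TabMLSS.__init__ further below."""

        def __init__(self, **kwargs):
            # MyTabularModel / DefaultMyTabularConfig are illustrative stand-ins
            super().__init__(model=MyTabularModel, config=DefaultMyTabularConfig, **kwargs)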
+ + See Also + -------- + mambular.models.SklearnBaseLSS : The parent class for TabMLSS. + + Examples + -------- + >>> from mambular.models import TabMLSS + >>> model = TabMLSS(layer_sizes=[128, 128, 64], activation=nn.ReLU()) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=TabM, config=DefaultTabMConfig, **kwargs) From bafcde3483c487c19bf7bd1d096d10b477aa1685 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:54:38 +0000 Subject: [PATCH 076/132] remove default values for lr related params in fit --- mambular/models/sklearn_base_classifier.py | 40 +++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 187cfec..687ca7d 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -139,10 +139,10 @@ def build_model( random_state: int = 101, batch_size: int = 128, shuffle: bool = True, - lr: float = 1e-4, - lr_patience: int = 10, - factor: float = 0.1, - weight_decay: float = 1e-06, + lr: float = None, + lr_patience: int = None, + lr_factor: float = None, + weight_decay: float = None, dataloader_kwargs={}, ): """ @@ -218,10 +218,14 @@ def build_model( config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr=lr, - lr_patience=lr_patience, - lr_factor=factor, - weight_decay=weight_decay, + lr_patience=lr_patience + if lr_patience is not None + else self.config.lr_patience, + lr=lr if lr is not None else self.config.lr, + lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, + weight_decay=weight_decay + if weight_decay is not None + else self.config.weight_decay, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) @@ -276,10 +280,10 @@ def fit( patience: int = 15, monitor: str = "val_loss", mode: str = "min", - lr: float = 1e-4, - lr_patience: int = 10, - factor: float = 0.1, - weight_decay: float = 1e-06, + lr: float = None, + lr_patience: int = None, + lr_factor: float = None, + weight_decay: float = None, checkpoint_path="model_checkpoints", dataloader_kwargs={}, rebuild=True, @@ -371,10 +375,14 @@ def fit( config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr=lr, - lr_patience=lr_patience, - lr_factor=factor, - weight_decay=weight_decay, + lr_patience=lr_patience + if lr_patience is not None + else self.config.lr_patience, + lr=lr if lr is not None else self.config.lr, + lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, + weight_decay=weight_decay + if weight_decay is not None + else self.config.weight_decay, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) From 2414fa38ccb0f717c4d8450b17f1badf7347210a Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:55:03 +0000 Subject: [PATCH 077/132] delete lr related default params in fit --- mambular/models/sklearn_base_lss.py | 42 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index 132954d..d2af6b5 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -161,10 +161,10 @@ def build_model( random_state: int = 101, batch_size: int = 128, 
shuffle: bool = True, - lr: float = 1e-4, - lr_patience: int = 10, - factor: float = 0.1, - weight_decay: float = 1e-06, + lr: float = None, + lr_patience: int = None, + lr_factor: float = None, + weight_decay: float = None, dataloader_kwargs={}, ): """ @@ -233,13 +233,19 @@ def build_model( self.task_model = TaskModel( model_class=self.base_model, num_classes=self.family.param_count, + family=self.family, config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr=lr, - lr_patience=lr_patience, - lr_factor=factor, - weight_decay=weight_decay, + lr=lr if lr is not None else self.config.lr, + lr_patience=lr_patience + if lr_patience is not None + else self.config.lr_patience, + lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, + weight_decay=weight_decay + if weight_decay is not None + else self.config.weight_decay, + lss=True, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) @@ -295,10 +301,10 @@ def fit( patience: int = 15, monitor: str = "val_loss", mode: str = "min", - lr: float = 1e-4, - lr_patience: int = 10, - factor: float = 0.1, - weight_decay: float = 1e-06, + lr: float = None, + lr_patience: int = None, + lr_factor: float = None, + weight_decay: float = None, checkpoint_path="model_checkpoints", distributional_kwargs=None, dataloader_kwargs={}, @@ -411,10 +417,14 @@ def fit( config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr=lr, - lr_patience=lr_patience, - lr_factor=factor, - weight_decay=weight_decay, + lr=lr if lr is not None else self.config.lr, + lr_patience=lr_patience + if lr_patience is not None + else self.config.lr_patience, + lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, + weight_decay=weight_decay + if weight_decay is not None + else self.config.weight_decay, lss=True, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, From 8091909d967dcb2245252af1cc0e4a51b67f8f73 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 16:55:39 +0000 Subject: [PATCH 078/132] lr realted param adjustments --- mambular/models/sklearn_base_regressor.py | 40 ++++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index afc4e98..d1d99ca 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -140,10 +140,10 @@ def build_model( random_state: int = 101, batch_size: int = 128, shuffle: bool = True, - lr: float = 1e-4, - lr_patience: int = 10, - factor: float = 0.1, - weight_decay: float = 1e-06, + lr: float = None, + lr_patience: int = None, + lr_factor: float = None, + weight_decay: float = None, dataloader_kwargs={}, ): """ @@ -216,10 +216,14 @@ def build_model( config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr=lr, - lr_patience=lr_patience, - lr_factor=factor, - weight_decay=weight_decay, + lr=lr if lr is not None else self.config.lr, + lr_patience=lr_patience + if lr_patience is not None + else self.config.lr_patience, + lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, + weight_decay=weight_decay + if weight_decay is not None + else self.config.weight_decay, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) @@ -274,10 +278,10 @@ def fit( patience: int 
= 15, monitor: str = "val_loss", mode: str = "min", - lr: float = 1e-4, - lr_patience: int = 10, - factor: float = 0.1, - weight_decay: float = 1e-06, + lr: float = None, + lr_patience: int = None, + lr_factor: float = None, + weight_decay: float = None, checkpoint_path="model_checkpoints", dataloader_kwargs={}, rebuild=True, @@ -364,10 +368,14 @@ def fit( config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr=lr, - lr_patience=lr_patience, - lr_factor=factor, - weight_decay=weight_decay, + lr=lr if lr is not None else self.config.lr, + lr_patience=lr_patience + if lr_patience is not None + else self.config.lr_patience, + lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, + weight_decay=weight_decay + if weight_decay is not None + else self.config.weight_decay, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) From ea184ce868dc6c242ca331f83b6cb31b9163d908 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 19:39:47 +0000 Subject: [PATCH 079/132] make usable even when params not in config --- mambular/arch_utils/get_norm_fn.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mambular/arch_utils/get_norm_fn.py b/mambular/arch_utils/get_norm_fn.py index d32ff16..a9e9f9b 100644 --- a/mambular/arch_utils/get_norm_fn.py +++ b/mambular/arch_utils/get_norm_fn.py @@ -28,10 +28,9 @@ def get_normalization_layer(config): If an unsupported normalization layer is specified in the config. """ - norm_layer = config.norm - - d_model = config.d_model - layer_norm_eps = config.layer_norm_eps + norm_layer = getattr(config, "norm", None) + d_model = getattr(config, "d_model", 128) + layer_norm_eps = getattr(config, "layer_norm_eps", 1e-05) if norm_layer == "RMSNorm": return RMSNorm(d_model, eps=layer_norm_eps) From 063b8dd0cb94da1bdb54d2ca65c00c9111637f59 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 19:39:57 +0000 Subject: [PATCH 080/132] adapt embedding layer to plr encodings --- .../arch_utils/layer_utils/embedding_layer.py | 83 ++++++++++++++----- 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 7fbcbdc..653c137 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -1,6 +1,7 @@ import torch import torch.nn as nn from .embedding_tree import NeuralEmbeddingTree +from .plr_layer import PeriodicEmbeddings class EmbeddingLayer(nn.Module): @@ -19,7 +20,7 @@ def __init__(self, num_feature_info, cat_feature_info, config): """ super(EmbeddingLayer, self).__init__() - self.d_model = config.d_model + self.d_model = getattr(config, "d_model", 128) self.embedding_activation = getattr( config, "embedding_activation", nn.Identity() ) @@ -34,16 +35,42 @@ def __init__(self, num_feature_info, cat_feature_info, config): if getattr(config, "embedding_dropout", None) is not None else None ) + self.embedding_type = getattr(config, "embedding_type", "standard") - self.num_embeddings = nn.ModuleList( - [ - nn.Sequential( - nn.Linear(input_shape, self.d_model, bias=False), - self.embedding_activation, - ) - for feature_name, input_shape in num_feature_info.items() - ] - ) + # Sequence length + self.seq_len = len(num_feature_info) + len(cat_feature_info) + + # Initialize numerical embeddings based on embedding_type + if self.embedding_type == "ndt": + self.num_embeddings = 
nn.ModuleList( + [ + NeuralEmbeddingTree(input_shape, self.d_model) + for feature_name, input_shape in num_feature_info.items() + ] + ) + elif self.embedding_type == "plr": + self.num_embeddings = PeriodicEmbeddings( + n_features=self.seq_len, + d_embedding=self.d_model, + n_frequencies=getattr(config, "n_frequencies", 48), + frequency_init_scale=getattr(config, "frequency_init_scale", 0.01), + activation=self.embedding_activation, + lite=getattr(config, "plr_lite", False), + ) + elif self.embedding_type == "standard": + self.num_embeddings = nn.ModuleList( + [ + nn.Sequential( + nn.Linear(input_shape, self.d_model, bias=False), + self.embedding_activation, + ) + for feature_name, input_shape in num_feature_info.items() + ] + ) + else: + raise ValueError( + "Invalid embedding_type. Choose from 'standard', 'ndt', or 'plr'." + ) # Initialize categorical embeddings self.cat_embeddings = nn.ModuleList() @@ -72,9 +99,6 @@ def __init__(self, num_feature_info, cat_feature_info, config): if self.layer_norm_after_embedding: self.embedding_norm = nn.LayerNorm(self.d_model) - # Sequence length - self.seq_len = len(self.num_embeddings) + len(self.cat_embeddings) - def forward(self, num_features=None, cat_features=None): """ Defines the forward pass of the model. @@ -117,16 +141,31 @@ def forward(self, num_features=None, cat_features=None): else: cat_embeddings = None - # Process numerical embeddings - if self.num_embeddings and num_features is not None: - num_embeddings = [ - emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) - ] - num_embeddings = torch.stack(num_embeddings, dim=1) - if self.layer_norm_after_embedding: - num_embeddings = self.embedding_norm(num_embeddings) + # Process numerical embeddings based on embedding_type + if self.embedding_type == "plr": + # For PLR, pass all numerical features together + if num_features is not None: + num_features = torch.stack(num_features, dim=1).squeeze( + -1 + ) # Stack features along the feature dimension + num_embeddings = self.num_embeddings( + num_features + ) # Use the single PLR layer for all features + if self.layer_norm_after_embedding: + num_embeddings = self.embedding_norm(num_embeddings) + else: + num_embeddings = None else: - num_embeddings = None + # For standard and ndt embeddings, handle each feature individually + if self.num_embeddings and num_features is not None: + num_embeddings = [ + emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) + ] + num_embeddings = torch.stack(num_embeddings, dim=1) + if self.layer_norm_after_embedding: + num_embeddings = self.embedding_norm(num_embeddings) + else: + num_embeddings = None # Combine categorical and numerical embeddings if cat_embeddings is not None and num_embeddings is not None: From 786e4d2292fc1c151c8f97241fd899627745c07a Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 19:40:05 +0000 Subject: [PATCH 081/132] PLR layer inclusion --- mambular/arch_utils/layer_utils/plr_layer.py | 101 +++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 mambular/arch_utils/layer_utils/plr_layer.py diff --git a/mambular/arch_utils/layer_utils/plr_layer.py b/mambular/arch_utils/layer_utils/plr_layer.py new file mode 100644 index 0000000..ad66b7b --- /dev/null +++ b/mambular/arch_utils/layer_utils/plr_layer.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn.parameter import Parameter +import math + + +class Periodic(nn.Module): + """Periodic transformation with learned frequency coefficients.""" + + 
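For orientation: the Periodic module defined here maps each scalar feature to the cosine and sine of k learned frequencies, so the last dimension grows from k to 2*k. A small shape check, with the module imported from the path this patch creates and sizes chosen for illustration:

    import torch
    from mambular.arch_utils.layer_utils.plr_layer import Periodic

    p = Periodic(n_features=8, k=4, sigma=0.01)
    out = p(torch.randn(32, 8))   # input: (batch, n_features)
    print(out.shape)              # torch.Size([32, 8, 8]), i.e. 2*k cos/sin values per feature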
def __init__(self, n_features: int, k: int, sigma: float) -> None: + super().__init__() + if sigma <= 0.0: + raise ValueError(f"sigma must be positive, but got {sigma=}") + + self._sigma = sigma + self.weight = Parameter(torch.empty(n_features, k)) + self.reset_parameters() + + def reset_parameters(self) -> None: + bound = self._sigma * 3 + nn.init.trunc_normal_(self.weight, 0.0, self._sigma, a=-bound, b=bound) + + def forward(self, x): + x = 2 * math.pi * self.weight * x[..., None] + return torch.cat([torch.cos(x), torch.sin(x)], dim=-1) + + +class SNLinear(nn.Module): + """Separate linear layers for each feature embedding.""" + + def __init__(self, n: int, in_features: int, out_features: int) -> None: + super().__init__() + self.weight = Parameter(torch.empty(n, in_features, out_features)) + self.bias = Parameter(torch.empty(n, out_features)) + self.reset_parameters() + + def reset_parameters(self) -> None: + d_in_rsqrt = self.weight.shape[-2] ** -0.5 + nn.init.uniform_(self.weight, -d_in_rsqrt, d_in_rsqrt) + nn.init.uniform_(self.bias, -d_in_rsqrt, d_in_rsqrt) + + def forward(self, x): + if x.ndim != 3: + raise ValueError( + "_NLinear requires a 3D input (batch, features, embedding)." + ) + if x.shape[-(self.weight.ndim - 1) :] != self.weight.shape[:-1]: + raise ValueError("Input shape mismatch with weight dimensions.") + + x = x.transpose(0, 1) @ self.weight + return x.transpose(0, 1) + self.bias + + +class PeriodicEmbeddings(nn.Module): + """Embeddings for continuous features using Periodic + Linear (+ ReLU) transformations. + + Supports PL, PLR, and PLR(lite) embedding types. + + Shape: + - Input: (*, n_features) + - Output: (*, n_features, d_embedding) + """ + + def __init__( + self, + n_features: int, + d_embedding: int = 24, + *, + n_frequencies: int = 48, + frequency_init_scale: float = 0.01, + activation: bool = True, + lite: bool = False, + ): + """ + Args: + n_features (int): Number of features. + d_embedding (int): Size of each feature embedding. + n_frequencies (int): Number of frequencies per feature. + frequency_init_scale (float): Initialization scale for frequency coefficients. + activation (bool): If True, applies ReLU, making it PLR; otherwise, PL. + lite (bool): If True, uses shared linear layer (PLR lite); otherwise, separate layers. 
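Putting the pieces together, PeriodicEmbeddings maps a (batch, n_features) tensor of continuous inputs to per-feature embeddings, as the Shape note above states. A usage sketch with illustrative sizes; lite=True would instead share a single linear layer across features:

    import torch
    from mambular.arch_utils.layer_utils.plr_layer import PeriodicEmbeddings

    emb = PeriodicEmbeddings(n_features=6, d_embedding=24, n_frequencies=48, lite=False)
    x = torch.randn(128, 6)       # (batch, n_features)
    z = emb(x)                    # (batch, n_features, d_embedding)
    print(z.shape)                # torch.Size([128, 6, 24])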
+ """ + super().__init__() + self.periodic = Periodic(n_features, n_frequencies, frequency_init_scale) + + # Choose linear transformation: shared or separate + if lite: + if not activation: + raise ValueError("lite=True requires activation=True") + self.linear = nn.Linear(2 * n_frequencies, d_embedding) + else: + self.linear = SNLinear(n_features, 2 * n_frequencies, d_embedding) + + self.activation = nn.ReLU() if activation else None + + def forward(self, x): + """Forward pass.""" + x = self.periodic(x) + x = self.linear(x) + return self.activation(x) if self.activation else x From b900b7116f3ed04b7e33ffafcc5aa2ecce55115a Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 19:40:41 +0000 Subject: [PATCH 082/132] minor fix in embedding layer creation --- mambular/base_models/mlp.py | 8 ++++++-- mambular/base_models/resnet.py | 6 +++--- mambular/base_models/tabm.py | 7 ++++++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index ec64d3d..f9cc315 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -67,7 +67,7 @@ def __init__( super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.layer_sizes = self.hparams.get("layer_sizes", self.layer_sizes) + self.layer_sizes = self.hparams.get("layer_sizes", config.layer_sizes) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -87,7 +87,11 @@ def __init__( input_dim += 1 if self.use_embeddings: - self.embedding_layer = EmbeddingLayer(config) + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) input_dim = ( len(num_feature_info) * config.d_model + len(cat_feature_info) * config.d_model diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index c08ab6f..e7b018a 100644 --- a/mambular/base_models/resnet.py +++ b/mambular/base_models/resnet.py @@ -69,7 +69,7 @@ def __init__( super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.layer_sizes = self.hparams.get("layer_sizes", self.layer_sizes) + self.layer_sizes = self.hparams.get("layer_sizes", config.layer_sizes) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info self.activation = config.activation @@ -81,6 +81,8 @@ def __init__( for feature_name, input_shape in cat_feature_info.items(): input_dim += 1 + self.norm_f = get_normalization_layer(config) + if self.use_embeddings: input_dim = ( len(num_feature_info) * config.d_model @@ -114,8 +116,6 @@ def __init__( self.output_layer = nn.Linear(self.layer_sizes[-1], num_classes) - self.norm_f = get_normalization_layer(config) - def forward(self, num_features, cat_features): if self.use_embeddings: x = self.embedding_layer(num_features, cat_features) diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py index 1c86a5e..2468ecf 100644 --- a/mambular/base_models/tabm.py +++ b/mambular/base_models/tabm.py @@ -90,7 +90,12 @@ def __init__( # Embedding layer if self.use_embeddings: - self.embedding_layer = EmbeddingLayer(config) + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) + if self.hparams.get("average_embeddings", config.average_embeddings): input_dim = self.hparams.get("d_model", config.d_model) else: From 0986c65e5ea2f5b3eb8e4b2d5abb3e6bfcce2caf Mon Sep 17 00:00:00 2001 From: 
AnFreTh Date: Mon, 11 Nov 2024 19:40:55 +0000 Subject: [PATCH 083/132] adjust defaults --- mambular/configs/batchtabrnn_config.py | 2 +- mambular/configs/mlp_config.py | 2 ++ mambular/configs/resnet_config.py | 10 ++++++++-- mambular/configs/tabm_config.py | 6 +++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/batchtabrnn_config.py index 12b8aa5..089ce54 100644 --- a/mambular/configs/batchtabrnn_config.py +++ b/mambular/configs/batchtabrnn_config.py @@ -86,7 +86,7 @@ class DefaultBatchTabRNNConfig: residuals: bool = False # Batch ensembling specific configurations - ensemble_size: int = 16 + ensemble_size: int = 32 ensemble_scaling_in: bool = True ensemble_scaling_out: bool = True ensemble_bias: bool = True diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index 9a99a12..599d225 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -63,3 +63,5 @@ class DefaultMLPConfig: embedding_activation: callable = nn.Identity() layer_norm_after_embedding: bool = False d_model: int = 32 + embedding_type: float = "plr" + plr_lite: bool = False diff --git a/mambular/configs/resnet_config.py b/mambular/configs/resnet_config.py index c2fb1bc..1103169 100644 --- a/mambular/configs/resnet_config.py +++ b/mambular/configs/resnet_config.py @@ -60,8 +60,14 @@ class DefaultResNetConfig: skip_connections: bool = True batch_norm: bool = True layer_norm: bool = False + layer_norm_eps: float = 1e-05 num_blocks: int = 3 - use_embeddings: bool = False + + # embedding params + use_embeddings: bool = True + embedding_type: float = "plr" + plr_lite: bool = False + average_embeddings: bool = True embedding_activation: callable = nn.Identity() layer_norm_after_embedding: bool = False - d_model: int = 32 + d_model: int = 64 diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index 7d55360..eafa4b9 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -72,7 +72,11 @@ class DefaultTabMConfig: batch_norm: bool = False layer_norm: bool = False layer_norm_eps: float = 1e-05 - use_embeddings: bool = False + + # embedding params + use_embeddings: bool = True + embedding_type: float = "plr" + plr_lite: bool = False average_embeddings: bool = True embedding_activation: callable = nn.Identity() layer_norm_after_embedding: bool = False From 058bad94abe1591542640db0937e3425ce53c1c7 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 11 Nov 2024 19:41:14 +0000 Subject: [PATCH 084/132] include new models in init --- mambular/models/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mambular/models/__init__.py b/mambular/models/__init__.py index f9d82f6..51eac90 100644 --- a/mambular/models/__init__.py +++ b/mambular/models/__init__.py @@ -25,6 +25,8 @@ from .ndtf import NDTFClassifier, NDTFRegressor, NDTFLSS from .node import NODEClassifier, NODERegressor, NODELSS +from .tabm import TabMClassifier, TabMRegressor, TabMLSS +from .batchtabrnn import BatchTabRNNRegressor, BatchTabRNNClassifier, BatchTabRNNLSS __all__ = [ @@ -61,4 +63,10 @@ "NODEClassifier", "NODERegressor", "NODELSS", + "TabMClassifier", + "TabMRegressor", + "TabMLSS", + "BatchTabRNNRegressor", + "BatchTabRNNClassifier", + "BatchTabRNNLSS", ] From 7de4982d8a94d53a47b60013a8030b94aa527dd0 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Tue, 12 Nov 2024 22:20:23 +0100 Subject: [PATCH 085/132] fix validation dataset bug --- mambular/models/sklearn_base_classifier.py | 30 
++++++++++++---------- mambular/models/sklearn_base_lss.py | 28 ++++++++++---------- mambular/models/sklearn_base_regressor.py | 30 ++++++++++++---------- 3 files changed, 46 insertions(+), 42 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 687ca7d..2c7f033 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -188,7 +188,7 @@ def build_model( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -218,14 +218,14 @@ def build_model( config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr_patience=lr_patience - if lr_patience is not None - else self.config.lr_patience, + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr=lr if lr is not None else self.config.lr, lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=weight_decay - if weight_decay is not None - else self.config.weight_decay, + weight_decay=( + weight_decay if weight_decay is not None else self.config.weight_decay + ), optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) @@ -345,7 +345,7 @@ def fit( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -375,14 +375,16 @@ def fit( config=self.config, cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, - lr_patience=lr_patience - if lr_patience is not None - else self.config.lr_patience, + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr=lr if lr is not None else self.config.lr, lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=weight_decay - if weight_decay is not None - else self.config.weight_decay, + weight_decay=( + weight_decay + if weight_decay is not None + else self.config.weight_decay + ), optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index d2af6b5..f6191b5 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -208,7 +208,7 @@ def build_model( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -238,13 +238,13 @@ def build_model( cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, lr=lr if lr is not None else self.config.lr, - lr_patience=lr_patience - if lr_patience is not None - else self.config.lr_patience, + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=weight_decay - if weight_decay is not None - else self.config.weight_decay, + weight_decay=( + weight_decay if weight_decay is not None else self.config.weight_decay + ), lss=True, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, @@ -388,7 +388,7 @@ def fit( X = pd.DataFrame(X) if isinstance(y, 
pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -418,13 +418,13 @@ def fit( cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, lr=lr if lr is not None else self.config.lr, - lr_patience=lr_patience - if lr_patience is not None - else self.config.lr_patience, + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=weight_decay - if weight_decay is not None - else self.config.weight_decay, + weight_decay=( + weight_decay if weight_decay is not None else self.config.weight_decay + ), lss=True, optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index d1d99ca..874aa45 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -189,7 +189,7 @@ def build_model( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -217,13 +217,13 @@ def build_model( cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, lr=lr if lr is not None else self.config.lr, - lr_patience=lr_patience - if lr_patience is not None - else self.config.lr_patience, + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=weight_decay - if weight_decay is not None - else self.config.weight_decay, + weight_decay=( + weight_decay if weight_decay is not None else self.config.weight_decay + ), optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) @@ -341,7 +341,7 @@ def fit( X = pd.DataFrame(X) if isinstance(y, pd.Series): y = y.values - if X_val: + if X_val is not None: if not isinstance(X_val, pd.DataFrame): X_val = pd.DataFrame(X_val) if isinstance(y_val, pd.Series): @@ -369,13 +369,15 @@ def fit( cat_feature_info=self.data_module.cat_feature_info, num_feature_info=self.data_module.num_feature_info, lr=lr if lr is not None else self.config.lr, - lr_patience=lr_patience - if lr_patience is not None - else self.config.lr_patience, + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=weight_decay - if weight_decay is not None - else self.config.weight_decay, + weight_decay=( + weight_decay + if weight_decay is not None + else self.config.weight_decay + ), optimizer_type=self.optimizer_type, optimizer_args=self.optimizer_kwargs, ) From 097c6f46c68c607fd4f042d572304d39b8495cf5 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Tue, 12 Nov 2024 22:32:10 +0100 Subject: [PATCH 086/132] original_mamba dt_rank fix --- mambular/arch_utils/mamba_utils/mamba_original.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index b5e726a..988abdc 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -77,7 +77,6 @@ def __init__( d_state=d_state, d_conv=d_conv, 
expand=expand_factor, - dt_rank=dt_rank, dt_min=dt_min, dt_max=dt_max, dt_init=dt_init, @@ -85,7 +84,6 @@ def __init__( dt_init_floor=dt_init_floor, conv_bias=conv_bias, bias=bias, - use_fast_path=True, # Fused kernel options layer_idx=layer_idx, ) self.norm = norm From 9650d2459821fd4bf28648c1c7d33caed54e0750 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Wed, 13 Nov 2024 23:55:31 +0100 Subject: [PATCH 087/132] including tab_mini --- .../layer_utils/batch_ensemble_layer.py | 11 ++-- .../arch_utils/layer_utils/embedding_layer.py | 2 +- mambular/arch_utils/layer_utils/plr_layer.py | 28 +-------- mambular/arch_utils/layer_utils/sn_linear.py | 29 ++++++++++ mambular/base_models/tabm.py | 57 +++++++++---------- mambular/configs/tabm_config.py | 3 +- 6 files changed, 67 insertions(+), 63 deletions(-) create mode 100644 mambular/arch_utils/layer_utils/sn_linear.py diff --git a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py index ea7330e..d0ed3a4 100644 --- a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py +++ b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py @@ -44,15 +44,17 @@ def __init__( self.bias = ( nn.Parameter(torch.empty(out_features)) if not ensemble_bias and out_features > 0 - else nn.Parameter(torch.empty(ensemble_size, out_features)) - if ensemble_bias - else None + else ( + nn.Parameter(torch.empty(ensemble_size, out_features)) + if ensemble_bias + else None + ) ) # Initialize parameters self.reset_parameters(scaling_init) - def reset_parameters(self, scaling_init: Literal["ones", "random-signs"]): + def reset_parameters(self, scaling_init: Literal["ones", "random-signs", "normal"]): # Initialize W using a uniform distribution nn.init.kaiming_uniform_(self.W, a=math.sqrt(5)) @@ -60,6 +62,7 @@ def reset_parameters(self, scaling_init: Literal["ones", "random-signs"]): scaling_init_fn = { "ones": nn.init.ones_, "random-signs": lambda x: torch.sign(torch.randn_like(x)), + "normal": lambda x: nn.init.normal_(x, mean=0.0, std=1.0), } if self.r is not None: diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 653c137..042ca4e 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -50,7 +50,7 @@ def __init__(self, num_feature_info, cat_feature_info, config): ) elif self.embedding_type == "plr": self.num_embeddings = PeriodicEmbeddings( - n_features=self.seq_len, + n_features=len(num_feature_info), d_embedding=self.d_model, n_frequencies=getattr(config, "n_frequencies", 48), frequency_init_scale=getattr(config, "frequency_init_scale", 0.01), diff --git a/mambular/arch_utils/layer_utils/plr_layer.py b/mambular/arch_utils/layer_utils/plr_layer.py index ad66b7b..f61caf3 100644 --- a/mambular/arch_utils/layer_utils/plr_layer.py +++ b/mambular/arch_utils/layer_utils/plr_layer.py @@ -1,8 +1,8 @@ import torch import torch.nn as nn -from torch import Tensor from torch.nn.parameter import Parameter import math +from .sn_linear import SNLinear class Periodic(nn.Module): @@ -26,32 +26,6 @@ def forward(self, x): return torch.cat([torch.cos(x), torch.sin(x)], dim=-1) -class SNLinear(nn.Module): - """Separate linear layers for each feature embedding.""" - - def __init__(self, n: int, in_features: int, out_features: int) -> None: - super().__init__() - self.weight = Parameter(torch.empty(n, in_features, out_features)) - self.bias = Parameter(torch.empty(n, out_features)) - 
self.reset_parameters() - - def reset_parameters(self) -> None: - d_in_rsqrt = self.weight.shape[-2] ** -0.5 - nn.init.uniform_(self.weight, -d_in_rsqrt, d_in_rsqrt) - nn.init.uniform_(self.bias, -d_in_rsqrt, d_in_rsqrt) - - def forward(self, x): - if x.ndim != 3: - raise ValueError( - "_NLinear requires a 3D input (batch, features, embedding)." - ) - if x.shape[-(self.weight.ndim - 1) :] != self.weight.shape[:-1]: - raise ValueError("Input shape mismatch with weight dimensions.") - - x = x.transpose(0, 1) @ self.weight - return x.transpose(0, 1) + self.bias - - class PeriodicEmbeddings(nn.Module): """Embeddings for continuous features using Periodic + Linear (+ ReLU) transformations. diff --git a/mambular/arch_utils/layer_utils/sn_linear.py b/mambular/arch_utils/layer_utils/sn_linear.py new file mode 100644 index 0000000..10a6943 --- /dev/null +++ b/mambular/arch_utils/layer_utils/sn_linear.py @@ -0,0 +1,29 @@ +import torch +import torch.nn as nn +from torch.nn.parameter import Parameter + + +class SNLinear(nn.Module): + """Separate linear layers for each feature embedding.""" + + def __init__(self, n: int, in_features: int, out_features: int) -> None: + super().__init__() + self.weight = Parameter(torch.empty(n, in_features, out_features)) + self.bias = Parameter(torch.empty(n, out_features)) + self.reset_parameters() + + def reset_parameters(self) -> None: + d_in_rsqrt = self.weight.shape[-2] ** -0.5 + nn.init.uniform_(self.weight, -d_in_rsqrt, d_in_rsqrt) + nn.init.uniform_(self.bias, -d_in_rsqrt, d_in_rsqrt) + + def forward(self, x): + if x.ndim != 3: + raise ValueError( + "SNLinear requires a 3D input (batch, features, embedding)." + ) + if x.shape[-(self.weight.ndim - 1) :] != self.weight.shape[:-1]: + raise ValueError("Input shape mismatch with weight dimensions.") + + x = x.transpose(0, 1) @ self.weight + return x.transpose(0, 1) + self.bias diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py index 2468ecf..4860b84 100644 --- a/mambular/base_models/tabm.py +++ b/mambular/base_models/tabm.py @@ -5,6 +5,7 @@ from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.layer_utils.batch_ensemble_layer import LinearBatchEnsembleLayer +from ..arch_utils.layer_utils.sn_linear import SNLinear class TabM(BaseModel): @@ -138,21 +139,30 @@ def __init__( # Hidden layers with batch ensembling for i in range(1, len(self.layer_sizes)): - self.layers.append( - LinearBatchEnsembleLayer( - in_features=self.layer_sizes[i - 1], - out_features=self.layer_sizes[i], - ensemble_size=config.ensemble_size, - ensemble_scaling_in=config.ensemble_scaling_in, - ensemble_scaling_out=config.ensemble_scaling_out, - ensemble_bias=config.ensemble_bias, - scaling_init=config.scaling_init, + if config.model_type == "mini": + self.layers.append( + LinearBatchEnsembleLayer( + in_features=self.layer_sizes[i - 1], + out_features=self.layer_sizes[i], + ensemble_size=config.ensemble_size, + ensemble_scaling_in=False, + ensemble_scaling_out=False, + ensemble_bias=config.ensemble_bias, + scaling_init="ones", + ) + ) + else: + self.layers.append( + LinearBatchEnsembleLayer( + in_features=self.layer_sizes[i - 1], + out_features=self.layer_sizes[i], + ensemble_size=config.ensemble_size, + ensemble_scaling_in=config.ensemble_scaling_in, + ensemble_scaling_out=config.ensemble_scaling_out, + ensemble_bias=config.ensemble_bias, + scaling_init="ones", + ) ) - ) - if config.batch_norm: - 
self.layers.append(nn.BatchNorm1d(self.layer_sizes[i])) - if config.layer_norm: - self.layers.append(nn.LayerNorm(self.layer_sizes[i])) if config.use_glu: self.layers.append(nn.GLU()) @@ -161,22 +171,9 @@ def __init__( if config.dropout > 0.0: self.layers.append(nn.Dropout(config.dropout)) - # Output layer - self.layers.append( - LinearBatchEnsembleLayer( - in_features=self.layer_sizes[-1], - out_features=num_classes, - ensemble_size=config.ensemble_size, - ensemble_scaling_in=config.ensemble_scaling_in, - ensemble_scaling_out=config.ensemble_scaling_out, - ensemble_bias=config.ensemble_bias, - scaling_init=config.scaling_init, - ) - ) - if not self.hparams.get("average_ensembles", True): - self.final_layer = nn.Linear( - self.layer_sizes[-1] * config.ensemble_size, num_classes + self.final_layer = SNLinear( + config.ensemble_size, self.layer_sizes[-1], num_classes ) def forward(self, num_features, cat_features) -> torch.Tensor: @@ -231,7 +228,7 @@ def forward(self, num_features, cat_features) -> torch.Tensor: # Option 2: Adding a final layer to map to `num_classes` else: - x = x.view(x.size(0), -1) # Flatten ensemble dimension if not averaging + # x = x.view(x.size(0), -1) # Flatten ensemble dimension if not averaging x = self.final_layer(x) # Shape (batch_size, num_classes) return x diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index eafa4b9..c7dab7f 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -87,5 +87,6 @@ class DefaultTabMConfig: ensemble_scaling_in: bool = True ensemble_scaling_out: bool = True ensemble_bias: bool = True - scaling_init: Literal["ones", "random-signs"] = "ones" + scaling_init: Literal["ones", "random-signs", "normal"] = "normal" average_ensembles: bool = True + model_type: Literal["mini", "full"] = "full" From 0cc95017c80ff07e2204e910229f031787414bdb Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 14 Nov 2024 00:31:23 +0100 Subject: [PATCH 088/132] fix multiple predictions in ensemble loss --- .../layer_utils/batch_ensemble_layer.py | 5 +- mambular/arch_utils/rnn_utils.py | 37 ++++- mambular/base_models/basemodel.py | 30 +++- mambular/base_models/batch_tabrnn.py | 15 +- mambular/base_models/lightning_wrapper.py | 4 +- mambular/base_models/tabm.py | 149 +++++++----------- mambular/configs/batchtabrnn_config.py | 3 +- 7 files changed, 129 insertions(+), 114 deletions(-) diff --git a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py index d0ed3a4..2a9e0d5 100644 --- a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py +++ b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py @@ -116,7 +116,7 @@ def __init__( ensemble_scaling_in: bool = True, ensemble_scaling_out: bool = True, ensemble_bias: bool = False, - scaling_init: Literal["ones", "random-signs"] = "ones", + scaling_init: Literal["ones", "random-signs", "normal"] = "ones", ): """ A batch ensemble RNN layer with optional bidirectionality and shared weights. 
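The batch-ensemble layers touched by this patch share one weight matrix across all ensemble members and give each member only cheap element-wise input and output scaling vectors. A conceptual sketch of the linear case, following the general BatchEnsemble idea rather than the exact LinearBatchEnsembleLayer code, with illustrative shapes:

    import torch

    E, D_in, D_out, B = 4, 16, 8, 32    # members, in/out features, batch
    W = torch.randn(D_in, D_out)        # weight shared by all members
    r = torch.randn(E, D_in)            # per-member input scaling
    s = torch.randn(E, D_out)           # per-member output scaling
    x = torch.randn(B, E, D_in)         # one input copy per member
    y = ((x * r) @ W) * s               # (B, E, D_out): one prediction per member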
@@ -173,11 +173,12 @@ def __init__( # Initialize parameters self.reset_parameters(scaling_init) - def reset_parameters(self, scaling_init: Literal["ones", "random-signs"]): + def reset_parameters(self, scaling_init: Literal["ones", "random-signs", "normal"]): # Initialize scaling factors r and s based on selected initialization scaling_init_fn = { "ones": nn.init.ones_, "random-signs": lambda x: torch.sign(torch.randn_like(x)), + "normal": lambda x: nn.init.normal_(x, mean=0.0, std=1.0), } if self.r is not None: diff --git a/mambular/arch_utils/rnn_utils.py b/mambular/arch_utils/rnn_utils.py index 705d4a2..b409dba 100644 --- a/mambular/arch_utils/rnn_utils.py +++ b/mambular/arch_utils/rnn_utils.py @@ -156,6 +156,7 @@ def __init__( self.ensemble_scaling_out = getattr(config, "ensemble_scaling_out", True) self.ensemble_bias = getattr(config, "ensemble_bias", False) self.scaling_init = getattr(config, "scaling_init", "ones") + self.model_type = getattr(config, "model_type", "full") # Convolutional layers self.convs = nn.ModuleList() @@ -185,9 +186,9 @@ def __init__( self.rnns = nn.ModuleList() self.layernorms_rnn = nn.ModuleList() # LayerNorms for RNN layers - for i in range(self.num_layers): - rnn = RNNBatchEnsembleLayer( - input_size=(self.input_size if i == 0 else self.hidden_size), + self.rnns.append( + RNNBatchEnsembleLayer( + input_size=self.input_size, hidden_size=self.hidden_size, ensemble_size=self.ensemble_size, ensemble_scaling_in=self.ensemble_scaling_in, @@ -195,11 +196,37 @@ def __init__( ensemble_bias=self.ensemble_bias, dropout=self.rnn_dropout if i < self.num_layers - 1 else 0, nonlinearity=self.rnn_activation, - scaling_init=self.scaling_init, + scaling_init="normal", ) + ) + + for i in range(1, self.num_layers): + if self.model_type == "mini": + rnn = RNNBatchEnsembleLayer( + input_size=self.hidden_size, + hidden_size=self.hidden_size, + ensemble_size=self.ensemble_size, + ensemble_scaling_in=False, + ensemble_scaling_out=False, + ensemble_bias=self.ensemble_bias, + dropout=self.rnn_dropout if i < self.num_layers - 1 else 0, + nonlinearity=self.rnn_activation, + scaling_init=self.scaling_init, + ) + else: + rnn = RNNBatchEnsembleLayer( + input_size=self.hidden_size, + hidden_size=self.hidden_size, + ensemble_size=self.ensemble_size, + ensemble_scaling_in=self.ensemble_scaling_in, + ensemble_scaling_out=self.ensemble_scaling_out, + ensemble_bias=self.ensemble_bias, + dropout=self.rnn_dropout if i < self.num_layers - 1 else 0, + nonlinearity=self.rnn_activation, + scaling_init=self.scaling_init, + ) self.rnns.append(rnn) - self.layernorms_rnn.append(nn.LayerNorm(self.hidden_size)) def forward(self, x): """ diff --git a/mambular/base_models/basemodel.py b/mambular/base_models/basemodel.py index b18aa3a..b892c3d 100644 --- a/mambular/base_models/basemodel.py +++ b/mambular/base_models/basemodel.py @@ -5,28 +5,46 @@ class BaseModel(nn.Module): - def __init__(self, **kwargs): + def __init__(self, config=None, **kwargs): """ - Initializes the BaseModel with given hyperparameters. + Initializes the BaseModel with a configuration file and optional extra parameters. Parameters ---------- + config : object, optional + Configuration object with model hyperparameters. **kwargs : dict - Hyperparameters to be saved and used in the model. + Additional hyperparameters to be saved. 
""" super(BaseModel, self).__init__() - self.hparams = kwargs + + # Store the configuration object + self.config = config if config is not None else {} + + # Store any additional keyword arguments + self.extra_hparams = kwargs def save_hyperparameters(self, ignore=[]): """ - Saves the hyperparameters while ignoring specified keys. + Saves the configuration and additional hyperparameters while ignoring specified keys. Parameters ---------- ignore : list, optional List of keys to ignore while saving hyperparameters, by default []. """ - self.hparams = {k: v for k, v in self.hparams.items() if k not in ignore} + # Filter the config and extra hparams for ignored keys + config_hparams = ( + {k: v for k, v in vars(self.config).items() if k not in ignore} + if self.config + else {} + ) + extra_hparams = {k: v for k, v in self.extra_hparams.items() if k not in ignore} + + # Merge config and extra hparams + self.hparams = {**config_hparams, **extra_hparams} + + # Set each hyperparameter as an attribute for key, value in self.hparams.items(): setattr(self, key, value) diff --git a/mambular/base_models/batch_tabrnn.py b/mambular/base_models/batch_tabrnn.py index ad49ecb..4680b0b 100644 --- a/mambular/base_models/batch_tabrnn.py +++ b/mambular/base_models/batch_tabrnn.py @@ -7,6 +7,7 @@ from ..arch_utils.rnn_utils import EnsembleConvRNN from ..arch_utils.get_norm_fn import get_normalization_layer from dataclasses import replace +from ..arch_utils.layer_utils.sn_linear import SNLinear class BatchTabRNN(BaseModel): @@ -80,8 +81,6 @@ def __init__( ) self.rnn = EnsembleConvRNN(config=config) - head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLPhead( input_dim=self.hparams.get("dim_feedforward", config.dim_feedforward), config=config, @@ -97,7 +96,9 @@ def __init__( self.norm_f = get_normalization_layer(temp_config) if not self.hparams.get("average_ensembles", True): - self.ensemble_linear = nn.Linear(config.ensemble_size, 1) + self.ensemble_linear = SNLinear( + config.ensemble_size, config.dim_feedforward, num_classes + ) n_inputs = len(num_feature_info) + len(cat_feature_info) self.initialize_pooling_layers(config=config, n_inputs=n_inputs) @@ -136,13 +137,11 @@ def forward(self, num_features, cat_features): if self.hparams.get("average_ensembles", True): # Simple average over ensembles out = out.mean(dim=1) # Shape: (batch_size, hidden_size) + # Final prediction head + preds = self.tabular_head(out) else: # Apply the learned linear combination over ensembles - out = self.ensemble_linear(out.permute(0, 2, 1)).squeeze( - -1 - ) # Shape: (batch_size, hidden_size) + preds = self.ensemble_linear(out) # Shape: (batch_size, hidden_size) - # Final prediction head - preds = self.tabular_head(out) return preds diff --git a/mambular/base_models/lightning_wrapper.py b/mambular/base_models/lightning_wrapper.py index 276b797..bf6d37d 100644 --- a/mambular/base_models/lightning_wrapper.py +++ b/mambular/base_models/lightning_wrapper.py @@ -142,7 +142,9 @@ def compute_loss(self, predictions, y_true): if self.lss: return self.family.compute_loss(predictions, y_true.squeeze(-1)) else: - loss = self.loss_fct(predictions, y_true) + loss = self.loss_fct( + predictions, y_true.unsqueeze(1).expand_as(predictions) + ) return loss def training_step(self, batch, batch_idx): diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py index 4860b84..7f11f45 100644 --- a/mambular/base_models/tabm.py +++ b/mambular/base_models/tabm.py @@ -9,61 +9,6 @@ class 
TabM(BaseModel): - """ - A TabM model for tabular data, integrating feature embeddings, batch ensemble layers, and configurable - architecture for processing categorical and numerical features with options for skip connections, GLU activation, - and dropout. - - Parameters - ---------- - cat_feature_info : dict - Dictionary containing information about categorical features, including their names and dimensions. - num_feature_info : dict - Dictionary containing information about numerical features, including their names and dimensions. - num_classes : int, optional - The number of output classes or target dimensions for regression, by default 1. - config : DefaultTabMConfig, optional - Configuration object containing model hyperparameters such as layer sizes, dropout rates, batch ensemble - settings, activation functions, and normalization settings, by default DefaultTabMConfig(). - **kwargs : dict - Additional keyword arguments for the BaseModel class. - - Attributes - ---------- - layer_sizes : list of int - List specifying the number of units in each layer of the TabM model. - cat_feature_info : dict - Stores categorical feature information. - num_feature_info : dict - Stores numerical feature information. - config : DefaultTabMConfig - Stores the configuration for the TabM model. - layers : nn.ModuleList - List containing the layers of the TabM model, including LinearBatchEnsembleLayer, normalization, activation, - and dropout layers. - skip_connections : bool - Flag indicating whether skip connections are enabled between layers. - use_glu : bool - Flag indicating if gated linear units (GLU) should be used as the activation function. - activation : callable - Activation function applied between layers. - use_embeddings : bool - Flag indicating if embeddings should be used for categorical and numerical features. - embedding_layer : EmbeddingLayer, optional - Embedding layer for features, used if `use_embeddings` is enabled. - norm_f : nn.Module, optional - Normalization layer applied in each batch ensemble layer, if specified in the configuration. - final_layer : nn.Linear, optional - Final linear layer applied when ensemble outputs are not averaged. - - Methods - ------- - forward(num_features, cat_features) -> torch.Tensor - Perform a forward pass through the model, including embedding (if enabled), batch ensemble layers, - optional skip connections, and prediction steps. 
- - """ - def __init__( self, cat_feature_info, @@ -72,22 +17,24 @@ def __init__( config: DefaultTabMConfig = DefaultTabMConfig(), **kwargs, ): - super().__init__(**kwargs) + # Pass config to BaseModel + super().__init__(config=config, **kwargs) + + # Save hparams including config attributes self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.layer_sizes = self.hparams.get("layer_sizes", config.layer_sizes) + # Use self.hparams for configuration attributes + self.layer_sizes = self.hparams.get("layer_sizes", [256, 256]) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.config = config + self.average_ensembles = self.hparams.get("average_ensembles", True) + self.average_embeddings = self.hparams.get("average_embeddings", False) # Initialize layers self.layers = nn.ModuleList() - self.skip_connections = self.hparams.get( - "skip_connections", config.skip_connections - ) - self.use_glu = self.hparams.get("use_glu", config.use_glu) - self.activation = self.hparams.get("activation", config.activation) - self.use_embeddings = self.hparams.get("use_embeddings", config.use_embeddings) + self.use_glu = self.hparams.get("use_glu", False) + self.activation = self.hparams.get("activation", nn.SELU()) + self.use_embeddings = self.hparams.get("use_embeddings", True) # Embedding layer if self.use_embeddings: @@ -115,14 +62,18 @@ def __init__( LinearBatchEnsembleLayer( in_features=input_dim, out_features=self.layer_sizes[0], - ensemble_size=config.ensemble_size, - ensemble_scaling_in=config.ensemble_scaling_in, - ensemble_scaling_out=config.ensemble_scaling_out, - ensemble_bias=config.ensemble_bias, - scaling_init=config.scaling_init, + ensemble_size=self.hparams.get("ensemble_size", config.ensemble_size), + ensemble_scaling_in=self.hparams.get( + "ensemble_scaling_in", config.ensemble_scaling_in + ), + ensemble_scaling_out=self.hparams.get( + "ensemble_scaling_out", config.ensemble_scaling_out + ), + ensemble_bias=self.hparams.get("ensemble_bias", config.ensemble_bias), + scaling_init=self.hparams.get("scaling_init", config.scaling_init), ) ) - if config.batch_norm: + if self.hparams.get("batch_norm", config.batch_norm): self.layers.append(nn.BatchNorm1d(self.layer_sizes[0])) self.norm_f = get_normalization_layer(config) @@ -130,24 +81,28 @@ def __init__( self.layers.append(self.norm_f(self.layer_sizes[0])) # Optional activation and dropout - if config.use_glu: + if self.hparams.get("use_glu", config.use_glu): self.layers.append(nn.GLU()) else: self.layers.append(self.activation) - if config.dropout > 0.0: - self.layers.append(nn.Dropout(config.dropout)) + if self.hparams.get("dropout", config.dropout) > 0.0: + self.layers.append(nn.Dropout(self.hparams.get("dropout", config.dropout))) # Hidden layers with batch ensembling for i in range(1, len(self.layer_sizes)): - if config.model_type == "mini": + if self.hparams.get("model_type", config.model_type) == "mini": self.layers.append( LinearBatchEnsembleLayer( in_features=self.layer_sizes[i - 1], out_features=self.layer_sizes[i], - ensemble_size=config.ensemble_size, + ensemble_size=self.hparams.get( + "ensemble_size", config.ensemble_size + ), ensemble_scaling_in=False, ensemble_scaling_out=False, - ensemble_bias=config.ensemble_bias, + ensemble_bias=self.hparams.get( + "ensemble_bias", config.ensemble_bias + ), scaling_init="ones", ) ) @@ -156,24 +111,38 @@ def __init__( LinearBatchEnsembleLayer( in_features=self.layer_sizes[i - 1], out_features=self.layer_sizes[i], - 
ensemble_size=config.ensemble_size, - ensemble_scaling_in=config.ensemble_scaling_in, - ensemble_scaling_out=config.ensemble_scaling_out, - ensemble_bias=config.ensemble_bias, + ensemble_size=self.hparams.get( + "ensemble_size", config.ensemble_size + ), + ensemble_scaling_in=self.hparams.get( + "ensemble_scaling_in", config.ensemble_scaling_in + ), + ensemble_scaling_out=self.hparams.get( + "ensemble_scaling_out", config.ensemble_scaling_out + ), + ensemble_bias=self.hparams.get( + "ensemble_bias", config.ensemble_bias + ), scaling_init="ones", ) ) - if config.use_glu: + if self.hparams.get("use_glu", config.use_glu): self.layers.append(nn.GLU()) else: self.layers.append(self.activation) - if config.dropout > 0.0: - self.layers.append(nn.Dropout(config.dropout)) + if self.hparams.get("dropout", config.dropout) > 0.0: + self.layers.append( + nn.Dropout(self.hparams.get("dropout", config.dropout)) + ) - if not self.hparams.get("average_ensembles", True): + if self.average_ensembles: + self.final_layer = nn.Linear(self.layer_sizes[-1], num_classes) + else: self.final_layer = SNLinear( - config.ensemble_size, self.layer_sizes[-1], num_classes + self.hparams.get("ensemble_size", config.ensemble_size), + self.layer_sizes[-1], + num_classes, ) def forward(self, num_features, cat_features) -> torch.Tensor: @@ -196,7 +165,7 @@ def forward(self, num_features, cat_features) -> torch.Tensor: if self.use_embeddings: x = self.embedding_layer(num_features, cat_features) # Option 1: Average over feature dimension (N) - if self.hparams.get("average_embeddings", self.config.average_embeddings): + if self.average_embeddings: x = x.mean(dim=1) # Shape: (B, D) # Option 2: Flatten feature and embedding dimensions else: @@ -223,12 +192,10 @@ def forward(self, num_features, cat_features) -> torch.Tensor: x = self.layers[-1](x) # Shape (batch_size, ensemble_size, num_classes) # Option 1: Averaging across ensemble outputs - if self.hparams.get("average_ensembles", True): + if self.average_ensembles: x = x.mean(dim=1) # Shape (batch_size, num_classes) - # Option 2: Adding a final layer to map to `num_classes` - else: - # x = x.view(x.size(0), -1) # Flatten ensemble dimension if not averaging - x = self.final_layer(x) # Shape (batch_size, num_classes) + x = self.final_layer(x) # Shape (batch_size, num_classes) + print(x.shape) return x diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/batchtabrnn_config.py index 089ce54..78ed9ab 100644 --- a/mambular/configs/batchtabrnn_config.py +++ b/mambular/configs/batchtabrnn_config.py @@ -90,6 +90,7 @@ class DefaultBatchTabRNNConfig: ensemble_scaling_in: bool = True ensemble_scaling_out: bool = True ensemble_bias: bool = True - scaling_init: Literal["ones", "random-signs"] = "ones" + scaling_init: Literal["ones", "random-signs", "normal"] = "ones" average_ensembles: bool = True ensemble_first: bool = True + model_type: Literal["mini", "full"] = "full" From 2a9fc9603c52b5c0c12a435a4dc52b979b8089ce Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 14 Nov 2024 14:24:02 +0000 Subject: [PATCH 089/132] adjust how values are retrieved from config --- mambular/arch_utils/mamba_utils/mamba_arch.py | 44 ++++++++++--------- .../arch_utils/mamba_utils/mamba_original.py | 28 ++++++------ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_arch.py b/mambular/arch_utils/mamba_utils/mamba_arch.py index f971865..51da1cf 100644 --- a/mambular/arch_utils/mamba_utils/mamba_arch.py +++ 
b/mambular/arch_utils/mamba_utils/mamba_arch.py @@ -33,29 +33,31 @@ def __init__( self.layers = nn.ModuleList( [ ResidualBlock( - d_model=config.d_model, - expand_factor=config.expand_factor, - bias=config.bias, - d_conv=config.d_conv, - conv_bias=config.conv_bias, - dropout=config.dropout, - dt_rank=config.dt_rank, - d_state=config.d_state, - dt_scale=config.dt_scale, - dt_init=config.dt_init, - dt_max=config.dt_max, - dt_min=config.dt_min, - dt_init_floor=config.dt_init_floor, + d_model=getattr(config, "d_model", 128), + expand_factor=getattr(config, "expand_factor", 4), + bias=getattr(config, "bias", True), + d_conv=getattr(config, "d_conv", 4), + conv_bias=getattr(config, "conv_bias", False), + dropout=getattr(config, "dropout", 0.0), + dt_rank=getattr(config, "dt_rank", "auto"), + d_state=getattr(config, "d_state", 256), + dt_scale=getattr(config, "dt_scale", 1.0), + dt_init=getattr(config, "dt_init", "random"), + dt_max=getattr(config, "dt_max", 0.1), + dt_min=getattr(config, "dt_min", 1e-04), + dt_init_floor=getattr(config, "dt_init_floor", 1e-04), norm=get_normalization_layer(config), - activation=config.activation, - bidirectional=config.bidirectional, - use_learnable_interaction=config.use_learnable_interaction, - layer_norm_eps=config.layer_norm_eps, - AD_weight_decay=config.AD_weight_decay, - BC_layer_norm=config.BC_layer_norm, - use_pscan=config.use_pscan, + activation=getattr(config, "activation", nn.SiLU()), + bidirectional=getattr(config, "bidirectional", False), + use_learnable_interaction=getattr( + config, "use_learnable_interaction", False + ), + layer_norm_eps=getattr(config, "layer_norm_eps", 1e-5), + AD_weight_decay=getattr(config, "AD_weight_decay", True), + BC_layer_norm=getattr(config, "BC_layer_norm", False), + use_pscan=getattr(config, "use_pscan", False), ) - for _ in range(config.n_layers) + for _ in range(getattr(config, "n_layers", 6)) ] ) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index 988abdc..d4a49a8 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -30,7 +30,6 @@ def __init__( bias=False, d_conv=16, conv_bias=True, - dt_rank="auto", d_state=32, dt_scale=1.0, dt_init="random", @@ -147,23 +146,22 @@ def __init__(self, config): self.fwd_layers = nn.ModuleList( [ ResidualBlock( - mamba_version=config.mamba_version, - d_model=config.d_model, - d_state=config.d_state, - d_conv=config.d_conv, + mamba_version=getattr(config, "mamba_version", "mamba2"), + d_model=getattr(config, "d_model", 128), + d_state=getattr(config, "d_state", 256), + d_conv=getattr(config, "d_conv", 4), norm=get_normalization_layer(config), - expand_factor=config.expand_factor, - dt_rank=config.dt_rank, - dt_min=config.dt_min, - dt_max=config.dt_max, - dt_init=config.dt_init, - dt_scale=config.dt_scale, - dt_init_floor=config.dt_init_floor, - conv_bias=config.conv_bias, - bias=config.bias, + expand_factor=getattr(config, "expand_factor", 2), + dt_min=getattr(config, "dt_min", 1e-04), + dt_max=getattr(config, "dt_max", 0.1), + dt_init=getattr(config, "dt_init", "random"), + dt_scale=getattr(config, "dt_scale", 1.0), + dt_init_floor=getattr(config, "dt_init_floor", 1e-04), + conv_bias=getattr(config, "conv_bias", False), + bias=getattr(config, "bias", True), layer_idx=i, ) - for i in range(config.n_layers) + for i in range(getattr(config, "n_layers", 6)) ] ) From 8f3994b72979a3df17d05fc66f4a1080c3f2738c Mon Sep 17 00:00:00 2001 From: AnFreTh Date: 
Thu, 14 Nov 2024 14:24:13 +0000 Subject: [PATCH 090/132] save config as hparams --- mambular/base_models/basemodel.py | 43 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/mambular/base_models/basemodel.py b/mambular/base_models/basemodel.py index b892c3d..b64a03a 100644 --- a/mambular/base_models/basemodel.py +++ b/mambular/base_models/basemodel.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -import os +from argparse import Namespace import logging @@ -40,13 +40,10 @@ def save_hyperparameters(self, ignore=[]): else {} ) extra_hparams = {k: v for k, v in self.extra_hparams.items() if k not in ignore} + config_hparams.update(extra_hparams) - # Merge config and extra hparams - self.hparams = {**config_hparams, **extra_hparams} - - # Set each hyperparameter as an attribute - for key, value in self.hparams.items(): - setattr(self, key, value) + # Merge config and extra hparams and convert to Namespace for dot notation + self.hparams = Namespace(**config_hparams) def save_model(self, path): """ @@ -167,23 +164,23 @@ def print_summary(self): def initialize_pooling_layers(self, config, n_inputs): """ - Initializes the layers needed for learnable pooling methods based on self.pooling_method. + Initializes the layers needed for learnable pooling methods based on self.hparams.pooling_method. """ - if self.pooling_method == "learned_flatten": + if self.hparams.pooling_method == "learned_flatten": # Flattening + Linear layer self.learned_flatten_pooling = nn.Linear( n_inputs * config.dim_feedforward, config.dim_feedforward ) - elif self.pooling_method == "attention": + elif self.hparams.pooling_method == "attention": # Attention-based pooling with learnable attention weights self.attention_weights = nn.Parameter(torch.randn(config.dim_feedforward)) - elif self.pooling_method == "gated": + elif self.hparams.pooling_method == "gated": # Gated pooling with a learned gating layer self.gate_layer = nn.Linear(config.dim_feedforward, config.dim_feedforward) - elif self.pooling_method == "rnn": + elif self.hparams.pooling_method == "rnn": # RNN-based pooling: Use a small RNN (e.g., LSTM) self.pooling_rnn = nn.LSTM( input_size=config.dim_feedforward, @@ -193,7 +190,7 @@ def initialize_pooling_layers(self, config, n_inputs): bidirectional=False, ) - elif self.pooling_method == "conv": + elif self.hparams.pooling_method == "conv": # Conv1D-based pooling with global max pooling self.conv1d_pooling = nn.Conv1d( in_channels=config.dim_feedforward, @@ -204,29 +201,29 @@ def initialize_pooling_layers(self, config, n_inputs): def pool_sequence(self, out): """ - Pools the sequence dimension based on self.pooling_method. + Pools the sequence dimension based on self.hparams.pooling_method. 
""" - if self.pooling_method == "avg": + if self.hparams.pooling_method == "avg": return out.mean( dim=1 ) # Shape: (batch_size, ensemble_size, hidden_size) or (batch_size, hidden_size) - elif self.pooling_method == "max": + elif self.hparams.pooling_method == "max": return out.max(dim=1)[0] - elif self.pooling_method == "sum": + elif self.hparams.pooling_method == "sum": return out.sum(dim=1) - elif self.pooling_method == "last": + elif self.hparams.pooling_method == "last": return out[:, -1, :] - elif self.pooling_method == "cls": + elif self.hparams.pooling_method == "cls": return out[:, 0, :] - elif self.pooling_method == "learned_flatten": + elif self.hparams.pooling_method == "learned_flatten": # Flatten sequence and apply a learned linear layer batch_size, seq_len, hidden_size = out.shape out = out.reshape( batch_size, -1 ) # Shape: (batch_size, seq_len * hidden_size) return self.learned_flatten_pooling(out) # Shape: (batch_size, hidden_size) - elif self.pooling_method == "attention": + elif self.hparams.pooling_method == "attention": # Attention-based pooling attention_scores = torch.einsum( "bsh,h->bs", out, self.attention_weights @@ -238,7 +235,7 @@ def pool_sequence(self, out): dim=1 ) # Weighted sum across the sequence, Shape: (batch_size, hidden_size) return out - elif self.pooling_method == "gated": + elif self.hparams.pooling_method == "gated": # Gated pooling gates = torch.sigmoid( self.gate_layer(out) @@ -246,4 +243,4 @@ def pool_sequence(self, out): out = (out * gates).sum(dim=1) # Shape: (batch_size, hidden_size) return out else: - raise ValueError(f"Invalid pooling method: {self.pooling_method}") + raise ValueError(f"Invalid pooling method: {self.hparams.pooling_method}") From c823188dba823e385bb972c6954e19a7efea0593 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 14 Nov 2024 14:24:39 +0000 Subject: [PATCH 091/132] fix batch-mini rnn arch --- mambular/arch_utils/rnn_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/arch_utils/rnn_utils.py b/mambular/arch_utils/rnn_utils.py index b409dba..b43b9c6 100644 --- a/mambular/arch_utils/rnn_utils.py +++ b/mambular/arch_utils/rnn_utils.py @@ -194,7 +194,7 @@ def __init__( ensemble_scaling_in=self.ensemble_scaling_in, ensemble_scaling_out=self.ensemble_scaling_out, ensemble_bias=self.ensemble_bias, - dropout=self.rnn_dropout if i < self.num_layers - 1 else 0, + dropout=self.rnn_dropout, nonlinearity=self.rnn_activation, scaling_init="normal", ) From 6ca631dc55872c751591ebf5dd18c0fdb6129169 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 14 Nov 2024 14:25:06 +0000 Subject: [PATCH 092/132] adjust models to new self.hparams arch --- mambular/base_models/batch_tabrnn.py | 67 +++++------- mambular/base_models/ft_transformer.py | 12 +-- mambular/base_models/mambatab.py | 11 +- mambular/base_models/mambular.py | 13 +-- mambular/base_models/mlp.py | 60 +++++------ mambular/base_models/ndtf.py | 25 +++-- mambular/base_models/node.py | 38 +++---- mambular/base_models/resnet.py | 45 ++++----- mambular/base_models/tabm.py | 135 ++++++++++++------------- mambular/base_models/tabtransformer.py | 14 +-- mambular/base_models/tabularnn.py | 16 +-- 11 files changed, 193 insertions(+), 243 deletions(-) diff --git a/mambular/base_models/batch_tabrnn.py b/mambular/base_models/batch_tabrnn.py index 4680b0b..120d4a0 100644 --- a/mambular/base_models/batch_tabrnn.py +++ b/mambular/base_models/batch_tabrnn.py @@ -66,13 +66,16 @@ def __init__( config: DefaultBatchTabRNNConfig = DefaultBatchTabRNNConfig(), **kwargs, ): 
- super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + if not self.hparams.average_ensembles: + self.returns_ensemble = True # Directly set ensemble flag + else: + self.returns_ensemble = False + self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) - self.ensemble_first = self.hparams.get("ensemble_first", config.ensemble_first) self.embedding_layer = EmbeddingLayer( num_feature_info=num_feature_info, @@ -81,24 +84,24 @@ def __init__( ) self.rnn = EnsembleConvRNN(config=config) - self.tabular_head = MLPhead( - input_dim=self.hparams.get("dim_feedforward", config.dim_feedforward), - config=config, - output_dim=num_classes, - ) - self.linear = nn.Linear( - self.hparams.get("d_model", config.d_model), - self.hparams.get("dim_feedforward", config.dim_feedforward), + self.hparams.d_model, + self.hparams.dim_feedforward, ) temp_config = replace(config, d_model=config.dim_feedforward) self.norm_f = get_normalization_layer(temp_config) - if not self.hparams.get("average_ensembles", True): - self.ensemble_linear = SNLinear( + if not self.hparams.average_ensembles: + self.tabular_head = SNLinear( config.ensemble_size, config.dim_feedforward, num_classes ) + else: + self.tabular_head = MLPhead( + input_dim=self.hparams.dim_feedforward, + config=config, + output_dim=num_classes, + ) n_inputs = len(num_feature_info) + len(cat_feature_info) self.initialize_pooling_layers(config=config, n_inputs=n_inputs) @@ -111,37 +114,15 @@ def forward(self, num_features, cat_features): x ) # Shape: (batch_size, sequence_length, ensemble_size, hidden_size) - if self.ensemble_first: - # Combine or average over ensembles first, then pool over the sequence - if self.hparams.get("average_ensembles", True): - # Simple average over ensembles - out = out.mean( - dim=2 - ) # Shape: (batch_size, sequence_length, hidden_size) + out = self.pool_sequence(out) # Shape: (batch_size, ensemble_size, hidden_size) + if self.hparams.average_ensembles: + # Simple average over ensembles + out = out.mean(dim=1) # Shape: (batch_size, hidden_size) + # Final prediction head - else: - # Apply the learned linear combination over ensembles - out = self.ensemble_linear(out.permute(0, 1, 3, 2)).squeeze( - -1 - ) # Shape: (batch_size, sequence_length, hidden_size) + preds = self.tabular_head(out) # - # Now pool over the sequence - out = self.pool_sequence(out) # Shape: (batch_size, hidden_size) - - else: - # Pool over the sequence first, then combine or average over ensembles - out = self.pool_sequence( - out - ) # Shape: (batch_size, ensemble_size, hidden_size) - - if self.hparams.get("average_ensembles", True): - # Simple average over ensembles - out = out.mean(dim=1) # Shape: (batch_size, hidden_size) - # Final prediction head - preds = self.tabular_head(out) - - else: - # Apply the learned linear combination over ensembles - preds = self.ensemble_linear(out) # Shape: (batch_size, hidden_size) + if not self.hparams.average_ensembles: + preds = preds.squeeze(-1) return preds diff --git a/mambular/base_models/ft_transformer.py b/mambular/base_models/ft_transformer.py index 7019f38..e02652a 100644 --- a/mambular/base_models/ft_transformer.py +++ b/mambular/base_models/ft_transformer.py @@ -59,10 +59,9 @@ def __init__( config: DefaultFTTransformerConfig = DefaultFTTransformerConfig(), **kwargs, ): - super().__init__(**kwargs) + 
super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - - self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) + self.returns_ensemble = False self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -78,15 +77,12 @@ def __init__( encoder_layer = CustomTransformerEncoderLayer(config=config) self.encoder = nn.TransformerEncoder( encoder_layer, - num_layers=self.hparams.get("n_layers", config.n_layers), + num_layers=self.hparams.n_layers, norm=self.norm_f, ) - # tabular head - head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLPhead( - input_dim=self.hparams.get("d_model", config.d_model), + input_dim=self.hparams.d_model, config=config, output_dim=num_classes, ) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index 3e1da9e..2a87352 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -64,7 +64,7 @@ def __init__( config: DefaultMambaTabConfig = DefaultMambaTabConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) input_dim = 0 @@ -75,20 +75,19 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info + self.returns_ensemble = False self.initial_layer = nn.Linear(input_dim, config.d_model) self.norm_f = LayerNorm(config.d_model) - self.embedding_activation = self.hparams.get( - "num_embedding_activation", config.num_embedding_activation - ) + self.embedding_activation = self.hparams.num_embedding_activation self.axis = config.axis - head_activation = self.hparams.get("head_activation", config.head_activation) + head_activation = self.hparams.head_activation self.tabular_head = MLPhead( - input_dim=self.hparams.get("d_model", config.d_model), + input_dim=self.hparams.d_model, config=config, output_dim=num_classes, ) diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index 04a6e2b..7a01ddf 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -60,13 +60,10 @@ def __init__( config: DefaultMambularConfig = DefaultMambularConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) - self.shuffle_embeddings = self.hparams.get( - "shuffle_embeddings", config.shuffle_embeddings - ) + self.returns_ensemble = False # embedding layer self.embedding_layer = EmbeddingLayer( @@ -82,12 +79,12 @@ def __init__( self.norm_f = get_normalization_layer(config) self.tabular_head = MLPhead( - input_dim=self.hparams.get("d_model", config.d_model), + input_dim=self.hparams.d_model, config=config, output_dim=num_classes, ) - if self.shuffle_embeddings: + if self.hparams.shuffle_embeddings: self.perm = torch.randperm(self.embedding_layer.seq_len) # pooling @@ -112,7 +109,7 @@ def forward(self, num_features, cat_features): """ x = self.embedding_layer(num_features, cat_features) - if self.shuffle_embeddings: + if self.hparams.shuffle_embeddings: x = x[:, self.perm, :] x = self.mamba(x) diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index f9cc315..08005cc 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -64,21 +64,15 @@ def 
__init__( config: DefaultMLPConfig = DefaultMLPConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.layer_sizes = self.hparams.get("layer_sizes", config.layer_sizes) + self.returns_ensemble = False self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info # Initialize layers self.layers = nn.ModuleList() - self.skip_connections = self.hparams.get( - "skip_connections", config.skip_connections - ) - self.use_glu = self.hparams.get("use_glu", config.use_glu) - self.activation = self.hparams.get("activation", config.activation) - self.use_embeddings = self.hparams.get("use_embeddings", config.use_embeddings) input_dim = 0 for feature_name, input_shape in num_feature_info.items(): @@ -86,50 +80,52 @@ def __init__( for feature_name, input_shape in cat_feature_info.items(): input_dim += 1 - if self.use_embeddings: + if self.hparams.use_embeddings: self.embedding_layer = EmbeddingLayer( num_feature_info=num_feature_info, cat_feature_info=cat_feature_info, config=config, ) input_dim = ( - len(num_feature_info) * config.d_model - + len(cat_feature_info) * config.d_model + len(num_feature_info) * self.hparams.d_model + + len(cat_feature_info) * self.hparams.d_model ) # Input layer - self.layers.append(nn.Linear(input_dim, self.layer_sizes[0])) - if config.batch_norm: - self.layers.append(nn.BatchNorm1d(self.layer_sizes[0])) + self.layers.append(nn.Linear(input_dim, self.hparams.layer_sizes[0])) + if self.hparams.batch_norm: + self.layers.append(nn.BatchNorm1d(self.hparams.layer_sizes[0])) self.norm_f = get_normalization_layer(config) if self.norm_f is not None: - self.layers.append(self.norm_f(self.layer_sizes[0])) + self.layers.append(self.norm_f(self.hparams.layer_sizes[0])) - if config.use_glu: + if self.hparams.use_glu: self.layers.append(nn.GLU()) else: - self.layers.append(self.activation) - if config.dropout > 0.0: - self.layers.append(nn.Dropout(config.dropout)) + self.layers.append(self.hparams.activation) + if self.hparams.dropout > 0.0: + self.layers.append(nn.Dropout(self.hparams.dropout)) # Hidden layers - for i in range(1, len(self.layer_sizes)): - self.layers.append(nn.Linear(self.layer_sizes[i - 1], self.layer_sizes[i])) - if config.batch_norm: - self.layers.append(nn.BatchNorm1d(self.layer_sizes[i])) - if config.layer_norm: - self.layers.append(nn.LayerNorm(self.layer_sizes[i])) - if config.use_glu: + for i in range(1, len(self.hparams.layer_sizes)): + self.layers.append( + nn.Linear(self.hparams.layer_sizes[i - 1], self.hparams.layer_sizes[i]) + ) + if self.hparams.batch_norm: + self.layers.append(nn.BatchNorm1d(self.hparams.layer_sizes[i])) + if self.hparams.layer_norm: + self.layers.append(nn.LayerNorm(self.hparams.layer_sizes[i])) + if self.hparams.use_glu: self.layers.append(nn.GLU()) else: - self.layers.append(self.activation) - if config.dropout > 0.0: - self.layers.append(nn.Dropout(config.dropout)) + self.layers.append(self.hparams.activation) + if self.hparams.dropout > 0.0: + self.layers.append(nn.Dropout(self.hparams.dropout)) # Output layer - self.layers.append(nn.Linear(self.layer_sizes[-1], num_classes)) + self.layers.append(nn.Linear(self.hparams.layer_sizes[-1], num_classes)) def forward(self, num_features, cat_features) -> torch.Tensor: """ @@ -145,7 +141,7 @@ def forward(self, num_features, cat_features) -> torch.Tensor: torch.Tensor Output tensor. 
""" - if self.use_embeddings: + if self.hparams.use_embeddings: x = self.embedding_layer(num_features, cat_features) B, S, D = x.shape x = x.reshape(B, S * D) @@ -156,7 +152,7 @@ def forward(self, num_features, cat_features) -> torch.Tensor: for i in range(len(self.layers) - 1): if isinstance(self.layers[i], nn.Linear): out = self.layers[i](x) - if self.skip_connections and x.shape == out.shape: + if self.hparams.skip_connections and x.shape == out.shape: x = x + out else: x = out diff --git a/mambular/base_models/ndtf.py b/mambular/base_models/ndtf.py index 44e3498..794f2ac 100644 --- a/mambular/base_models/ndtf.py +++ b/mambular/base_models/ndtf.py @@ -32,7 +32,7 @@ class NDTF(BaseModel): num_feature_info : dict Stores numerical feature information. penalty_factor : float - Scaling factor for the penalty applied during training, specified in the config. + Scaling factor for the penalty applied during training, specified in the self.hparams. input_dimensions : list of int List of input dimensions for each tree in the ensemble, with random sampling. trees : nn.ModuleList @@ -59,12 +59,12 @@ def __init__( config: DefaultNDTFConfig = DefaultNDTFConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.penalty_factor = config.penalty_factor + self.returns_ensemble = False input_dim = 0 for feature_name, input_shape in num_feature_info.items(): @@ -74,20 +74,23 @@ def __init__( self.input_dimensions = [input_dim] - for _ in range(config.n_ensembles - 1): + for _ in range(self.hparams.n_ensembles - 1): self.input_dimensions.append(np.random.randint(1, input_dim)) self.trees = nn.ModuleList( [ NeuralDecisionTree( input_dim=self.input_dimensions[idx], - depth=np.random.randint(config.min_depth, config.max_depth), + depth=np.random.randint( + self.hparams.min_depth, self.hparams.max_depth + ), output_dim=num_classes, - lamda=config.lamda, - temperature=config.temperature + np.abs(np.random.normal(0, 0.1)), - node_sampling=config.node_sampling, + lamda=self.hparams.lamda, + temperature=self.hparams.temperature + + np.abs(np.random.normal(0, 0.1)), + node_sampling=self.hparams.node_sampling, ) - for idx in range(config.n_ensembles) + for idx in range(self.hparams.n_ensembles) ] ) @@ -101,7 +104,7 @@ def __init__( ) self.tree_weights = nn.Parameter( - torch.full((config.n_ensembles, 1), 1.0 / config.n_ensembles), + torch.full((self.hparams.n_ensembles, 1), 1.0 / self.hparams.n_ensembles), requires_grad=True, ) @@ -168,4 +171,4 @@ def penalty_forward(self, num_features, cat_features) -> torch.Tensor: # Stack predictions and calculate mean across trees preds = torch.stack(preds, dim=1).squeeze(-1) - return preds @ self.tree_weights, self.penalty_factor * penalty + return preds @ self.tree_weights, self.hparams.penalty_factor * penalty diff --git a/mambular/base_models/node.py b/mambular/base_models/node.py index e49d7e0..0ed3f4e 100644 --- a/mambular/base_models/node.py +++ b/mambular/base_models/node.py @@ -58,41 +58,41 @@ def __init__( config: DefaultNODEConfig = DefaultNODEConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.returns_ensemble = False + self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.use_embeddings = 
self.hparams.get("use_embeddings", config.use_embeddings) - - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += input_shape - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 - if self.use_embeddings: + if self.hparams.use_embeddings: input_dim = ( - len(num_feature_info) * config.d_model - + len(cat_feature_info) * config.d_model + len(num_feature_info) * self.hparams.d_model + + len(cat_feature_info) * self.hparams.d_model ) self.embedding_layer = EmbeddingLayer(config) + else: + input_dim = 0 + for feature_name, input_shape in num_feature_info.items(): + input_dim += input_shape + for feature_name, input_shape in cat_feature_info.items(): + input_dim += 1 + self.d_out = num_classes self.block = DenseBlock( input_dim=input_dim, - num_layers=config.num_layers, - layer_dim=config.layer_dim, - depth=config.depth, - tree_dim=config.tree_dim, + num_layers=self.hparams.num_layers, + layer_dim=self.hparams.layer_dim, + depth=self.hparams.depth, + tree_dim=self.hparams.tree_dim, flatten_output=True, ) - head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLPhead( - input_dim=config.num_layers * config.layer_dim, + input_dim=self.hparams.num_layers * self.hparams.layer_dim, config=config, output_dim=num_classes, ) @@ -113,7 +113,7 @@ def forward(self, num_features, cat_features): torch.Tensor Model output of shape [batch_size, num_classes]. """ - if self.use_embeddings: + if self.hparams.use_embeddings: x = self.embedding_layer(num_features, cat_features) B, S, D = x.shape x = x.reshape(B, S * D) diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index e7b018a..549ca41 100644 --- a/mambular/base_models/resnet.py +++ b/mambular/base_models/resnet.py @@ -66,27 +66,19 @@ def __init__( config: DefaultResNetConfig = DefaultResNetConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.layer_sizes = self.hparams.get("layer_sizes", config.layer_sizes) + self.returns_ensemble = False self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.activation = config.activation - self.use_embeddings = self.hparams.get("use_embeddings", config.use_embeddings) - - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += input_shape - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 self.norm_f = get_normalization_layer(config) - if self.use_embeddings: + if self.hparams.use_embeddings: input_dim = ( - len(num_feature_info) * config.d_model - + len(cat_feature_info) * config.d_model + len(num_feature_info) * self.hparams.d_model + + len(cat_feature_info) * self.hparams.d_model ) # embedding layer self.embedding_layer = EmbeddingLayer( @@ -95,29 +87,36 @@ def __init__( config=config, ) - self.initial_layer = nn.Linear(input_dim, self.layer_sizes[0]) + else: + input_dim = 0 + for feature_name, input_shape in num_feature_info.items(): + input_dim += input_shape + for feature_name, input_shape in cat_feature_info.items(): + input_dim += 1 + + self.initial_layer = nn.Linear(input_dim, self.hparams.layer_sizes[0]) self.blocks = nn.ModuleList() - for i in range(config.num_blocks): - input_dim = self.layer_sizes[i] + for i in range(self.hparams.num_blocks): + input_dim = self.hparams.layer_sizes[i] output_dim = ( - self.layer_sizes[i + 1] - if i + 1 < len(self.layer_sizes) - 
else self.layer_sizes[-1] + self.hparams.layer_sizes[i + 1] + if i + 1 < len(self.hparams.layer_sizes) + else self.hparams.layer_sizes[-1] ) block = ResidualBlock( input_dim, output_dim, - self.activation, + self.hparams.activation, self.norm_f, - config.dropout, + self.hparams.dropout, ) self.blocks.append(block) - self.output_layer = nn.Linear(self.layer_sizes[-1], num_classes) + self.output_layer = nn.Linear(self.hparams.layer_sizes[-1], num_classes) def forward(self, num_features, cat_features): - if self.use_embeddings: + if self.hparams.use_embeddings: x = self.embedding_layer(num_features, cat_features) B, S, D = x.shape x = x.reshape(B, S * D) diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py index 7f11f45..e3a624c 100644 --- a/mambular/base_models/tabm.py +++ b/mambular/base_models/tabm.py @@ -22,35 +22,28 @@ def __init__( # Save hparams including config attributes self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + if not self.hparams.average_ensembles: + self.returns_ensemble = True # Directly set ensemble flag + else: + self.returns_ensemble = False - # Use self.hparams for configuration attributes - self.layer_sizes = self.hparams.get("layer_sizes", [256, 256]) - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info - self.average_ensembles = self.hparams.get("average_ensembles", True) - self.average_embeddings = self.hparams.get("average_embeddings", False) - - # Initialize layers + # Initialize layers based on self.hparams self.layers = nn.ModuleList() - self.use_glu = self.hparams.get("use_glu", False) - self.activation = self.hparams.get("activation", nn.SELU()) - self.use_embeddings = self.hparams.get("use_embeddings", True) - # Embedding layer - if self.use_embeddings: + # Conditionally initialize EmbeddingLayer based on self.hparams + if self.hparams.use_embeddings: self.embedding_layer = EmbeddingLayer( num_feature_info=num_feature_info, cat_feature_info=cat_feature_info, config=config, ) - if self.hparams.get("average_embeddings", config.average_embeddings): - input_dim = self.hparams.get("d_model", config.d_model) + if self.hparams.average_embeddings: + input_dim = self.hparams.d_model else: input_dim = ( len(num_feature_info) + len(cat_feature_info) ) * config.d_model - print(input_dim) else: # Calculate input dimension @@ -61,87 +54,77 @@ def __init__( self.layers.append( LinearBatchEnsembleLayer( in_features=input_dim, - out_features=self.layer_sizes[0], - ensemble_size=self.hparams.get("ensemble_size", config.ensemble_size), - ensemble_scaling_in=self.hparams.get( - "ensemble_scaling_in", config.ensemble_scaling_in - ), - ensemble_scaling_out=self.hparams.get( - "ensemble_scaling_out", config.ensemble_scaling_out - ), - ensemble_bias=self.hparams.get("ensemble_bias", config.ensemble_bias), - scaling_init=self.hparams.get("scaling_init", config.scaling_init), + out_features=self.hparams.layer_sizes[0], + ensemble_size=self.hparams.ensemble_size, + ensemble_scaling_in=self.hparams.ensemble_scaling_in, + ensemble_scaling_out=self.hparams.ensemble_scaling_out, + ensemble_bias=self.hparams.ensemble_bias, + scaling_init=self.hparams.scaling_init, ) ) - if self.hparams.get("batch_norm", config.batch_norm): - self.layers.append(nn.BatchNorm1d(self.layer_sizes[0])) + if self.hparams.batch_norm: + self.layers.append(nn.BatchNorm1d(self.hparams.layer_sizes[0])) self.norm_f = get_normalization_layer(config) if self.norm_f is not None: - self.layers.append(self.norm_f(self.layer_sizes[0])) + 
self.layers.append(self.norm_f(self.hparams.layer_sizes[0])) # Optional activation and dropout - if self.hparams.get("use_glu", config.use_glu): + if self.hparams.use_glu: self.layers.append(nn.GLU()) else: - self.layers.append(self.activation) - if self.hparams.get("dropout", config.dropout) > 0.0: - self.layers.append(nn.Dropout(self.hparams.get("dropout", config.dropout))) + self.layers.append( + self.hparams.activation + if hasattr(self.hparams, "activation") + else nn.SELU() + ) + if self.hparams.dropout > 0.0: + self.layers.append(nn.Dropout(self.hparams.dropout)) # Hidden layers with batch ensembling - for i in range(1, len(self.layer_sizes)): - if self.hparams.get("model_type", config.model_type) == "mini": + for i in range(1, len(self.hparams.layer_sizes)): + if self.hparams.model_type == "mini": self.layers.append( LinearBatchEnsembleLayer( - in_features=self.layer_sizes[i - 1], - out_features=self.layer_sizes[i], - ensemble_size=self.hparams.get( - "ensemble_size", config.ensemble_size - ), + in_features=self.hparams.layer_sizes[i - 1], + out_features=self.hparams.layer_sizes[i], + ensemble_size=self.hparams.ensemble_size, ensemble_scaling_in=False, ensemble_scaling_out=False, - ensemble_bias=self.hparams.get( - "ensemble_bias", config.ensemble_bias - ), + ensemble_bias=self.hparams.ensemble_bias, scaling_init="ones", ) ) else: self.layers.append( LinearBatchEnsembleLayer( - in_features=self.layer_sizes[i - 1], - out_features=self.layer_sizes[i], - ensemble_size=self.hparams.get( - "ensemble_size", config.ensemble_size - ), - ensemble_scaling_in=self.hparams.get( - "ensemble_scaling_in", config.ensemble_scaling_in - ), - ensemble_scaling_out=self.hparams.get( - "ensemble_scaling_out", config.ensemble_scaling_out - ), - ensemble_bias=self.hparams.get( - "ensemble_bias", config.ensemble_bias - ), + in_features=self.hparams.layer_sizes[i - 1], + out_features=self.hparams.layer_sizes[i], + ensemble_size=self.hparams.ensemble_size, + ensemble_scaling_in=self.hparams.ensemble_scaling_in, + ensemble_scaling_out=self.hparams.ensemble_scaling_out, + ensemble_bias=self.hparams.ensemble_bias, scaling_init="ones", ) ) - if self.hparams.get("use_glu", config.use_glu): + if self.hparams.use_glu: self.layers.append(nn.GLU()) else: - self.layers.append(self.activation) - if self.hparams.get("dropout", config.dropout) > 0.0: self.layers.append( - nn.Dropout(self.hparams.get("dropout", config.dropout)) + self.hparams.activation + if hasattr(self.hparams, "activation") + else nn.SELU() ) + if self.hparams.dropout > 0.0: + self.layers.append(nn.Dropout(self.hparams.dropout)) - if self.average_ensembles: - self.final_layer = nn.Linear(self.layer_sizes[-1], num_classes) + if self.hparams.average_ensembles: + self.final_layer = nn.Linear(self.hparams.layer_sizes[-1], num_classes) else: self.final_layer = SNLinear( - self.hparams.get("ensemble_size", config.ensemble_size), - self.layer_sizes[-1], + self.hparams.ensemble_size, + self.hparams.layer_sizes[-1], num_classes, ) @@ -162,10 +145,10 @@ def forward(self, num_features, cat_features) -> torch.Tensor: Output tensor. 
""" # Handle embeddings if used - if self.use_embeddings: + if self.hparams.use_embeddings: x = self.embedding_layer(num_features, cat_features) # Option 1: Average over feature dimension (N) - if self.average_embeddings: + if self.hparams.average_embeddings: x = x.mean(dim=1) # Shape: (B, D) # Option 2: Flatten feature and embedding dimensions else: @@ -181,7 +164,11 @@ def forward(self, num_features, cat_features) -> torch.Tensor: if isinstance(self.layers[i], LinearBatchEnsembleLayer): out = self.layers[i](x) # `out` shape is expected to be (batch_size, ensemble_size, out_features) - if self.skip_connections and x.shape == out.shape: + if ( + hasattr(self, "skip_connections") + and self.skip_connections + and x.shape == out.shape + ): x = x + out else: x = out @@ -192,10 +179,14 @@ def forward(self, num_features, cat_features) -> torch.Tensor: x = self.layers[-1](x) # Shape (batch_size, ensemble_size, num_classes) # Option 1: Averaging across ensemble outputs - if self.average_ensembles: - x = x.mean(dim=1) # Shape (batch_size, num_classes) + if self.hparams.average_ensembles: + x = x.mean(axis=1) # Shape (batch_size, num_classes) + + x = self.final_layer( + x + ) # Shape (batch_size, (ensemble_size), num_classes) if not averaged - x = self.final_layer(x) # Shape (batch_size, num_classes) + if not self.hparams.average_ensembles: + x = x.squeeze(-1) - print(x.shape) return x diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index acb71b7..d14923e 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -68,14 +68,14 @@ def __init__( config: DefaultTabTransformerConfig = DefaultTabTransformerConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) if cat_feature_info == {}: raise ValueError( "You are trying to fit a TabTransformer with no categorical features. Try using a different model that is better suited for tasks without categorical features." 
) - self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) + self.returns_ensemble = False self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -91,16 +91,14 @@ def __init__( encoder_layer = CustomTransformerEncoderLayer(config=config) self.encoder = nn.TransformerEncoder( encoder_layer, - num_layers=self.hparams.get("n_layers", config.n_layers), + num_layers=self.hparams.n_layers, norm=self.norm_f, ) - head_activation = self.hparams.get("head_activation", config.head_activation) - mlp_input_dim = 0 for feature_name, input_shape in num_feature_info.items(): mlp_input_dim += input_shape - mlp_input_dim += config.d_model + mlp_input_dim += self.hparams.d_model self.tabular_head = MLPhead( input_dim=mlp_input_dim, @@ -108,10 +106,6 @@ def __init__( output_dim=num_classes, ) - self.cls_token = nn.Parameter( - torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) - ) - # pooling n_inputs = len(num_feature_info) + len(cat_feature_info) self.initialize_pooling_layers(config=config, n_inputs=n_inputs) diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index 56b3b1c..f3191e2 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -18,14 +18,10 @@ def __init__( config: DefaultTabulaRNNConfig = DefaultTabulaRNNConfig(), **kwargs, ): - super().__init__(**kwargs) + super().__init__(config=config, **kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - self.lr = self.hparams.get("lr", config.lr) - self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) - self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) - self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) - self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) + self.returns_ensemble = False self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -37,17 +33,15 @@ def __init__( config=config, ) - head_activation = self.hparams.get("head_activation", config.head_activation) - self.tabular_head = MLPhead( - input_dim=self.hparams.get("dim_feedforward", config.dim_feedforward), + input_dim=self.hparams.dim_feedforward, config=config, output_dim=num_classes, ) self.linear = nn.Linear( - self.hparams.get("d_model", config.d_model), - self.hparams.get("dim_feedforward", config.dim_feedforward), + self.hparams.d_model, + self.hparams.dim_feedforward, ) temp_config = replace(config, d_model=config.dim_feedforward) From e8b123cafad79d1d38e7122a10d4f4aad81fae4e Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 14 Nov 2024 14:25:15 +0000 Subject: [PATCH 093/132] include ensembling loss --- mambular/base_models/lightning_wrapper.py | 43 +++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/mambular/base_models/lightning_wrapper.py b/mambular/base_models/lightning_wrapper.py index bf6d37d..b06f966 100644 --- a/mambular/base_models/lightning_wrapper.py +++ b/mambular/base_models/lightning_wrapper.py @@ -130,9 +130,9 @@ def compute_loss(self, predictions, y_true): Parameters ---------- predictions : Tensor - Model predictions. + Model predictions. Shape: (batch_size, k, output_dim) for ensembles, or (batch_size, output_dim) otherwise. y_true : Tensor - True labels. + True labels. Shape: (batch_size, output_dim). Returns ------- @@ -140,12 +140,35 @@ def compute_loss(self, predictions, y_true): Computed loss. 
""" if self.lss: - return self.family.compute_loss(predictions, y_true.squeeze(-1)) + if getattr(self.base_model, "returns_ensemble", False): + loss = 0.0 + for ensemble_member in range(predictions.shape[1]): + loss += self.family.compute_loss( + predictions[:, ensemble_member], y_true.squeeze(-1) + ) + return loss + else: + return self.family.compute_loss(predictions, y_true.squeeze(-1)) + + if getattr(self.base_model, "returns_ensemble", False): # Ensemble case + if ( + self.loss_fct.__class__.__name__ == "CrossEntropyLoss" + and predictions.dim() == 3 + ): + # Classification case with ensemble: predictions (N, E, k), y_true (N,) + N, E, k = predictions.shape + loss = 0.0 + for ensemble_member in range(E): + loss += self.loss_fct(predictions[:, ensemble_member, :], y_true) + return loss + + else: + # Regression case with ensemble (e.g., MSE) or other compatible losses + y_true_expanded = y_true.expand_as(predictions) + return self.loss_fct(predictions, y_true_expanded) else: - loss = self.loss_fct( - predictions, y_true.unsqueeze(1).expand_as(predictions) - ) - return loss + # Non-ensemble case + return self.loss_fct(predictions, y_true) def training_step(self, batch, batch_idx): """ @@ -181,7 +204,7 @@ def training_step(self, batch, batch_idx): ) # Log additional metrics - if not self.lss: + if not self.lss and not self.base_model.returns_ensemble: if self.num_classes > 1: acc = self.acc(preds, labels) self.log( @@ -226,7 +249,7 @@ def validation_step(self, batch, batch_idx): ) # Log additional metrics - if not self.lss: + if not self.lss and not self.base_model.returns_ensemble: if self.num_classes > 1: acc = self.acc(preds, labels) self.log( @@ -270,7 +293,7 @@ def test_step(self, batch, batch_idx): ) # Log additional metrics - if not self.lss: + if not self.lss and not self.base_model.returns_ensemble: if self.num_classes > 1: acc = self.acc(preds, labels) self.log( From 98b873f27568e5537d055e1b5c90a6e6d8c86038 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 14 Nov 2024 14:26:03 +0000 Subject: [PATCH 094/132] adjust configs to new params --- mambular/configs/batchtabrnn_config.py | 5 ++--- mambular/configs/mambatab_config.py | 1 + mambular/configs/tabm_config.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/batchtabrnn_config.py index 78ed9ab..40880dc 100644 --- a/mambular/configs/batchtabrnn_config.py +++ b/mambular/configs/batchtabrnn_config.py @@ -91,6 +91,5 @@ class DefaultBatchTabRNNConfig: ensemble_scaling_out: bool = True ensemble_bias: bool = True scaling_init: Literal["ones", "random-signs", "normal"] = "ones" - average_ensembles: bool = True - ensemble_first: bool = True - model_type: Literal["mini", "full"] = "full" + average_ensembles: bool = False + model_type: Literal["mini", "full"] = "mini" diff --git a/mambular/configs/mambatab_config.py b/mambular/configs/mambatab_config.py index 4927c35..a8bd506 100644 --- a/mambular/configs/mambatab_config.py +++ b/mambular/configs/mambatab_config.py @@ -98,3 +98,4 @@ class DefaultMambaTabConfig: axis: int = 1 use_pscan: bool = False mamba_version: str = "mamba-torch" + bidirectional = False diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index c7dab7f..dac8398 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -88,5 +88,5 @@ class DefaultTabMConfig: ensemble_scaling_out: bool = True ensemble_bias: bool = True scaling_init: Literal["ones", "random-signs", "normal"] = "normal" - 
average_ensembles: bool = True
-    model_type: Literal["mini", "full"] = "full"
+    average_ensembles: bool = False
+    model_type: Literal["mini", "full"] = "mini"

From 3cfa7ba1fc9efaf612c007fa41d9276fabdb8f97 Mon Sep 17 00:00:00 2001
From: AnFreTh
Date: Thu, 14 Nov 2024 14:26:20 +0000
Subject: [PATCH 095/132] sklearn classes for new ensembling logic

---
 mambular/models/sklearn_base_classifier.py | 10 ++++++++--
 mambular/models/sklearn_base_lss.py        | 10 ++++++++--
 mambular/models/sklearn_base_regressor.py  | 10 ++++++++--
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py
index 2c7f033..5db8bcf 100644
--- a/mambular/models/sklearn_base_classifier.py
+++ b/mambular/models/sklearn_base_classifier.py
@@ -420,7 +420,7 @@ def fit(
 
         return self
 
-    def predict(self, X):
+    def predict(self, X, device=None):
         """
         Predicts target values for the given input samples.
 
@@ -443,7 +443,8 @@ def predict(self, X):
         cat_tensors, num_tensors = self.data_module.preprocess_test_data(X)
 
         # Move tensors to appropriate device
-        device = next(self.task_model.parameters()).device
+        if device is None:
+            device = next(self.task_model.parameters()).device
         if isinstance(cat_tensors, list):
             cat_tensors = [tensor.to(device) for tensor in cat_tensors]
         else:
@@ -461,6 +462,11 @@ def predict(self, X):
         with torch.no_grad():
             logits = self.task_model(num_features=num_tensors, cat_features=cat_tensors)
 
+        # Check if ensemble is used
+        if self.task_model.base_model.returns_ensemble:  # If using ensemble
+            # Average logits across the ensemble dimension (assuming shape: (batch_size, ensemble_size, output_dim))
+            logits = logits.mean(dim=1)
+
         # Check the shape of the logits to determine binary or multi-class classification
         if logits.shape[1] == 1:
             # Binary classification
diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py
index f6191b5..6631eed 100644
--- a/mambular/models/sklearn_base_lss.py
+++ b/mambular/models/sklearn_base_lss.py
@@ -461,7 +461,7 @@ def fit(
 
         return self
 
-    def predict(self, X, raw=False):
+    def predict(self, X, raw=False, device=None):
         """
         Predicts target values for the given input samples.
 
@@ -484,7 +484,8 @@ def predict(self, X):
         cat_tensors, num_tensors = self.data_module.preprocess_test_data(X)
 
         # Move tensors to appropriate device
-        device = next(self.task_model.parameters()).device
+        if device is None:
+            device = next(self.task_model.parameters()).device
         if isinstance(cat_tensors, list):
             cat_tensors = [tensor.to(device) for tensor in cat_tensors]
         else:
@@ -504,6 +505,11 @@ def predict(self, X):
             num_features=num_tensors, cat_features=cat_tensors
         )
 
+        # Check if ensemble is used
+        if getattr(self.base_model, "returns_ensemble", False):  # If using ensemble
+            # Average over the ensemble dimension (assuming shape: (batch_size, ensemble_size, output_dim))
+            predictions = predictions.mean(dim=1)
+
         if not raw:
             return self.task_model.family(predictions).cpu().numpy()
 
diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py
index 874aa45..8bc3048 100644
--- a/mambular/models/sklearn_base_regressor.py
+++ b/mambular/models/sklearn_base_regressor.py
@@ -416,7 +416,7 @@ def fit(
 
         return self
 
-    def predict(self, X):
+    def predict(self, X, device=None):
         """
         Predicts target values for the given input samples.
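# Illustrative usage sketch of the `device` keyword introduced in the
# predict() methods above. Standalone example with synthetic data; model
# name and fit() defaults are assumptions, not taken from the diffs.
import numpy as np
import pandas as pd
from mambular.models import MambularRegressor

X = pd.DataFrame(np.random.randn(128, 3), columns=["a", "b", "c"])
y = np.random.randn(128)

model = MambularRegressor()
model.fit(X, y)

preds = model.predict(X)                    # device inferred from the fitted model
preds_cpu = model.predict(X, device="cpu")  # force inference onto the CPU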
@@ -439,7 +439,8 @@ def predict(self, X):
         cat_tensors, num_tensors = self.data_module.preprocess_test_data(X)
 
         # Move tensors to appropriate device
-        device = next(self.task_model.parameters()).device
+        if device is None:
+            device = next(self.task_model.parameters()).device
         if isinstance(cat_tensors, list):
             cat_tensors = [tensor.to(device) for tensor in cat_tensors]
         else:
@@ -459,6 +460,11 @@ def predict(self, X):
             num_features=num_tensors, cat_features=cat_tensors
         )
 
+        # Check if ensemble is used
+        if self.task_model.base_model.returns_ensemble:  # If using ensemble
+            # Average over the ensemble dimension (assuming shape: (batch_size, ensemble_size, output_dim))
+            predictions = predictions.mean(dim=1)
+
         # Convert predictions to NumPy array and return
         return predictions.cpu().numpy()
 

From bab7f1197657f3a29042d4c596914b27906fbe68 Mon Sep 17 00:00:00 2001
From: AnFreTh
Date: Thu, 14 Nov 2024 15:51:45 +0000
Subject: [PATCH 096/132] fix proba prediction for ensembles

---
 mambular/models/sklearn_base_classifier.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py
index 5db8bcf..022a072 100644
--- a/mambular/models/sklearn_base_classifier.py
+++ b/mambular/models/sklearn_base_classifier.py
@@ -466,6 +466,10 @@ def predict(self, X, device=None):
         if self.task_model.base_model.returns_ensemble:  # If using ensemble
             # Average logits across the ensemble dimension (assuming shape: (batch_size, ensemble_size, output_dim))
             logits = logits.mean(dim=1)
+            if (
+                logits.dim() == 1
+            ):  # Check if logits has only one dimension (shape (N,))
+                logits = logits.unsqueeze(1)
 
         # Check the shape of the logits to determine binary or multi-class classification
         if logits.shape[1] == 1:
@@ -480,7 +484,7 @@ def predict(self, X, device=None):
         # Convert predictions to NumPy array and return
         return predictions.cpu().numpy()
 
-    def predict_proba(self, X):
+    def predict_proba(self, X, device=None):
         """
         Predict class probabilities for the given input samples.
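# Shape sketch for the ensemble handling added to predict()/predict_proba()
# above. Standalone illustration assuming logits of shape
# (batch_size, ensemble_size, n_classes), as the in-line comments state.
import torch

batch_size, ensemble_size, n_classes = 8, 32, 3
logits = torch.randn(batch_size, ensemble_size, n_classes)

mean_logits = logits.mean(dim=1)                  # -> (batch_size, n_classes)
probabilities = torch.softmax(mean_logits, dim=1)
predictions = torch.argmax(mean_logits, dim=1)

# If each ensemble member emits a single logit without a trailing class
# dimension, averaging collapses the tensor to (batch_size,); the unsqueeze
# guard above restores the class dimension expected by the shape check.
binary_logits = torch.randn(batch_size, ensemble_size).mean(dim=1)
if binary_logits.dim() == 1:
    binary_logits = binary_logits.unsqueeze(1)    # -> (batch_size, 1)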
@@ -540,6 +544,14 @@ def predict_proba(self, X): # Perform inference with torch.no_grad(): logits = self.task_model(num_features=num_tensors, cat_features=cat_tensors) + # Check if ensemble is used + if self.task_model.base_model.returns_ensemble: # If using ensemble + # Average logits across the ensemble dimension (assuming shape: (batch_size, ensemble_size, output_dim)) + logits = logits.mean(dim=1) + if ( + logits.dim() == 1 + ): # Check if logits has only one dimension (shape (N,)) + logits = logits.unsqueeze(1) if logits.shape[1] > 1: probabilities = torch.softmax(logits, dim=1) else: From 6a83537411a577e5b006aee910425b5cf8df7aeb Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 14 Nov 2024 16:51:52 +0000 Subject: [PATCH 097/132] add positional_invariance layer --- .../layer_utils/invariance_layer.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 mambular/arch_utils/layer_utils/invariance_layer.py diff --git a/mambular/arch_utils/layer_utils/invariance_layer.py b/mambular/arch_utils/layer_utils/invariance_layer.py new file mode 100644 index 0000000..2eddf7f --- /dev/null +++ b/mambular/arch_utils/layer_utils/invariance_layer.py @@ -0,0 +1,93 @@ +import torch +import torch.nn as nn + + +class LearnableFourierFeatures(nn.Module): + def __init__(self, num_features=64, d_model=512): + super().__init__() + self.freqs = nn.Parameter(torch.randn(num_features, d_model)) + self.phases = nn.Parameter(torch.randn(num_features) * 2 * torch.pi) + + def forward(self, input): + B, K, D = input.shape + positions = torch.arange(K, device=input.device).unsqueeze(1) + encoding = torch.sin(positions * self.freqs.T + self.phases) + return input + encoding.unsqueeze(0).expand(B, K, -1) + + +class LearnableFourierMask(nn.Module): + def __init__(self, sequence_length, keep_ratio=0.5): + super().__init__() + cutoff_index = int(sequence_length * keep_ratio) + self.mask = nn.Parameter(torch.ones(sequence_length)) + self.mask[cutoff_index:] = 0 # Start with a low-frequency cutoff + + def forward(self, input): + B, K, D = input.shape + freq_repr = torch.fft.fft(input, dim=1) + masked_freq = freq_repr * self.mask.unsqueeze(1) # Apply learnable mask + return torch.fft.ifft(masked_freq, dim=1).real + + +class LearnableRandomPositionalPerturbation(nn.Module): + def __init__(self, num_features=64, d_model=512): + super().__init__() + self.freqs = nn.Parameter(torch.randn(num_features)) + self.amplitude = nn.Parameter(torch.tensor(0.1)) + + def forward(self, input): + B, K, D = input.shape + positions = torch.arange(K, device=input.device).unsqueeze(1) + random_features = torch.sin(positions * self.freqs.T) + perturbation = random_features.unsqueeze(0).expand(B, K, D) * self.amplitude + return input + perturbation + + +class LearnableRandomProjection(nn.Module): + def __init__(self, d_model=512, projection_dim=64): + super().__init__() + self.projection_matrix = nn.Parameter(torch.randn(d_model, projection_dim)) + + def forward(self, input): + return torch.einsum("bkd,dp->bkp", input, self.projection_matrix) + + +class PositionalInvariance(nn.Module): + def __init__(self, config, invariance_type, seq_len, in_channels=None): + super().__init__() + # Select the appropriate layer based on config.invariance_type + if invariance_type == "lfm": # Learnable Fourier Mask + self.layer = LearnableFourierMask( + sequence_length=seq_len, keep_ratio=getattr(config, "keep_ratio", 0.5) + ) + elif invariance_type == "lff": # Learnable Fourier Features + self.layer = LearnableFourierFeatures( + 
num_features=seq_len, d_model=config.d_model + ) + elif invariance_type == "lprp": # Learnable Positional Random Perturbation + self.layer = LearnableRandomPositionalPerturbation( + num_features=seq_len, d_model=config.d_model + ) + elif invariance_type == "lrp": # Learnable Random Projection + self.layer = LearnableRandomProjection( + d_model=config.d_model, + projection_dim=getattr(config, "projection_dim", 64), + ) + + elif invariance_type == "conv": + self.layer = nn.Conv1d( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=config.d_conv, + padding=config.d_conv - 1, + bias=config.conv_bias, + groups=in_channels, + ) + else: + raise ValueError( + f"Unknown positional invariance type: {config.invariance_type}" + ) + + def forward(self, input): + # Pass the input through the selected layer + return self.layer(input) From 2bb90f56ddf79dd43b7aa98fb3fd7ffe18d5d7d7 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 18 Nov 2024 13:36:31 +0000 Subject: [PATCH 098/132] fix new mamba2 version --- mambular/arch_utils/mamba_utils/mamba_original.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_original.py b/mambular/arch_utils/mamba_utils/mamba_original.py index d4a49a8..5a51481 100644 --- a/mambular/arch_utils/mamba_utils/mamba_original.py +++ b/mambular/arch_utils/mamba_utils/mamba_original.py @@ -31,8 +31,6 @@ def __init__( d_conv=16, conv_bias=True, d_state=32, - dt_scale=1.0, - dt_init="random", dt_max=0.1, dt_min=1e-03, dt_init_floor=1e-04, @@ -67,9 +65,6 @@ def __init__( f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" ) - if dt_rank == "auto": - dt_rank = math.ceil(d_model / 16) - # Use the imported MambaBlock to create layers self.layers = ResidualBlock.MambaBlock( d_model=d_model, @@ -78,8 +73,6 @@ def __init__( expand=expand_factor, dt_min=dt_min, dt_max=dt_max, - dt_init=dt_init, - dt_scale=dt_scale, dt_init_floor=dt_init_floor, conv_bias=conv_bias, bias=bias, @@ -154,8 +147,6 @@ def __init__(self, config): expand_factor=getattr(config, "expand_factor", 2), dt_min=getattr(config, "dt_min", 1e-04), dt_max=getattr(config, "dt_max", 0.1), - dt_init=getattr(config, "dt_init", "random"), - dt_scale=getattr(config, "dt_scale", 1.0), dt_init_floor=getattr(config, "dt_init_floor", 1e-04), conv_bias=getattr(config, "conv_bias", False), bias=getattr(config, "bias", True), @@ -175,11 +166,8 @@ def __init__(self, config): d_conv=config.d_conv, norm=get_normalization_layer(config), expand_factor=config.expand_factor, - dt_rank=config.dt_rank, dt_min=config.dt_min, dt_max=config.dt_max, - dt_init=config.dt_init, - dt_scale=config.dt_scale, dt_init_floor=config.dt_init_floor, conv_bias=config.conv_bias, bias=config.bias, From 4d82e0e58c277e9e1d945d8fd65b49e19d063a79 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Fri, 22 Nov 2024 09:11:39 +0000 Subject: [PATCH 099/132] include new ensemble method for rnn --- mambular/base_models/batch_tabrnn.py | 29 ++++++++++++++-------------- mambular/base_models/tabm.py | 1 - 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/mambular/base_models/batch_tabrnn.py b/mambular/base_models/batch_tabrnn.py index 120d4a0..5c6ae27 100644 --- a/mambular/base_models/batch_tabrnn.py +++ b/mambular/base_models/batch_tabrnn.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -from ..arch_utils.mlp_utils import MLPhead +from ..arch_utils.layer_utils.sn_linear import SNLinear from ..configs.batchtabrnn_config import DefaultBatchTabRNNConfig from .basemodel import BaseModel 
from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer @@ -92,15 +92,13 @@ def __init__( temp_config = replace(config, d_model=config.dim_feedforward) self.norm_f = get_normalization_layer(temp_config) - if not self.hparams.average_ensembles: - self.tabular_head = SNLinear( - config.ensemble_size, config.dim_feedforward, num_classes - ) + if self.hparams.average_ensembles: + self.final_layer = nn.Linear(self.hparams.dim_feedforward, num_classes) else: - self.tabular_head = MLPhead( - input_dim=self.hparams.dim_feedforward, - config=config, - output_dim=num_classes, + self.final_layer = SNLinear( + self.hparams.ensemble_size, + self.hparams.dim_feedforward, + num_classes, ) n_inputs = len(num_feature_info) + len(cat_feature_info) @@ -115,14 +113,15 @@ def forward(self, num_features, cat_features): ) # Shape: (batch_size, sequence_length, ensemble_size, hidden_size) out = self.pool_sequence(out) # Shape: (batch_size, ensemble_size, hidden_size) + if self.hparams.average_ensembles: - # Simple average over ensembles - out = out.mean(dim=1) # Shape: (batch_size, hidden_size) - # Final prediction head + x = out.mean(axis=1) # Shape (batch_size, num_classes) - preds = self.tabular_head(out) # + x = self.final_layer( + out + ) # Shape (batch_size, (ensemble_size), num_classes) if not averaged if not self.hparams.average_ensembles: - preds = preds.squeeze(-1) + x = x.squeeze(-1) - return preds + return x diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py index e3a624c..ead84ca 100644 --- a/mambular/base_models/tabm.py +++ b/mambular/base_models/tabm.py @@ -178,7 +178,6 @@ def forward(self, num_features, cat_features) -> torch.Tensor: # Final ensemble output from the last ConfigurableBatchEnsembleLayer x = self.layers[-1](x) # Shape (batch_size, ensemble_size, num_classes) - # Option 1: Averaging across ensemble outputs if self.hparams.average_ensembles: x = x.mean(axis=1) # Shape (batch_size, num_classes) From a791bea24b532266c0141226e25809980f32c386 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Fri, 22 Nov 2024 09:17:48 +0000 Subject: [PATCH 100/132] adapt readme to new models --- README.md | 61 ++----------------------------------------------------- 1 file changed, 2 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 4ba6274..5d5e1b0 100644 --- a/README.md +++ b/README.md @@ -58,13 +58,12 @@ Mambular is a Python package that brings the power of advanced deep learning arc | `Mambular` | A sequential model using Mamba blocks [Gu and Dao](https://arxiv.org/pdf/2312.00752) specifically designed for various tabular data tasks. | | `TabM` | Batch Ensembling for a MLP as introduced by [Gorishniy et al.](https://arxiv.org/abs/2410.24210) | | `NODE` | Neural Oblivious Decision Ensembles as introduced by [Popov et al.](https://arxiv.org/abs/1909.06312) | -| `BatchTabRNN` | A sequential model using RNN and batch ensembling. [TBD]() | | `FTTransformer` | A model leveraging transformer encoders, as introduced by [Gorishniy et al.](https://arxiv.org/abs/2106.11959), for tabular data. | | `MLP` | A classical Multi-Layer Perceptron (MLP) model for handling tabular data tasks. | | `ResNet` | An adaptation of the ResNet architecture for tabular data applications. | | `TabTransformer` | A transformer-based model for tabular data introduced by [Huang et al.](https://arxiv.org/abs/2012.06678), enhancing feature learning capabilities. 
| | `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867) . Not a sequential model. | -| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks. Paper Link will follow | +| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks. Paper Link will follow. | | `MambAttention` | A combination between Mamba and Transformers, similar to Jamba by [Lieber et al.](https://arxiv.org/abs/2403.19887). Not yet included in the benchmarks | @@ -73,62 +72,6 @@ All models are available for `regression`, `classification` and distributional r Hence, they are available as e.g. `MambularRegressor`, `MambularClassifier` or `MambularLSS` -# 🏆 Results -Detailed results for the available methods can be found [here](https://arxiv.org/abs/2408.06291). -Note, that these are achieved results with default hyperparameter and for our splits. Performing hyperparameter optimization could improve the performance of all models. - -The average rank table over all models and all datasets is given here: - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ModelAvg. Rank
Mambular2.083 ±1.037
FT-Transformer2.417 ±1.256
XGBoost3.167 ±2.577
MambaTab*4.333 ±1.374
ResNet4.750 ±1.639
TabTransformer6.222 ±1.618
MLP6.500 ±1.500
MambaTab6.583 ±1.801
MambaTabT7.917 ±1.187
- -
- - - - # 📚 Documentation You can find the Mamba-Tabular API documentation [here](https://mambular.readthedocs.io/en/latest/). @@ -164,7 +107,7 @@ Mambular simplifies data preprocessing with a range of tools designed for easy t - **Ordinal & One-Hot Encoding**: Automatically transforms categorical data into numerical formats. - **Binning**: Discretizes numerical features; can use decision trees for optimal binning. - **Normalization & Standardization**: Scales numerical data appropriately. -- **Periodic Linear Encoding (PLE)**: Encodes periodicity in numerical data. +- **Piecewise Linear Encodings (PLE)**: Encodes periodicity in numerical data. - **Quantile & Spline Transformations**: Applies advanced transformations to handle nonlinearity and distributional shifts. - **Polynomial Features**: Generates polynomial and interaction terms to capture complex relationships. From fe632abef57cc561c991c90eead560621ca68bf2 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Fri, 22 Nov 2024 16:18:15 +0000 Subject: [PATCH 101/132] adjust default parameters in configs for hpo --- .../layer_utils/normalization_layers.py | 14 +++++++------- mambular/base_models/mlp.py | 6 ------ mambular/base_models/resnet.py | 7 +------ mambular/configs/batchtabrnn_config.py | 15 --------------- mambular/configs/mlp_config.py | 3 --- mambular/configs/resnet_config.py | 2 +- 6 files changed, 9 insertions(+), 38 deletions(-) diff --git a/mambular/arch_utils/layer_utils/normalization_layers.py b/mambular/arch_utils/layer_utils/normalization_layers.py index 5237177..7d7bfcd 100644 --- a/mambular/arch_utils/layer_utils/normalization_layers.py +++ b/mambular/arch_utils/layer_utils/normalization_layers.py @@ -61,6 +61,7 @@ class BatchNorm(nn.Module): def __init__(self, d_model: int, eps: float = 1e-5, momentum: float = 0.1): super().__init__() + self.d_model = d_model self.eps = eps self.momentum = momentum self.register_buffer("running_mean", torch.zeros(d_model)) @@ -71,13 +72,12 @@ def __init__(self, d_model: int, eps: float = 1e-5, momentum: float = 0.1): def forward(self, x): if self.training: mean = x.mean(dim=0) - var = x.var(dim=0) - self.running_mean = ( - 1 - self.momentum - ) * self.running_mean + self.momentum * mean - self.running_var = ( - 1 - self.momentum - ) * self.running_var + self.momentum * var + var = x.var( + dim=0, unbiased=False + ) # Use unbiased=False for consistency with BatchNorm + # Update running stats in-place + self.running_mean.mul_(1 - self.momentum).add_(self.momentum * mean) + self.running_var.mul_(1 - self.momentum).add_(self.momentum * var) else: mean = self.running_mean var = self.running_var diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index 08005cc..8ac0970 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -2,7 +2,6 @@ import torch.nn as nn from ..configs.mlp_config import DefaultMLPConfig from .basemodel import BaseModel -from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer @@ -96,11 +95,6 @@ def __init__( if self.hparams.batch_norm: self.layers.append(nn.BatchNorm1d(self.hparams.layer_sizes[0])) - self.norm_f = get_normalization_layer(config) - - if self.norm_f is not None: - self.layers.append(self.norm_f(self.hparams.layer_sizes[0])) - if self.hparams.use_glu: self.layers.append(nn.GLU()) else: diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index 549ca41..14851f6 100644 --- a/mambular/base_models/resnet.py +++ 
b/mambular/base_models/resnet.py @@ -3,7 +3,6 @@ from typing import Any from ..configs.resnet_config import DefaultResNetConfig from .basemodel import BaseModel -from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.resnet_utils import ResidualBlock from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer @@ -47,8 +46,6 @@ class ResNet(BaseModel): List of residual blocks to process the hidden representations. output_layer : nn.Linear Output layer that produces the final prediction. - norm_f : nn.Module, optional - Normalization layer applied in each residual block, if specified in the configuration. Methods ------- @@ -73,8 +70,6 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - self.norm_f = get_normalization_layer(config) - if self.hparams.use_embeddings: input_dim = ( len(num_feature_info) * self.hparams.d_model @@ -108,7 +103,7 @@ def __init__( input_dim, output_dim, self.hparams.activation, - self.norm_f, + self.hparams.norm, self.hparams.dropout, ) self.blocks.append(block) diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/batchtabrnn_config.py index 40880dc..55a50e3 100644 --- a/mambular/configs/batchtabrnn_config.py +++ b/mambular/configs/batchtabrnn_config.py @@ -30,16 +30,6 @@ class DefaultBatchTabRNNConfig: Activation function for the transformer. embedding_activation : callable, default=nn.Identity() Activation function for numerical embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. - head_dropout : float, default=0.5 - Dropout rate for the head layers. - head_skip_layers : bool, default=False - Whether to skip layers in the head. - head_activation : callable, default=nn.SELU() - Activation function for the head layers. - head_use_batch_norm : bool, default=False - Whether to use batch normalization in the head layers. layer_norm_after_embedding : bool, default=False Whether to apply layer normalization after embedding. pooling_method : str, default="cls" @@ -67,11 +57,6 @@ class DefaultBatchTabRNNConfig: activation: callable = nn.SELU() embedding_activation: callable = nn.Identity() embedding_dropout: float = None - head_layer_sizes: list = () - head_dropout: float = 0.5 - head_skip_layers: bool = False - head_activation: callable = nn.SELU() - head_use_batch_norm: bool = False layer_norm_after_embedding: bool = False pooling_method: str = "avg" norm_first: bool = False diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index 599d225..4bddcd5 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -25,8 +25,6 @@ class DefaultMLPConfig: Whether to skip layers in the MLP. dropout : float, default=0.5 Dropout rate for regularization. - norm : str, default=None - Normalization method to be used, if any. use_glu : bool, default=False Whether to use Gated Linear Units (GLU) in the MLP. 
skip_connections : bool, default=False @@ -53,7 +51,6 @@ class DefaultMLPConfig: activation: callable = nn.SELU() skip_layers: bool = False dropout: float = 0.5 - norm: str = None use_glu: bool = False skip_connections: bool = False batch_norm: bool = False diff --git a/mambular/configs/resnet_config.py b/mambular/configs/resnet_config.py index 1103169..f73c070 100644 --- a/mambular/configs/resnet_config.py +++ b/mambular/configs/resnet_config.py @@ -55,7 +55,7 @@ class DefaultResNetConfig: activation: callable = nn.SELU() skip_layers: bool = False dropout: float = 0.5 - norm: str = None + norm: bool = False use_glu: bool = False skip_connections: bool = True batch_norm: bool = True From c662fca435edc8ec8a753d2de65c5556db070057 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 28 Nov 2024 12:47:52 +0000 Subject: [PATCH 102/132] adjust categorical embedding --- .../arch_utils/layer_utils/embedding_layer.py | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 042ca4e..72d5ec6 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -35,7 +35,8 @@ def __init__(self, num_feature_info, cat_feature_info, config): if getattr(config, "embedding_dropout", None) is not None else None ) - self.embedding_type = getattr(config, "embedding_type", "standard") + self.embedding_type = getattr(config, "embedding_type", "linear") + self.embedding_bias = getattr(config, "embedding_bias", False) # Sequence length self.seq_len = len(num_feature_info) + len(cat_feature_info) @@ -57,11 +58,11 @@ def __init__(self, num_feature_info, cat_feature_info, config): activation=self.embedding_activation, lite=getattr(config, "plr_lite", False), ) - elif self.embedding_type == "standard": + elif self.embedding_type == "linear": self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.d_model, bias=False), + nn.Linear(input_shape, self.d_model, bias=self.embedding_bias), self.embedding_activation, ) for feature_name, input_shape in num_feature_info.items() @@ -69,27 +70,42 @@ def __init__(self, num_feature_info, cat_feature_info, config): ) else: raise ValueError( - "Invalid embedding_type. Choose from 'standard', 'ndt', or 'plr'." + "Invalid embedding_type. Choose from 'linear', 'ndt', or 'plr'." 
) - # Initialize categorical embeddings - self.cat_embeddings = nn.ModuleList() - for feature_name, num_categories in cat_feature_info.items(): - if self.cat_encoding == "int": - self.cat_embeddings.append( + if self.cat_encoding == "int": + self.cat_embeddings = nn.ModuleList( + [ nn.Sequential( nn.Embedding(num_categories + 1, self.d_model), self.embedding_activation, ) - ) - elif self.cat_encoding == "one-hot": - self.cat_embeddings.append( + for feature_name, num_categories in cat_feature_info.items() + ] + ) + elif self.cat_encoding == "one-hot": + self.cat_embeddings = nn.ModuleList( + [ nn.Sequential( OneHotEncoding(num_categories), - nn.Linear(num_categories, self.d_model, bias=False), + nn.Linear( + num_categories, self.d_model, bias=self.embedding_bias + ), self.embedding_activation, ) - ) + for feature_name, num_categories in cat_feature_info.items() + ] + ) + elif self.cat_encoding == "linear": + self.cat_embeddings = nn.ModuleList( + [ + nn.Sequential( + nn.Linear(input_shape, self.d_model, bias=self.embedding_bias), + self.embedding_activation, + ) + for feature_name, input_shape in cat_feature_info.items() + ] + ) # Class token if required if self.use_cls: @@ -156,7 +172,7 @@ def forward(self, num_features=None, cat_features=None): else: num_embeddings = None else: - # For standard and ndt embeddings, handle each feature individually + # For linear and ndt embeddings, handle each feature individually if self.num_embeddings and num_features is not None: num_embeddings = [ emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) From f0f7c9b9283402a43cc96747be26c6733f786b3a Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 28 Nov 2024 12:48:10 +0000 Subject: [PATCH 103/132] make norm in resnet to layernorm for hpo --- mambular/arch_utils/resnet_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mambular/arch_utils/resnet_utils.py b/mambular/arch_utils/resnet_utils.py index cf1463a..6e6d40c 100644 --- a/mambular/arch_utils/resnet_utils.py +++ b/mambular/arch_utils/resnet_utils.py @@ -2,7 +2,7 @@ class ResidualBlock(nn.Module): - def __init__(self, input_dim, output_dim, activation, norm_layer=None, dropout=0.0): + def __init__(self, input_dim, output_dim, activation, norm=False, dropout=0.0): """ Residual Block used in ResNet. 
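# Standalone sketch of the three cat_encoding branches ('int', 'one-hot',
# 'linear') handled by the EmbeddingLayer changes above. Dimensions are made
# up, and torch.nn.functional.one_hot stands in for the library's
# OneHotEncoding helper.
import torch
import torch.nn as nn
import torch.nn.functional as F

num_categories, d_model, batch_size = 5, 16, 4
cats = torch.randint(0, num_categories, (batch_size, 1))

# "int": index lookup with one extra slot reserved for unknown categories.
out_int = nn.Embedding(num_categories + 1, d_model)(cats.squeeze(-1))

# "one-hot": one-hot encode, then project with a (by default bias-free) linear layer.
one_hot = F.one_hot(cats.squeeze(-1), num_categories).float()
out_oh = nn.Linear(num_categories, d_model, bias=False)(one_hot)

# "linear": treat the already numeric/encoded column as a 1-d input feature.
out_lin = nn.Linear(1, d_model, bias=False)(cats.float())

print(out_int.shape, out_oh.shape, out_lin.shape)  # torch.Size([4, 16]) each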
@@ -23,8 +23,8 @@ def __init__(self, input_dim, output_dim, activation, norm_layer=None, dropout=0 self.linear1 = nn.Linear(input_dim, output_dim) self.linear2 = nn.Linear(output_dim, output_dim) self.activation = activation - self.norm1 = norm_layer(output_dim) if norm_layer else None - self.norm2 = norm_layer(output_dim) if norm_layer else None + self.norm1 = nn.LayerNorm(output_dim) if norm else None + self.norm2 = nn.LayerNorm(output_dim) if norm else None self.dropout = nn.Dropout(dropout) if dropout > 0.0 else None def forward(self, x): From 8a8e9bec82b7f8e46000b2dfe7f3d0e1ca3c4e00 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 28 Nov 2024 12:48:33 +0000 Subject: [PATCH 104/132] add ensemble configs --- mambular/configs/batchtabrnn_config.py | 4 ++-- mambular/configs/tabm_config.py | 21 ++++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/batchtabrnn_config.py index 55a50e3..54b43cf 100644 --- a/mambular/configs/batchtabrnn_config.py +++ b/mambular/configs/batchtabrnn_config.py @@ -52,7 +52,7 @@ class DefaultBatchTabRNNConfig: lr_factor: float = 0.1 d_model: int = 128 n_layers: int = 4 - rnn_dropout: float = 0.5 + rnn_dropout: float = 0.3 norm: str = "RMSNorm" activation: callable = nn.SELU() embedding_activation: callable = nn.Identity() @@ -64,7 +64,7 @@ class DefaultBatchTabRNNConfig: rnn_activation: callable = nn.ReLU() layer_norm_eps: float = 1e-05 dim_feedforward: int = 256 - numerical_embedding: str = "ple" + embedding_type: float = "standard" cat_encoding: str = "int" d_conv: int = 4 conv_bias: bool = True diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index dac8398..2e19de1 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -18,9 +18,9 @@ class DefaultTabMConfig: Weight decay (L2 penalty) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. - layer_sizes : list, default=(256, 128, 32) + layer_sizes : list, default=(512, 512, 128) Sizes of the layers in the model. - activation : callable, default=nn.SELU() + activation : callable, default=nn.ReLU() Activation function for the model layers. skip_layers : bool, default=False Whether to skip layers in the model. @@ -30,8 +30,6 @@ class DefaultTabMConfig: Normalization method to be used, if any. use_glu : bool, default=False Whether to use Gated Linear Units (GLU) in the model. - skip_connections : bool, default=False - Whether to use skip connections in the model. batch_norm : bool, default=False Whether to use batch normalization in the model layers. layer_norm : bool, default=False @@ -58,17 +56,18 @@ class DefaultTabMConfig: Whether to average the outputs of the ensembles. 
""" + # lr params lr: float = 1e-04 lr_patience: int = 10 weight_decay: float = 1e-06 lr_factor: float = 0.1 - layer_sizes: list = (256, 256, 32) - activation: callable = nn.SELU() - skip_layers: bool = False - dropout: float = 0.5 + + # arch params + layer_sizes: list = (512, 512, 128) + activation: callable = nn.ReLU() + dropout: float = 0.3 norm: str = None use_glu: bool = False - skip_connections: bool = False batch_norm: bool = False layer_norm: bool = False layer_norm_eps: float = 1e-05 @@ -77,8 +76,8 @@ class DefaultTabMConfig: use_embeddings: bool = True embedding_type: float = "plr" plr_lite: bool = False - average_embeddings: bool = True - embedding_activation: callable = nn.Identity() + average_embeddings: bool = False + embedding_activation: callable = nn.ReLU() layer_norm_after_embedding: bool = False d_model: int = 64 From 9722342af6a0fd0a948904b0dcf3d8f8a452ec27 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 28 Nov 2024 15:38:49 +0000 Subject: [PATCH 105/132] adjust configs for better readibility --- mambular/configs/batchtabrnn_config.py | 74 +++++++++++++------ mambular/configs/fttransformer_config.py | 60 +++++++++------- mambular/configs/mambatab_config.py | 48 +++++++------ mambular/configs/mambular_config.py | 91 ++++++++++++++---------- mambular/configs/mlp_config.py | 28 ++++++-- mambular/configs/node_config.py | 79 +++++++++++--------- mambular/configs/resnet_config.py | 43 +++++++---- mambular/configs/tabm_config.py | 50 +++++++++---- mambular/configs/tabularnn_config.py | 4 +- 9 files changed, 304 insertions(+), 173 deletions(-) diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/batchtabrnn_config.py index 54b43cf..3b41218 100644 --- a/mambular/configs/batchtabrnn_config.py +++ b/mambular/configs/batchtabrnn_config.py @@ -6,44 +6,75 @@ @dataclass class DefaultBatchTabRNNConfig: """ - Configuration class for the default TabulaRNN model with predefined hyperparameters. + Configuration class for the TabulaRNN model with predefined hyperparameters. - Parameters + Attributes ---------- lr : float, default=1e-04 Learning rate for the optimizer. - model_type : str, default="RNN" - type of model, one of "RNN", "LSTM", "GRU", "mLSTM", "sLSTM" lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Number of epochs with no improvement after which the learning rate will be reduced. + weight_decay : float, default=1e-05 + Weight decay (L2 regularization) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. - d_model : int, default=64 + d_model : int, default=128 Dimensionality of the model. - n_layers : int, default=8 - Number of layers in the transformer. + n_layers : int, default=4 + Number of RNN layers in the model. + rnn_dropout : float, default=0.3 + Dropout rate for RNN layers. norm : str, default="RMSNorm" - Normalization method to be used. + Type of normalization to be used ('RMSNorm', 'LayerNorm', etc.). activation : callable, default=nn.SELU() - Activation function for the transformer. + Activation function for the RNN model. embedding_activation : callable, default=nn.Identity() Activation function for numerical embeddings. + embedding_dropout : float, optional + Dropout rate applied to embeddings. If None, no dropout is applied. layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. 
- pooling_method : str, default="cls" + Whether to apply layer normalization after the embedding layer. + pooling_method : str, default="avg" Pooling method to be used ('cls', 'avg', etc.). norm_first : bool, default=False - Whether to apply normalization before other operations in each transformer block. + Whether to apply normalization before other operations in each RNN block. bias : bool, default=True Whether to use bias in the linear layers. - rnn_activation : callable, default=nn.SELU() - Activation function for the transformer layers. - bidirectional : bool, default=False. - Whether to process data bidirectionally + rnn_activation : callable, default=nn.ReLU() + Activation function for RNN layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization to improve numerical stability. + dim_feedforward : int, default=256 + Dimensionality of the feed-forward layers. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr'.). + embedding_bias : bool, default=False + Whether to have a bias in the embedding layer cat_encoding : str, default="int" - Encoding method for categorical features. + Encoding method for categorical features ('int', 'one-hot', 'linear'). + d_conv : int, default=4 + Dimensionality of convolutional layers, if used. + conv_bias : bool, default=True + Whether to use bias in convolutional layers. + residuals : bool, default=False + Whether to include residual connections. + + Batch Ensembling Specific Attributes + ------------------------------------ + ensemble_size : int, default=32 + Number of ensemble members in batch ensembling. + ensemble_scaling_in : bool, default=True + Whether to apply scaling to input features for each ensemble member. + ensemble_scaling_out : bool, default=True + Whether to apply scaling to outputs for each ensemble member. + ensemble_bias : bool, default=True + Whether to include bias for ensemble-specific scaling. + scaling_init : {"ones", "random-signs", "normal"}, default="ones" + Initialization method for ensemble scaling factors. + average_ensembles : bool, default=False + Whether to average predictions across ensemble members. + model_type : {"mini", "full"}, default="mini" + Model type to use ('mini' for reduced version, 'full' for complete model). """ lr: float = 1e-04 @@ -64,7 +95,8 @@ class DefaultBatchTabRNNConfig: rnn_activation: callable = nn.ReLU() layer_norm_eps: float = 1e-05 dim_feedforward: int = 256 - embedding_type: float = "standard" + embedding_type: float = "linear" + embedding_bias: bool = False cat_encoding: str = "int" d_conv: int = 4 conv_bias: bool = True diff --git a/mambular/configs/fttransformer_config.py b/mambular/configs/fttransformer_config.py index a433753..19f6641 100644 --- a/mambular/configs/fttransformer_config.py +++ b/mambular/configs/fttransformer_config.py @@ -6,60 +6,64 @@ @dataclass class DefaultFTTransformerConfig: """ - Configuration class for the default FT Transformer model with predefined hyperparameters. + Configuration class for the FT Transformer model with predefined hyperparameters. - Parameters + Attributes ---------- lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Weight decay (L2 regularization) for the optimizer. 
lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. - d_model : int, default=64 - Dimensionality of the model. - n_layers : int, default=8 - Number of layers in the transformer. - n_heads : int, default=4 + d_model : int, default=128 + Dimensionality of the transformer model. + n_layers : int, default=4 + Number of transformer layers. + n_heads : int, default=8 Number of attention heads in the transformer. - attn_dropout : float, default=0.3 + attn_dropout : float, default=0.2 Dropout rate for the attention mechanism. - ff_dropout : float, default=0.3 + ff_dropout : float, default=0.1 Dropout rate for the feed-forward layers. - norm : str, default="RMSNorm" - Normalization method to be used. + norm : str, default="LayerNorm" + Type of normalization to be used ('LayerNorm', 'RMSNorm', etc.). activation : callable, default=nn.SELU() - Activation function for the transformer. + Activation function for the transformer layers. embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + Activation function for embeddings. + embedding_bias : bool, default=False + Whether to use bias in embedding layers. + head_layer_sizes : list, default=() + Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False - Whether to skip layers in the head. + Whether to use skip connections in the head layers. head_activation : callable, default=nn.SELU() Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - pooling_method : str, default="cls" + Whether to apply layer normalization after embedding layers. + pooling_method : str, default="avg" Pooling method to be used ('cls', 'avg', etc.). + use_cls : bool, default=False + Whether to use a CLS token for pooling. norm_first : bool, default=False Whether to apply normalization before other operations in each transformer block. bias : bool, default=True - Whether to use bias in the linear layers. - transformer_activation : callable, default=nn.SELU() - Activation function for the transformer layers. + Whether to use bias in linear layers. + transformer_activation : callable, default=ReGLU() + Activation function for the transformer feed-forward layers. layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - transformer_dim_feedforward : int, default=512 + Epsilon value for layer normalization to improve numerical stability. + transformer_dim_feedforward : int, default=256 Dimensionality of the feed-forward layers in the transformer. cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Method for encoding categorical features ('int', 'one-hot', or 'linear'). 
""" lr: float = 1e-04 @@ -74,13 +78,15 @@ class DefaultFTTransformerConfig: norm: str = "LayerNorm" activation: callable = nn.SELU() embedding_activation: callable = nn.Identity() + embedding_bias: bool = False head_layer_sizes: list = () head_dropout: float = 0.5 head_skip_layers: bool = False head_activation: callable = nn.SELU() head_use_batch_norm: bool = False layer_norm_after_embedding: bool = False - pooling_method: str = "cls" + pooling_method: str = "avg" + use_cls: bool = False norm_first: bool = False bias: bool = True transformer_activation: callable = ReGLU() diff --git a/mambular/configs/mambatab_config.py b/mambular/configs/mambatab_config.py index a8bd506..dd4d7be 100644 --- a/mambular/configs/mambatab_config.py +++ b/mambular/configs/mambatab_config.py @@ -7,19 +7,19 @@ class DefaultMambaTabConfig: """ Configuration class for the Default Mambular model with predefined hyperparameters. - Parameters + Attributes ---------- lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Weight decay (L2 regularization) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. d_model : int, default=64 Dimensionality of the model. - n_layers : int, default=8 + n_layers : int, default=1 Number of layers in the model. expand_factor : int, default=2 Expansion factor for the feed-forward layers. @@ -32,41 +32,47 @@ class DefaultMambaTabConfig: dropout : float, default=0.05 Dropout rate for regularization. dt_rank : str, default="auto" - Rank of the decision tree. - d_state : int, default=32 + Rank of the decision tree used in the model. + d_state : int, default=128 Dimensionality of the state in recurrent layers. dt_scale : float, default=1.0 - Scaling factor for decision tree. + Scaling factor for the decision tree. dt_init : str, default="random" - Initialization method for decision tree. + Initialization method for the decision tree. dt_max : float, default=0.1 Maximum value for decision tree initialization. dt_min : float, default=1e-04 Minimum value for decision tree initialization. dt_init_floor : float, default=1e-04 Floor value for decision tree initialization. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() + activation : callable, default=nn.ReLU() Activation function for the model. - num_embedding_activation : callable, default=nn.Identity() + num_embedding_activation : callable, default=nn.ReLU() Activation function for numerical embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. - head_dropout : float, default=0.5 + embedding_type : str, default="linear" + Type of embedding to use ('linear', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + head_layer_sizes : list, default=() + Sizes of the fully connected layers in the model's head. + head_dropout : float, default=0.0 Dropout rate for the head layers. head_skip_layers : bool, default=False Whether to skip layers in the head. - head_activation : callable, default=nn.SELU() + head_activation : callable, default=nn.ReLU() Activation function for the head layers. 
head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. + norm : str, default="LayerNorm" + Type of normalization to be used ('LayerNorm', 'RMSNorm', etc.). + axis : int, default=1 + Axis along which operations are applied, if applicable. use_pscan : bool, default=False - whether to use pscan for the ssm + Whether to use PSCAN for the state-space model. mamba_version : str, default="mamba-torch" - options are "mamba-torch", "mamba1" and "mamba2" + Version of the Mamba model to use ('mamba-torch', 'mamba1', 'mamba2'). + bidirectional : bool, default=False + Whether to process data bidirectionally. """ lr: float = 1e-04 @@ -89,6 +95,8 @@ class DefaultMambaTabConfig: dt_init_floor: float = 1e-04 activation: callable = nn.ReLU() num_embedding_activation: callable = nn.ReLU() + embedding_type: str = "linear" + embedding_bias: bool = False head_layer_sizes: list = () head_dropout: float = 0.0 head_skip_layers: bool = False diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index 725a9a1..b38a605 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -7,52 +7,73 @@ class DefaultMambularConfig: """ Configuration class for the Default Mambular model with predefined hyperparameters. - Parameters - ---------- + Optimizer Parameters + -------------------- lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 Weight decay (L2 penalty) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. + + Mamba Model Parameters + ----------------------- d_model : int, default=64 Dimensionality of the model. - n_layers : int, default=8 + n_layers : int, default=4 Number of layers in the model. expand_factor : int, default=2 Expansion factor for the feed-forward layers. bias : bool, default=False Whether to use bias in the linear layers. - d_conv : int, default=16 - Dimensionality of the convolutional layers. - conv_bias : bool, default=True - Whether to use bias in the convolutional layers. - dropout : float, default=0.05 + dropout : float, default=0.0 Dropout rate for regularization. dt_rank : str, default="auto" - Rank of the decision tree. - d_state : int, default=32 + Rank of the decision tree used in the model. + d_state : int, default=128 Dimensionality of the state in recurrent layers. dt_scale : float, default=1.0 - Scaling factor for decision tree. + Scaling factor for decision tree parameters. dt_init : str, default="random" - Initialization method for decision tree. + Initialization method for decision tree parameters. dt_max : float, default=0.1 Maximum value for decision tree initialization. dt_min : float, default=1e-04 Minimum value for decision tree initialization. dt_init_floor : float, default=1e-04 Floor value for decision tree initialization. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() + norm : str, default="LayerNorm" + Type of normalization used ('LayerNorm', 'RMSNorm', etc.). + activation : callable, default=nn.SiLU() Activation function for the model. 
+ layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + Whether weight decay is applied to A-D matrices. + BC_layer_norm : bool, default=False + Whether to apply layer normalization to B-C matrices. + + Embedding Parameters + --------------------- embedding_activation : callable, default=nn.Identity() Activation function for embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + embedding_type : str, default="linear" + Type of embedding to use ('linear', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + shuffle_embeddings : bool, default=False + Whether to shuffle embeddings before being passed to Mamba layers. + cat_encoding : str, default="int" + Encoding method for categorical features ('int', 'one-hot', etc.). + + Head Parameters + --------------- + head_layer_sizes : list, default=() + Sizes of the layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False @@ -61,30 +82,24 @@ class DefaultMambularConfig: Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. + + Additional Features + -------------------- pooling_method : str, default="avg" - Pooling method to be used ('avg', 'max', etc.). + Pooling method to use ('avg', 'max', etc.). bidirectional : bool, default=False - Whether to use bidirectional processing of the input sequences. + Whether to process data bidirectionally. use_learnable_interaction : bool, default=False - Whether to use learnable feature interactions before passing through mamba blocks. - use_cls : bool, default=True - Whether to append a cls to the end of each 'sequence'. - shuffle_embeddings : bool, default=False. - Whether to shuffle the embeddings before being passed to the Mamba layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - AD_weight_decay : bool, default=True - whether weight decay is also applied to A-D matrices. - BC_layer_norm: bool, default=False - whether to apply layer normalization to B-C matrices. - cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Whether to use learnable feature interactions before passing through Mamba blocks. + use_cls : bool, default=False + Whether to append a CLS token to the input sequences. use_pscan : bool, default=False - whether to use pscan for the ssm + Whether to use PSCAN for the state-space model. + + Mamba Version + ------------- mamba_version : str, default="mamba-torch" - options are "mamba-torch", "mamba1" and "mamba2" + Version of the Mamba model to use ('mamba-torch', 'mamba1', 'mamba2'). 
""" lr: float = 1e-04 @@ -108,6 +123,8 @@ class DefaultMambularConfig: norm: str = "LayerNorm" activation: callable = nn.SiLU() embedding_activation: callable = nn.Identity() + embedding_type: str = "linear" + embedding_bias: bool = False head_layer_sizes: list = () head_dropout: float = 0.5 head_skip_layers: bool = False diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index 4bddcd5..ee83c00 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -7,17 +7,20 @@ class DefaultMLPConfig: """ Configuration class for the default Multi-Layer Perceptron (MLP) model with predefined hyperparameters. - Parameters - ---------- + Optimizer Parameters + -------------------- lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Weight decay (L2 regularization) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. - layer_sizes : list, default=(128, 128, 32) + + MLP Architecture Parameters + --------------------------- + layer_sizes : list, default=(256, 128, 32) Sizes of the layers in the MLP. activation : callable, default=nn.SELU() Activation function for the MLP layers. @@ -33,14 +36,25 @@ class DefaultMLPConfig: Whether to use batch normalization in the MLP layers. layer_norm : bool, default=False Whether to use layer normalization in the MLP layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + + Embedding Parameters + --------------------- use_embeddings : bool, default=False Whether to use embedding layers for all features. embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. + Activation function for embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. layer_norm_after_embedding : bool, default=False Whether to apply layer normalization after embedding. d_model : int, default=32 Dimensionality of the embeddings. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). """ lr: float = 1e-04 @@ -58,6 +72,8 @@ class DefaultMLPConfig: layer_norm_eps: float = 1e-05 use_embeddings: bool = False embedding_activation: callable = nn.Identity() + embedding_type: str = "linear" + embedding_bias: bool = False layer_norm_after_embedding: bool = False d_model: int = 32 embedding_type: float = "plr" diff --git a/mambular/configs/node_config.py b/mambular/configs/node_config.py index d574f25..b51645c 100644 --- a/mambular/configs/node_config.py +++ b/mambular/configs/node_config.py @@ -5,40 +5,51 @@ @dataclass class DefaultNODEConfig: """ - Configuration class for the default Neural Oblivious Decision Ensemble (NODE) model. + Configuration class for the Neural Oblivious Decision Ensemble (NODE) model. - This class provides default hyperparameters for training and configuring a NODE model. + Optimizer Parameters + -------------------- + lr : float, default=1e-03 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs without improvement after which the learning rate will be reduced. 
+ weight_decay : float, default=1e-06 + Weight decay (L2 regularization penalty) applied by the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate is reduced when there is no improvement. - Attributes - ---------- - lr : float, optional - Learning rate for the optimizer. Default is 1e-4. - lr_patience : int, optional - Number of epochs without improvement after which the learning rate will be reduced. Default is 10. - weight_decay : float, optional - Weight decay (L2 regularization penalty) applied by the optimizer. Default is 1e-6. - lr_factor : float, optional - Factor by which the learning rate is reduced when there is no improvement. Default is 0.1. - norm : str, optional - Type of normalization to use. Default is None. - use_embeddings : bool, optional - Whether to use embedding layers for categorical features. Default is False. - embedding_activation : callable, optional - Activation function to apply to embeddings. Default is `nn.Identity`. - layer_norm_after_embedding : bool, optional - Whether to apply layer normalization after embedding layers. Default is False. - d_model : int, optional - Dimensionality of the embedding space. Default is 32. - num_layers : int, optional - Number of dense layers in the model. Default is 4. - layer_dim : int, optional - Dimensionality of each dense layer. Default is 128. - tree_dim : int, optional - Dimensionality of the output from each tree leaf. Default is 1. - depth : int, optional - Depth of each decision tree in the ensemble. Default is 6. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + Model Architecture Parameters + ----------------------------- + num_layers : int, default=4 + Number of dense layers in the model. + layer_dim : int, default=128 + Dimensionality of each dense layer. + tree_dim : int, default=1 + Dimensionality of the output from each tree leaf. + depth : int, default=6 + Depth of each decision tree in the ensemble. + norm : str, default=None + Type of normalization to use in the model. + + Embedding Parameters + --------------------- + use_embeddings : bool, default=False + Whether to use embedding layers for categorical features. + embedding_activation : callable, default=nn.Identity() + Activation function to apply to embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding layers. + d_model : int, default=32 + Dimensionality of the embedding space. + + Head Parameters + --------------- + head_layer_sizes : list, default=() + Sizes of the layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False @@ -49,13 +60,15 @@ class DefaultNODEConfig: Whether to use batch normalization in the head layers. 
""" - lr: float = 1e-04 + lr: float = 1e-03 lr_patience: int = 10 weight_decay: float = 1e-06 lr_factor: float = 0.1 norm: str = None use_embeddings: bool = False embedding_activation: callable = nn.Identity() + embedding_tpye: str = "linear" + embedding_bias: bool = False layer_norm_after_embedding: bool = False d_model: int = 32 num_layers: int = 4 diff --git a/mambular/configs/resnet_config.py b/mambular/configs/resnet_config.py index f73c070..de893b6 100644 --- a/mambular/configs/resnet_config.py +++ b/mambular/configs/resnet_config.py @@ -7,17 +7,20 @@ class DefaultResNetConfig: """ Configuration class for the default ResNet model with predefined hyperparameters. - Parameters - ---------- + Optimizer Parameters + -------------------- lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Weight decay (L2 regularization penalty) applied by the optimizer. lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - layer_sizes : list, default=(128, 128, 32) + Factor by which the learning rate is reduced when there is no improvement. + + ResNet Architecture Parameters + ------------------------------ + layer_sizes : list, default=(256, 128, 32) Sizes of the layers in the ResNet. activation : callable, default=nn.SELU() Activation function for the ResNet layers. @@ -25,8 +28,8 @@ class DefaultResNetConfig: Whether to skip layers in the ResNet. dropout : float, default=0.5 Dropout rate for regularization. - norm : str, default=None - Normalization method to be used, if any. + norm : bool, default=False + Whether to use normalization in the ResNet. use_glu : bool, default=False Whether to use Gated Linear Units (GLU) in the ResNet. skip_connections : bool, default=True @@ -35,15 +38,28 @@ class DefaultResNetConfig: Whether to use batch normalization in the ResNet layers. layer_norm : bool, default=False Whether to use layer normalization in the ResNet layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. num_blocks : int, default=3 Number of residual blocks in the ResNet. - use_embeddings : bool, default=False + + Embedding Parameters + --------------------- + use_embeddings : bool, default=True Whether to use embedding layers for all features. + embedding_type : str, default="linear" + Type of embedding to use ('linear', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). + average_embeddings : bool, default=True + Whether to average embeddings during the forward pass. embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. + Activation function for embeddings. layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - d_model : int, default=32 + Whether to apply layer normalization after embedding layers. + d_model : int, default=64 Dimensionality of the embeddings. 
""" @@ -65,7 +81,8 @@ class DefaultResNetConfig: # embedding params use_embeddings: bool = True - embedding_type: float = "plr" + embedding_type: float = "linear" + embedding_bias = False plr_lite: bool = False average_embeddings: bool = True embedding_activation: callable = nn.Identity() diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index 2e19de1..e39d26e 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -8,23 +8,24 @@ class DefaultTabMConfig: """ Configuration class for the TabM model with batch ensembling and predefined hyperparameters. - Parameters - ---------- + Optimizer Parameters + -------------------- lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 Weight decay (L2 penalty) for the optimizer. lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. + Factor by which the learning rate is reduced when there is no improvement. + + Architecture Parameters + ------------------------ layer_sizes : list, default=(512, 512, 128) Sizes of the layers in the model. activation : callable, default=nn.ReLU() Activation function for the model layers. - skip_layers : bool, default=False - Whether to skip layers in the model. - dropout : float, default=0.5 + dropout : float, default=0.3 Dropout rate for regularization. norm : str, default=None Normalization method to be used, if any. @@ -34,26 +35,44 @@ class DefaultTabMConfig: Whether to use batch normalization in the model layers. layer_norm : bool, default=False Whether to use layer normalization in the model layers. - use_embeddings : bool, default=False + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + + Embedding Parameters + --------------------- + use_embeddings : bool, default=True Whether to use embedding layers for all features. - embedding_activation : callable, default=nn.Identity() + embedding_type : str, default="plr" + Type of embedding to use ('plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). + average_embeddings : bool, default=False + Whether to average embeddings during the forward pass. + embedding_activation : callable, default=nn.ReLU() Activation function for embeddings. layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - d_model : int, default=32 + Whether to apply layer normalization after embedding layers. + d_model : int, default=64 Dimensionality of the embeddings. - ensemble_size : int, default=4 + + Batch Ensembling Parameters + ---------------------------- + ensemble_size : int, default=32 Number of ensemble members for batch ensembling. ensemble_scaling_in : bool, default=True Whether to use input scaling for each ensemble member. ensemble_scaling_out : bool, default=True Whether to use output scaling for each ensemble member. - ensemble_bias : bool, default=False + ensemble_bias : bool, default=True Whether to use a unique bias term for each ensemble member. - scaling_init : Literal['ones', 'random-signs'], default='ones' + scaling_init : {"ones", "random-signs", "normal"}, default="normal" Initialization method for scaling weights. 
- average_ensembles : bool, default=True + average_ensembles : bool, default=False Whether to average the outputs of the ensembles. + model_type : {"mini", "full"}, default="mini" + Model type to use ('mini' for reduced version, 'full' for complete model). """ # lr params @@ -75,6 +94,7 @@ class DefaultTabMConfig: # embedding params use_embeddings: bool = True embedding_type: float = "plr" + embedding_bias = False plr_lite: bool = False average_embeddings: bool = False embedding_activation: callable = nn.ReLU() diff --git a/mambular/configs/tabularnn_config.py b/mambular/configs/tabularnn_config.py index eb15d55..8aa3be5 100644 --- a/mambular/configs/tabularnn_config.py +++ b/mambular/configs/tabularnn_config.py @@ -65,7 +65,9 @@ class DefaultTabulaRNNConfig: rnn_dropout: float = 0.2 norm: str = "RMSNorm" activation: callable = nn.SELU() - embedding_activation: callable = nn.Identity() + embedding_activation: callable = nn.ReLU() + embedding_type: str = "linear" + embedding_bias: bool = False head_layer_sizes: list = () head_dropout: float = 0.5 head_skip_layers: bool = False From f90eb1f053ca1b50efe9f922b04cce52e698e5c1 Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Thu, 28 Nov 2024 23:29:16 +0100 Subject: [PATCH 106/132] adding attention batch-ensemble --- .../layer_utils/batch_ensemble_layer.py | 343 +++++++++++++++++- mambular/arch_utils/transformer_utils.py | 257 +++++++++++++ 2 files changed, 599 insertions(+), 1 deletion(-) diff --git a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py index 2a9e0d5..9ba90ed 100644 --- a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py +++ b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py @@ -1,8 +1,9 @@ import torch import torch.nn as nn -from typing import Literal +from typing import Literal, List import math from typing import Callable +import torch.nn.functional as F class LinearBatchEnsembleLayer(nn.Module): @@ -298,3 +299,343 @@ def forward(self, x: torch.Tensor, hidden: torch.Tensor = None) -> torch.Tensor: ) # Shape: (batch_size, seq_len, ensemble_size, hidden_size * num_directions) return outputs, hidden + + +class MultiHeadAttentionBatchEnsemble(nn.Module): + """ + Multi-head attention module with batch ensembling. + + This module implements the multi-head attention mechanism with optional batch ensembling on selected projections. + Batch ensembling allows for efficient ensembling by sharing weights across ensemble members while introducing + diversity through scaling factors. + + Parameters + ---------- + embed_dim : int + The dimension of the embedding (input and output feature dimension). + num_heads : int + Number of attention heads. + ensemble_size : int + Number of ensemble members. + scaling_init : {'ones', 'random-signs', 'normal'}, optional + Initialization method for the scaling factors `r` and `s`. Default is 'ones'. + - 'ones': Initialize scaling factors to ones. + - 'random-signs': Initialize scaling factors to random signs (+1 or -1). + - 'normal': Initialize scaling factors from a normal distribution (mean=0, std=1). + batch_ensemble_projections : list of str, optional + List of projections to which batch ensembling should be applied. + Valid values are any combination of ['query', 'key', 'value', 'out_proj']. Default is ['query']. + + Attributes + ---------- + embed_dim : int + The dimension of the embedding. + num_heads : int + Number of attention heads. + head_dim : int + Dimension of each attention head (embed_dim // num_heads). 
+ ensemble_size : int + Number of ensemble members. + batch_ensemble_projections : list of str + List of projections to which batch ensembling is applied. + q_proj : nn.Linear + Linear layer for projecting queries. + k_proj : nn.Linear + Linear layer for projecting keys. + v_proj : nn.Linear + Linear layer for projecting values. + out_proj : nn.Linear + Linear layer for projecting outputs. + r : nn.ParameterDict + Dictionary of input scaling factors for batch ensembling. + s : nn.ParameterDict + Dictionary of output scaling factors for batch ensembling. + + Methods + ------- + reset_parameters(scaling_init) + Initialize the parameters of the module. + forward(query, key, value, mask=None) + Perform the forward pass of the multi-head attention with batch ensembling. + process_projection(x, linear_layer, proj_name) + Process a projection with or without batch ensembling. + batch_ensemble_linear(x, linear_layer, r, s) + Apply a linear transformation with batch ensembling. + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + ensemble_size: int, + scaling_init: Literal["ones", "random-signs", "normal"] = "ones", + batch_ensemble_projections: List[str] = ["query"], + ): + super(MultiHeadAttentionBatchEnsemble, self).__init__() + # Ensure embedding dimension is divisible by the number of heads + assert ( + embed_dim % num_heads == 0 + ), "Embedding dimension must be divisible by number of heads." + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.ensemble_size = ensemble_size + self.batch_ensemble_projections = batch_ensemble_projections + + # Linear layers for projecting queries, keys, and values + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + # Output linear layer + self.out_proj = nn.Linear(embed_dim, embed_dim) + + # Batch ensembling parameters + self.r = nn.ParameterDict() + self.s = nn.ParameterDict() + + # Initialize batch ensembling parameters for specified projections + for proj_name in batch_ensemble_projections: + if proj_name == "query": + self.r["q_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.s["q_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + elif proj_name == "key": + self.r["k_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.s["k_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + elif proj_name == "value": + self.r["v_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.s["v_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + elif proj_name == "out_proj": + self.r["out_proj"] = nn.Parameter( + torch.Tensor(ensemble_size, embed_dim) + ) + self.s["out_proj"] = nn.Parameter( + torch.Tensor(ensemble_size, embed_dim) + ) + else: + raise ValueError( + f"Invalid projection name '{proj_name}'. Must be one of 'query', 'key', 'value', 'out_proj'." + ) + + # Initialize parameters + self.reset_parameters(scaling_init) + + def reset_parameters(self, scaling_init: Literal["ones", "random-signs", "normal"]): + """ + Initialize the parameters of the module. + + Parameters + ---------- + scaling_init : {'ones', 'random-signs', 'normal'} + Initialization method for the scaling factors `r` and `s`. + - 'ones': Initialize scaling factors to ones. + - 'random-signs': Initialize scaling factors to random signs (+1 or -1). + - 'normal': Initialize scaling factors from a normal distribution (mean=0, std=1). 
+ + Raises + ------ + ValueError + If an invalid `scaling_init` method is provided. + """ + # Initialize weight matrices using Kaiming uniform initialization + nn.init.kaiming_uniform_(self.q_proj.weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.k_proj.weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.v_proj.weight, a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.out_proj.weight, a=math.sqrt(5)) + + # Initialize biases uniformly + for layer in [self.q_proj, self.k_proj, self.v_proj, self.out_proj]: + if layer.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(layer.weight) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(layer.bias, -bound, bound) + + # Initialize scaling factors r and s based on selected initialization + scaling_init_fn = { + "ones": nn.init.ones_, + "random-signs": lambda x: torch.sign(torch.randn_like(x)), + "normal": lambda x: nn.init.normal_(x, mean=0.0, std=1.0), + } + + init_fn = scaling_init_fn.get(scaling_init) + if init_fn is None: + raise ValueError( + f"Invalid scaling_init '{scaling_init}'. Must be one of 'ones', 'random-signs', 'normal'." + ) + + # Initialize r and s for specified projections + for key in self.r.keys(): + init_fn(self.r[key]) + for key in self.s.keys(): + init_fn(self.s[key]) + + def forward(self, query, key, value, mask=None): + """ + Perform the forward pass of the multi-head attention with batch ensembling. + + Parameters + ---------- + query : torch.Tensor + The query tensor of shape (N, S, E, D), where: + - N: Batch size + - S: Sequence length + - E: Ensemble size + - D: Embedding dimension + key : torch.Tensor + The key tensor of shape (N, S, E, D). + value : torch.Tensor + The value tensor of shape (N, S, E, D). + mask : torch.Tensor, optional + An optional mask tensor that is broadcastable to shape (N, 1, 1, 1, S). Positions with zero in the mask will be masked out. + + Returns + ------- + torch.Tensor + The output tensor of shape (N, S, E, D). + + Raises + ------ + AssertionError + If the ensemble size `E` does not match `self.ensemble_size`. + """ + if x.dim() == 3: # Case: (B, L, D) - no ensembles + batch_size, seq_len, input_size = x.shape + x = x.unsqueeze(2).expand( + -1, -1, self.ensemble_size, -1 + ) # Shape: (B, L, ensemble_size, D) + elif ( + x.dim() == 4 and x.size(2) == self.ensemble_size + ): # Case: (B, L, ensemble_size, D) + batch_size, seq_len, ensemble_size, _ = x.shape + if ensemble_size != self.ensemble_size: + raise ValueError( + f"Input shape {x.shape} is invalid. Expected shape: (B, S, ensemble_size, N)" + ) + else: + raise ValueError( + f"Input shape {x.shape} is invalid. Expected shape: (B, L, D) or (B, L, ensemble_size, D)" + ) + N, S, E, D = query.size() + assert E == self.ensemble_size, "Ensemble size mismatch." 
+ + # Process projections with or without batch ensembling + Q = self.process_projection(query, self.q_proj, "q_proj") # Shape: (N, S, E, D) + K = self.process_projection(key, self.k_proj, "k_proj") # Shape: (N, S, E, D) + V = self.process_projection(value, self.v_proj, "v_proj") # Shape: (N, S, E, D) + + # Reshape for multi-head attention + Q = Q.view(N, S, E, self.num_heads, self.head_dim).permute( + 0, 2, 3, 1, 4 + ) # (N, E, num_heads, S, head_dim) + K = K.view(N, S, E, self.num_heads, self.head_dim).permute(0, 2, 3, 1, 4) + V = V.view(N, S, E, self.num_heads, self.head_dim).permute(0, 2, 3, 1, 4) + + # Compute scaled dot-product attention + attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt( + self.head_dim + ) # (N, E, num_heads, S, S) + + if mask is not None: + # Expand mask to match attn_scores shape + mask = mask.unsqueeze(1).unsqueeze(1) # (N, 1, 1, 1, S) + attn_scores = attn_scores.masked_fill(mask == 0, float("-inf")) + + attn_weights = F.softmax(attn_scores, dim=-1) # (N, E, num_heads, S, S) + + # Apply attention weights to values + context = torch.matmul(attn_weights, V) # (N, E, num_heads, S, head_dim) + + # Reshape and permute back to (N, S, E, D) + context = ( + context.permute(0, 3, 1, 2, 4).contiguous().view(N, S, E, self.embed_dim) + ) # (N, S, E, D) + + # Apply output projection + output = self.process_projection( + context, self.out_proj, "out_proj" + ) # (N, S, E, D) + + return output + + def process_projection(self, x, linear_layer, proj_name): + """ + Process a projection (query, key, value, or output) with or without batch ensembling. + + Parameters + ---------- + x : torch.Tensor + The input tensor of shape (N, S, E, D_in), where: + - N: Batch size + - S: Sequence length + - E: Ensemble size + - D_in: Input feature dimension + linear_layer : torch.nn.Linear + The linear layer to apply. + proj_name : str + The name of the projection ('q_proj', 'k_proj', 'v_proj', or 'out_proj'). + + Returns + ------- + torch.Tensor + The output tensor of shape (N, S, E, D_out). + """ + if proj_name in self.batch_ensemble_projections: + # Apply batch ensemble linear layer + r = self.r[proj_name] + s = self.s[proj_name] + return self.batch_ensemble_linear(x, linear_layer, r, s) + else: + # Process normally without batch ensembling + N, S, E, D_in = x.size() + x = x.view(N * E, S, D_in) # Combine batch and ensemble dimensions + y = linear_layer(x) # Apply linear layer + D_out = y.size(-1) + y = y.view(N, E, S, D_out).permute(0, 2, 1, 3) # (N, S, E, D_out) + return y + + def batch_ensemble_linear(self, x, linear_layer, r, s): + """ + Apply a linear transformation with batch ensembling. + + Parameters + ---------- + x : torch.Tensor + The input tensor of shape (N, S, E, D_in), where: + - N: Batch size + - S: Sequence length + - E: Ensemble size + - D_in: Input feature dimension + linear_layer : torch.nn.Linear + The linear layer with weight matrix `W` of shape (D_out, D_in). + r : torch.Tensor + The input scaling factors of shape (E, D_in). + s : torch.Tensor + The output scaling factors of shape (E, D_out). + + Returns + ------- + torch.Tensor + The output tensor of shape (N, S, E, D_out). 
+ """ + W = linear_layer.weight # Shape: (D_out, D_in) + b = linear_layer.bias # Shape: (D_out) + + N, S, E, D_in = x.shape + D_out = W.shape[0] + + # Multiply input by r + x_r = x * r.view(1, 1, E, D_in) # (N, S, E, D_in) + + # Reshape x_r to (N*S*E, D_in) + x_r = x_r.view(-1, D_in) # (N*S*E, D_in) + + # Compute x_r @ W^T + b + y = F.linear(x_r, W, b) # (N*S*E, D_out) + + # Reshape y back to (N, S, E, D_out) + y = y.view(N, S, E, D_out) # (N, S, E, D_out) + + # Multiply by s + y = y * s.view(1, 1, E, D_out) # (N, S, E, D_out) + + return y diff --git a/mambular/arch_utils/transformer_utils.py b/mambular/arch_utils/transformer_utils.py index 34233d8..4334b84 100644 --- a/mambular/arch_utils/transformer_utils.py +++ b/mambular/arch_utils/transformer_utils.py @@ -1,6 +1,10 @@ import torch import torch.nn as nn import torch.nn.functional as F +from .layer_utils.batch_ensemble_layer import ( + LinearBatchEnsembleLayer, + MultiHeadAttentionBatchEnsemble, +) def reglu(x): @@ -70,3 +74,256 @@ def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=False src = src + self.dropout2(src2) src = self.norm2(src) return src + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from typing import Optional, List, Literal +import copy + + +class BatchEnsembleTransformerEncoderLayer(nn.Module): + """ + Transformer Encoder Layer with Batch Ensembling. + + This class implements a single layer of the Transformer encoder with batch ensembling applied to the + multi-head attention and feedforward network as desired. + + Parameters + ---------- + embed_dim : int + The dimension of the embedding. + num_heads : int + Number of attention heads. + ensemble_size : int + Number of ensemble members. + dim_feedforward : int, optional + Dimension of the feedforward network model. Default is 2048. + dropout : float, optional + Dropout value. Default is 0.1. + activation : {'relu', 'gelu'}, optional + Activation function of the intermediate layer. Default is 'relu'. + scaling_init : {'ones', 'random-signs', 'normal'}, optional + Initialization method for the scaling factors in batch ensembling. Default is 'ones'. + batch_ensemble_projections : list of str, optional + List of projections to which batch ensembling should be applied in the attention layer. + Default is ['query']. + batch_ensemble_ffn : bool, optional + Whether to apply batch ensembling to the feedforward network. Default is False. 
+ + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + ensemble_size: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Literal["relu", "gelu"] = "relu", + scaling_init: Literal["ones", "random-signs", "normal"] = "ones", + batch_ensemble_projections: List[str] = ["query"], + batch_ensemble_ffn: bool = False, + ): + super(BatchEnsembleTransformerEncoderLayer, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.ensemble_size = ensemble_size + self.dim_feedforward = dim_feedforward + self.dropout = dropout + self.activation = activation + self.batch_ensemble_ffn = batch_ensemble_ffn + + # Multi-head attention with batch ensembling + self.self_attn = MultiHeadAttentionBatchEnsemble( + embed_dim=embed_dim, + num_heads=num_heads, + ensemble_size=ensemble_size, + scaling_init=scaling_init, + batch_ensemble_projections=batch_ensemble_projections, + ) + + # Feedforward network + if batch_ensemble_ffn: + # Apply batch ensembling to the feedforward network + self.linear1 = BatchEnsembleLinear( + embed_dim, dim_feedforward, ensemble_size, scaling_init + ) + self.linear2 = BatchEnsembleLinear( + dim_feedforward, embed_dim, ensemble_size, scaling_init + ) + else: + # Standard feedforward network + self.linear1 = nn.Linear(embed_dim, dim_feedforward) + self.linear2 = nn.Linear(dim_feedforward, embed_dim) + + self.norm1 = nn.LayerNorm(embed_dim) + self.norm2 = nn.LayerNorm(embed_dim) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + # Activation function + if activation == "relu": + self.activation_fn = F.relu + elif activation == "gelu": + self.activation_fn = F.gelu + else: + raise ValueError( + f"Invalid activation '{activation}'. Choose from 'relu' or 'gelu'." + ) + + def forward(self, src, src_mask: Optional[torch.Tensor] = None): + """ + Pass the input through the encoder layer. + + Parameters + ---------- + src : torch.Tensor + The input tensor of shape (N, S, E, D), where: + - N: Batch size + - S: Sequence length + - E: Ensemble size + - D: Embedding dimension + src_mask : torch.Tensor, optional + The source mask tensor. + + Returns + ------- + torch.Tensor + The output tensor of shape (N, S, E, D). + + """ + # Self-attention + src2 = self.self_attn(src, src, src, mask=src_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # Feedforward network + if self.batch_ensemble_ffn: + src2 = self.linear2(self.dropout(self.activation_fn(self.linear1(src)))) + else: + N, S, E, D = src.shape + src_reshaped = src.view(N * E * S, D) + src2 = self.linear1(src_reshaped) + src2 = self.activation_fn(src2) + src2 = self.dropout(src2) + src2 = self.linear2(src2) + src2 = src2.view(N, S, E, D) + + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def dropout(self, x): + """ + Apply dropout to the input tensor. + + Parameters + ---------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Output tensor after applying dropout. + + """ + return F.dropout(x, p=self.dropout, training=self.training) + + +class BatchEnsembleTransformerEncoder(nn.Module): + """ + Transformer Encoder with Batch Ensembling. + + This class implements the Transformer encoder consisting of multiple encoder layers with batch ensembling. + + Parameters + ---------- + num_layers : int + Number of encoder layers to stack. + embed_dim : int + The dimension of the embedding. + num_heads : int + Number of attention heads. + ensemble_size : int + Number of ensemble members. 
+ dim_feedforward : int, optional + Dimension of the feedforward network model. Default is 2048. + dropout : float, optional + Dropout value. Default is 0.1. + activation : {'relu', 'gelu'}, optional + Activation function of the intermediate layer. Default is 'relu'. + scaling_init : {'ones', 'random-signs', 'normal'}, optional + Initialization method for the scaling factors in batch ensembling. Default is 'ones'. + batch_ensemble_projections : list of str, optional + List of projections to which batch ensembling should be applied in the attention layer. + Default is ['query']. + batch_ensemble_ffn : bool, optional + Whether to apply batch ensembling to the feedforward network. Default is False. + norm : nn.Module, optional + Optional layer normalization module. + + """ + + def __init__( + self, + num_layers: int, + embed_dim: int, + num_heads: int, + ensemble_size: int, + dim_feedforward: int = 2048, + dropout: float = 0.1, + activation: Literal["relu", "gelu"] = "relu", + scaling_init: Literal["ones", "random-signs", "normal"] = "ones", + batch_ensemble_projections: List[str] = ["query"], + batch_ensemble_ffn: bool = False, + norm: Optional[nn.Module] = None, + ): + super(BatchEnsembleTransformerEncoder, self).__init__() + self.layers = nn.ModuleList( + [ + BatchEnsembleTransformerEncoderLayer( + embed_dim=embed_dim, + num_heads=num_heads, + ensemble_size=ensemble_size, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + scaling_init=scaling_init, + batch_ensemble_projections=batch_ensemble_projections, + batch_ensemble_ffn=batch_ensemble_ffn, + ) + for _ in range(num_layers) + ] + ) + self.norm = norm + + def forward(self, src, mask: Optional[torch.Tensor] = None): + """ + Pass the input through the encoder layers in turn. + + Parameters + ---------- + src : torch.Tensor + The input tensor of shape (N, S, E, D). + mask : torch.Tensor, optional + The source mask tensor. + + Returns + ------- + torch.Tensor + The output tensor of shape (N, S, E, D). + """ + output = src + + for layer in self.layers: + output = layer(output, src_mask=mask) + + if self.norm is not None: + output = self.norm(output) + + return output From b1eccd622745ee6e0c20d41fe39b3d74e5ca024a Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:53:39 +0000 Subject: [PATCH 107/132] adding tabular cnn --- mambular/arch_utils/cnn_utils.py | 68 +++++++++++++++++++++++++++ mambular/base_models/cnn.py | 79 ++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 mambular/arch_utils/cnn_utils.py create mode 100644 mambular/base_models/cnn.py diff --git a/mambular/arch_utils/cnn_utils.py b/mambular/arch_utils/cnn_utils.py new file mode 100644 index 0000000..8822374 --- /dev/null +++ b/mambular/arch_utils/cnn_utils.py @@ -0,0 +1,68 @@ +import torch.nn as nn + + +class CNNBlock(nn.Module): + """ + A modular CNN block that allows for configurable convolutional, pooling, and dropout layers. + + Attributes + ---------- + cnn : nn.Sequential + A sequential container holding the convolutional, activation, pooling, and dropout layers. + + Methods + ------- + forward(x): + Defines the forward pass of the CNNBlock. 
+ """ + + def __init__(self, config): + super().__init__() + layers = [] + in_channels = config.input_channels + + # Ensure dropout_positions is a list + dropout_positions = config.dropout_positions or [] + + for i in range(config.num_layers): + # Convolutional layer + layers.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.out_channels_list[i], + kernel_size=config.kernel_size_list[i], + stride=config.stride_list[i], + padding=config.padding_list[i], + ) + ) + layers.append(nn.ReLU()) + + # Pooling layer + if config.pooling_method == "max": + layers.append( + nn.MaxPool2d( + kernel_size=config.pooling_kernel_size_list[i], + stride=config.pooling_stride_list[i], + ) + ) + elif config.pooling_method == "avg": + layers.append( + nn.AvgPool2d( + kernel_size=config.pooling_kernel_size_list[i], + stride=config.pooling_stride_list[i], + ) + ) + + # Dropout layer + if i in dropout_positions: + layers.append(nn.Dropout(p=config.dropout_rate)) + + in_channels = config.out_channels_list[i] + + self.cnn = nn.Sequential(*layers) + + def forward(self, x): + # Ensure input has shape (N, C, H, W) + if x.dim() == 3: + x = x.unsqueeze(1) + return self.cnn(x) diff --git a/mambular/base_models/cnn.py b/mambular/base_models/cnn.py new file mode 100644 index 0000000..386e7ac --- /dev/null +++ b/mambular/base_models/cnn.py @@ -0,0 +1,79 @@ +import torch +import torch.nn as nn +from ..configs.cnn_config import DefaultCNNConfig +from .basemodel import BaseModel +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.cnn_utils import CNNBlock + + +class CNN(BaseModel): + """ + A convolutional neural network (CNN) model designed for tabular data with support for categorical + and numerical features, configurable embeddings, and dynamic flattened size computation. + + Attributes + ---------- + embedding_layer : EmbeddingLayer + A layer that generates embeddings for categorical and numerical features. + cnn : CNNBlock + A modular CNN block for feature extraction. + fc : nn.Sequential + A fully connected layer for final predictions. + + Methods + ------- + forward(num_features, cat_features): + Forward pass through the embedding, CNN, and fully connected layers. 
+ """ + + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes=1, + config: DefaultCNNConfig = DefaultCNNConfig(), + **kwargs, + ): + super().__init__(config=config, **kwargs) + self.save_hyperparameters(ignore=[]) + + self.returns_ensemble = False + self.n_features = len(num_feature_info) + len(cat_feature_info) + + # Initialize the embedding layer + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) + + # CNN block + self.cnn = CNNBlock(config) + n_features = len(num_feature_info) + len(cat_feature_info) + + # Dynamically compute flattened size + with torch.no_grad(): + sample_input = torch.zeros( + 1, + config.input_channels, + n_features, + config.d_model, + ) + sample_output = self.cnn(sample_input) + flattened_size = sample_output.view(1, -1).size(1) + print(flattened_size) + + # Fully connected layers + self.fc = nn.Sequential( + nn.Flatten(), + nn.Linear(flattened_size, num_classes), + ) + + def forward(self, num_features, cat_features): + x = self.embedding_layer(num_features, cat_features) + x = x.unsqueeze(1) + # Generate embeddings (x) with shape (N, J, D) + + x = self.cnn(x) + preds = self.fc(x) + return preds From 650c911ab952b4a22c87f1e96817e22967884421 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:53:53 +0000 Subject: [PATCH 108/132] adding transformer BE layer --- .../layer_utils/batch_ensemble_layer.py | 37 +++++-------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py index 9ba90ed..21fe759 100644 --- a/mambular/arch_utils/layer_utils/batch_ensemble_layer.py +++ b/mambular/arch_utils/layer_utils/batch_ensemble_layer.py @@ -393,18 +393,17 @@ def __init__( # Batch ensembling parameters self.r = nn.ParameterDict() self.s = nn.ParameterDict() - # Initialize batch ensembling parameters for specified projections for proj_name in batch_ensemble_projections: if proj_name == "query": - self.r["q_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) - self.s["q_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.r["query"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.s["query"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) elif proj_name == "key": - self.r["k_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) - self.s["k_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.r["key"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.s["key"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) elif proj_name == "value": - self.r["v_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) - self.s["v_proj"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.r["value"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) + self.s["value"] = nn.Parameter(torch.Tensor(ensemble_size, embed_dim)) elif proj_name == "out_proj": self.r["out_proj"] = nn.Parameter( torch.Tensor(ensemble_size, embed_dim) @@ -498,30 +497,14 @@ def forward(self, query, key, value, mask=None): AssertionError If the ensemble size `E` does not match `self.ensemble_size`. 
""" - if x.dim() == 3: # Case: (B, L, D) - no ensembles - batch_size, seq_len, input_size = x.shape - x = x.unsqueeze(2).expand( - -1, -1, self.ensemble_size, -1 - ) # Shape: (B, L, ensemble_size, D) - elif ( - x.dim() == 4 and x.size(2) == self.ensemble_size - ): # Case: (B, L, ensemble_size, D) - batch_size, seq_len, ensemble_size, _ = x.shape - if ensemble_size != self.ensemble_size: - raise ValueError( - f"Input shape {x.shape} is invalid. Expected shape: (B, S, ensemble_size, N)" - ) - else: - raise ValueError( - f"Input shape {x.shape} is invalid. Expected shape: (B, L, D) or (B, L, ensemble_size, D)" - ) + N, S, E, D = query.size() assert E == self.ensemble_size, "Ensemble size mismatch." # Process projections with or without batch ensembling - Q = self.process_projection(query, self.q_proj, "q_proj") # Shape: (N, S, E, D) - K = self.process_projection(key, self.k_proj, "k_proj") # Shape: (N, S, E, D) - V = self.process_projection(value, self.v_proj, "v_proj") # Shape: (N, S, E, D) + Q = self.process_projection(query, self.q_proj, "query") # Shape: (N, S, E, D) + K = self.process_projection(key, self.k_proj, "key") # Shape: (N, S, E, D) + V = self.process_projection(value, self.v_proj, "value") # Shape: (N, S, E, D) # Reshape for multi-head attention Q = Q.view(N, S, E, self.num_heads, self.head_dim).permute( From 3b8e585613091bf11e6f5b739430037a34511a55 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:54:14 +0000 Subject: [PATCH 109/132] adjusting embedding layer --- mambular/arch_utils/layer_utils/embedding_layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 72d5ec6..f1ad244 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -161,7 +161,7 @@ def forward(self, num_features=None, cat_features=None): if self.embedding_type == "plr": # For PLR, pass all numerical features together if num_features is not None: - num_features = torch.stack(num_features, dim=1).squeeze( + num_features = torch.cat(num_features, dim=1).squeeze( -1 ) # Stack features along the feature dimension num_embeddings = self.num_embeddings( From 27ff55605813eb8875393ff6307796653d21f0c6 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:54:24 +0000 Subject: [PATCH 110/132] adjusting embedding layer --- mambular/arch_utils/layer_utils/embedding_layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index f1ad244..72d5ec6 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -161,7 +161,7 @@ def forward(self, num_features=None, cat_features=None): if self.embedding_type == "plr": # For PLR, pass all numerical features together if num_features is not None: - num_features = torch.cat(num_features, dim=1).squeeze( + num_features = torch.stack(num_features, dim=1).squeeze( -1 ) # Stack features along the feature dimension num_embeddings = self.num_embeddings( From 578ff77bafb6549066bdba3ef0050472b778ea78 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:54:40 +0000 Subject: [PATCH 111/132] including ftet --- mambular/arch_utils/transformer_utils.py | 162 ++++++++++++++--------- mambular/base_models/ftet.py | 130 ++++++++++++++++++ 2 files changed, 230 insertions(+), 62 deletions(-) create 
mode 100644 mambular/base_models/ftet.py diff --git a/mambular/arch_utils/transformer_utils.py b/mambular/arch_utils/transformer_utils.py index 4334b84..20ca280 100644 --- a/mambular/arch_utils/transformer_utils.py +++ b/mambular/arch_utils/transformer_utils.py @@ -5,6 +5,7 @@ LinearBatchEnsembleLayer, MultiHeadAttentionBatchEnsemble, ) +from typing import Optional, List, Literal def reglu(x): @@ -76,14 +77,6 @@ def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=False return src -import torch -import torch.nn as nn -import torch.nn.functional as F -import math -from typing import Optional, List, Literal -import copy - - class BatchEnsembleTransformerEncoderLayer(nn.Module): """ Transformer Encoder Layer with Batch Ensembling. @@ -126,6 +119,7 @@ def __init__( scaling_init: Literal["ones", "random-signs", "normal"] = "ones", batch_ensemble_projections: List[str] = ["query"], batch_ensemble_ffn: bool = False, + ensemble_bias=False, ): super(BatchEnsembleTransformerEncoderLayer, self).__init__() @@ -133,7 +127,7 @@ def __init__( self.num_heads = num_heads self.ensemble_size = ensemble_size self.dim_feedforward = dim_feedforward - self.dropout = dropout + self.dropout = nn.Dropout(dropout) self.activation = activation self.batch_ensemble_ffn = batch_ensemble_ffn @@ -149,11 +143,19 @@ def __init__( # Feedforward network if batch_ensemble_ffn: # Apply batch ensembling to the feedforward network - self.linear1 = BatchEnsembleLinear( - embed_dim, dim_feedforward, ensemble_size, scaling_init + self.linear1 = LinearBatchEnsembleLayer( + embed_dim, + dim_feedforward, + ensemble_size, + scaling_init=scaling_init, + ensemble_bias=ensemble_bias, ) - self.linear2 = BatchEnsembleLinear( - dim_feedforward, embed_dim, ensemble_size, scaling_init + self.linear2 = LinearBatchEnsembleLayer( + dim_feedforward, + embed_dim, + ensemble_size, + scaling_init=scaling_init, + ensemble_bias=ensemble_bias, ) else: # Standard feedforward network @@ -217,23 +219,6 @@ def forward(self, src, src_mask: Optional[torch.Tensor] = None): src = self.norm2(src) return src - def dropout(self, x): - """ - Apply dropout to the input tensor. - - Parameters - ---------- - x : torch.Tensor - Input tensor. - - Returns - ------- - torch.Tensor - Output tensor after applying dropout. 
- - """ - return F.dropout(x, p=self.dropout, training=self.training) - class BatchEnsembleTransformerEncoder(nn.Module): """ @@ -271,38 +256,77 @@ class BatchEnsembleTransformerEncoder(nn.Module): def __init__( self, - num_layers: int, - embed_dim: int, - num_heads: int, - ensemble_size: int, - dim_feedforward: int = 2048, - dropout: float = 0.1, - activation: Literal["relu", "gelu"] = "relu", - scaling_init: Literal["ones", "random-signs", "normal"] = "ones", - batch_ensemble_projections: List[str] = ["query"], - batch_ensemble_ffn: bool = False, - norm: Optional[nn.Module] = None, + config, ): super(BatchEnsembleTransformerEncoder, self).__init__() - self.layers = nn.ModuleList( - [ - BatchEnsembleTransformerEncoderLayer( - embed_dim=embed_dim, - num_heads=num_heads, - ensemble_size=ensemble_size, - dim_feedforward=dim_feedforward, - dropout=dropout, - activation=activation, - scaling_init=scaling_init, - batch_ensemble_projections=batch_ensemble_projections, - batch_ensemble_ffn=batch_ensemble_ffn, - ) - for _ in range(num_layers) - ] + d_model = getattr(config, "d_model", 128) + nhead = getattr(config, "n_heads", 8) + dim_feedforward = getattr(config, "transformer_dim_feedforward", 256) + dropout = getattr(config, "attn_dropout", 0.5) + activation = getattr(config, "transformer_activation", F.relu) + num_layers = getattr(config, "n_layers", 4) + ff_dropout = getattr(config, "ff_dropout", 0.5) + ensemble_projections = getattr(config, "batch_ensemble_projections", ["query"]) + scaling_init = getattr(config, "scaling_init", "ones") + batch_ensemble_ffn = getattr(config, "batch_ensemble_ffn", False) + ensemble_bias = getattr(config, "ensemble_bias", False) + model_type = getattr(config, "model_type", "full") + scaling_init = getattr(config, "scaling_init", "ones") + + self.ensemble_size = getattr(config, "ensemble_size", 32) + + self.layers = nn.ModuleList() + + self.layers.append( + BatchEnsembleTransformerEncoderLayer( + embed_dim=d_model, + num_heads=nhead, + ensemble_size=self.ensemble_size, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + batch_ensemble_projections=ensemble_projections, + batch_ensemble_ffn=batch_ensemble_ffn, + scaling_init="normal", + ensemble_bias=ensemble_bias, + ) ) - self.norm = norm - def forward(self, src, mask: Optional[torch.Tensor] = None): + for i in range(1, num_layers): + if model_type == "mini": + self.layers.append( + BatchEnsembleTransformerEncoderLayer( + embed_dim=d_model, + num_heads=nhead, + ensemble_size=self.ensemble_size, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + scaling_init=scaling_init, + batch_ensemble_projections=[], + batch_ensemble_ffn=False, + ensemble_bias=ensemble_bias, + ) + ) + + else: + self.layers.append( + BatchEnsembleTransformerEncoderLayer( + embed_dim=d_model, + num_heads=nhead, + ensemble_size=self.ensemble_size, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + batch_ensemble_projections=ensemble_projections, + batch_ensemble_ffn=batch_ensemble_ffn, + ensemble_bias=ensemble_bias, + ) + ) + + self.ensemble_projections = ensemble_projections + + def forward(self, x, mask: Optional[torch.Tensor] = None): """ Pass the input through the encoder layers in turn. @@ -318,12 +342,26 @@ def forward(self, src, mask: Optional[torch.Tensor] = None): torch.Tensor The output tensor of shape (N, S, E, D). 
""" - output = src + if x.dim() == 3: # Case: (B, L, D) - no ensembles + batch_size, seq_len, input_size = x.shape + x = x.unsqueeze(2).expand( + -1, -1, self.ensemble_size, -1 + ) # Shape: (B, L, ensemble_size, D) + elif ( + x.dim() == 4 and x.size(2) == self.ensemble_size + ): # Case: (B, L, ensemble_size, D) + batch_size, seq_len, ensemble_size, _ = x.shape + if ensemble_size != self.ensemble_size: + raise ValueError( + f"Input shape {x.shape} is invalid. Expected shape: (B, S, ensemble_size, N)" + ) + else: + raise ValueError( + f"Input shape {x.shape} is invalid. Expected shape: (B, L, D) or (B, L, ensemble_size, D)" + ) + output = x for layer in self.layers: output = layer(output, src_mask=mask) - if self.norm is not None: - output = self.norm(output) - return output diff --git a/mambular/base_models/ftet.py b/mambular/base_models/ftet.py new file mode 100644 index 0000000..fa9bd23 --- /dev/null +++ b/mambular/base_models/ftet.py @@ -0,0 +1,130 @@ +import torch +import torch.nn as nn +from ..arch_utils.mlp_utils import MLPhead +from ..arch_utils.get_norm_fn import get_normalization_layer +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.transformer_utils import BatchEnsembleTransformerEncoder +from ..configs.ftet_config import DefaultFTETConfig +from .basemodel import BaseModel +from ..arch_utils.layer_utils.sn_linear import SNLinear + + +class FTET(BaseModel): + """ + A Feature Transformer model for tabular data with categorical and numerical features, using embedding, transformer + encoding, and pooling to produce final predictions. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. + config : DefaultFTTransformerConfig, optional + Configuration object containing model hyperparameters such as dropout rates, hidden layer sizes, + transformer settings, and other architectural configurations, by default DefaultFTTransformerConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + pooling_method : str + The pooling method to aggregate features after transformer encoding. + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + embedding_layer : EmbeddingLayer + Layer for embedding categorical and numerical features. + norm_f : nn.Module + Normalization layer for the transformer output. + encoder : nn.TransformerEncoder + Transformer encoder for sequential processing of embedded features. + tabular_head : MLPhead + MLPhead layer to produce the final prediction based on the output of the transformer encoder. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding, transformer encoding, pooling, and prediction steps. 
+ + """ + + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes=1, + config: DefaultFTETConfig = DefaultFTETConfig(), + **kwargs, + ): + super().__init__(config=config, **kwargs) + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + if not self.hparams.average_ensembles: + self.returns_ensemble = True # Directly set ensemble flag + else: + self.returns_ensemble = False + + self.cat_feature_info = cat_feature_info + self.num_feature_info = num_feature_info + + # embedding layer + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) + + # transformer encoder + self.norm_f = get_normalization_layer(config) + self.encoder = BatchEnsembleTransformerEncoder(config) + + if self.hparams.average_ensembles: + self.final_layer = nn.Linear(self.hparams.d_model, num_classes) + else: + self.final_layer = SNLinear( + self.hparams.ensemble_size, + self.hparams.d_model, + num_classes, + ) + + # pooling + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + + def forward(self, num_features, cat_features): + """ + Defines the forward pass of the model. + + Parameters + ---------- + num_features : Tensor + Tensor containing the numerical features. + cat_features : Tensor + Tensor containing the categorical features. + + Returns + ------- + Tensor + The output predictions of the model. + """ + x = self.embedding_layer(num_features, cat_features) + + x = self.encoder(x) + + x = self.pool_sequence(x) # Shape: (batch_size, ensemble_size, hidden_size) + + if self.hparams.average_ensembles: + x = x.mean(axis=1) # Shape (batch_size, num_classes) + + x = self.final_layer( + x + ) # Shape (batch_size, (ensemble_size), num_classes) if not averaged + + if not self.hparams.average_ensembles: + x = x.squeeze(-1) + + return x From fe161b1d92a3edf792aefc7b73969316f2076514 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:55:09 +0000 Subject: [PATCH 112/132] adjusting default configs with new doc structure --- mambular/configs/__init__.py | 6 +- mambular/configs/cnn_config.py | 95 ++++++++++++++++ mambular/configs/ftet_config.py | 105 ++++++++++++++++++ mambular/configs/fttransformer_config.py | 3 + mambular/configs/tabm_config.py | 10 +- mambular/configs/tabtransformer_config.py | 2 + .../{batchtabrnn_config.py => trem_config.py} | 4 +- 7 files changed, 216 insertions(+), 9 deletions(-) create mode 100644 mambular/configs/cnn_config.py create mode 100644 mambular/configs/ftet_config.py rename mambular/configs/{batchtabrnn_config.py => trem_config.py} (97%) diff --git a/mambular/configs/__init__.py b/mambular/configs/__init__.py index 7935925..bf9b7b3 100644 --- a/mambular/configs/__init__.py +++ b/mambular/configs/__init__.py @@ -9,7 +9,8 @@ from .ndtf_config import DefaultNDTFConfig from .node_config import DefaultNODEConfig from .tabm_config import DefaultTabMConfig -from .batchtabrnn_config import DefaultBatchTabRNNConfig +from .trem_config import DefaultTREMConfig +from .cnn_config import DefaultCNNConfig __all__ = [ @@ -24,5 +25,6 @@ "DefaultNDTFConfig", "DefaultNODEConfig", "DefaultTabMConfig", - "DefaultBatchTabRNNConfig", + "DefaultTREMConfig", + "DefaultCNNConfig", ] diff --git a/mambular/configs/cnn_config.py b/mambular/configs/cnn_config.py new file mode 100644 index 0000000..825c9cd --- /dev/null +++ b/mambular/configs/cnn_config.py @@ -0,0 +1,95 @@ +from dataclasses import dataclass +import 
torch.nn as nn + + +@dataclass +class DefaultCNNConfig: + """ + Configuration class for the default CNN model with predefined hyperparameters. + + Optimizer Parameters + -------------------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which the learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 regularization) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + + Embedding Parameters + --------------------- + use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + d_model : int, default=32 + Dimensionality of the embeddings. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). + + CNN Parameters + -------------------- + input_channels : int, default=1 + Number of input channels (e.g., 1 for grayscale images). + num_layers : int, default=4 + Number of convolutional layers. + out_channels_list : list, default=(64, 64, 128, 128) + List of output channels for each convolutional layer. + kernel_size_list : list, default=(3, 3, 3, 3) + List of kernel sizes for each convolutional layer. + stride_list : list, default=(1, 1, 1, 1) + List of stride values for each convolutional layer. + padding_list : list, default=(1, 1, 1, 1) + List of padding values for each convolutional layer. + pooling_method : str, default="max" + Pooling method ('max' or 'avg'). + pooling_kernel_size_list : list, default=(2, 2, 1, 1) + List of kernel sizes for pooling layers for each convolutional layer. + pooling_stride_list : list, default=(2, 2, 1, 1) + List of stride values for pooling layers for each convolutional layer. + + Dropout Parameters + ------------------- + dropout_rate : float, default=0.5 + Probability of dropping neurons during training. + dropout_positions : list, default=None + List of indices of layers after which dropout should be applied. If None, no dropout is applied. 
+ """ + + # Optimizer parameters + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + + # Embedding parameters + layer_norm: bool = False + layer_norm_eps: float = 1e-05 + use_embeddings: bool = False + embedding_activation: callable = nn.Identity() + embedding_type: str = "linear" + embedding_bias: bool = False + layer_norm_after_embedding: bool = False + d_model: int = 32 + plr_lite: bool = False + + # CNN parameters + input_channels: int = 1 + num_layers: int = 4 + out_channels_list: list = (64, 64, 128, 128) + kernel_size_list: list = (3, 3, 3, 3) + stride_list: list = (1, 1, 1, 1) + padding_list: list = (1, 1, 1, 1) + pooling_method: str = "max" + pooling_kernel_size_list: list = (2, 2, 1, 1) + pooling_stride_list: list = (2, 2, 1, 1) + dropout_rate: float = 0.5 # Probability to drop neurons + dropout_positions: list = None diff --git a/mambular/configs/ftet_config.py b/mambular/configs/ftet_config.py new file mode 100644 index 0000000..538e98a --- /dev/null +++ b/mambular/configs/ftet_config.py @@ -0,0 +1,105 @@ +from dataclasses import dataclass +import torch.nn as nn +from ..arch_utils.transformer_utils import ReGLU +from typing import Optional, List, Literal + + +@dataclass +class DefaultFTETConfig: + """ + Configuration class for the FTET model with predefined hyperparameters. + + Attributes + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which the learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 regularization) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + d_model : int, default=128 + Dimensionality of the transformer model. + n_layers : int, default=4 + Number of transformer layers. + n_heads : int, default=8 + Number of attention heads in the transformer. + attn_dropout : float, default=0.2 + Dropout rate for the attention mechanism. + ff_dropout : float, default=0.1 + Dropout rate for the feed-forward layers. + norm : str, default="LayerNorm" + Type of normalization to be used ('LayerNorm', 'RMSNorm', etc.). + activation : callable, default=nn.SELU() + Activation function for the transformer layers. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in embedding layers. + head_layer_sizes : list, default=() + Sizes of the fully connected layers in the model's head. + head_dropout : float, default=0.5 + Dropout rate for the head layers. + head_skip_layers : bool, default=False + Whether to use skip connections in the head layers. + head_activation : callable, default=nn.SELU() + Activation function for the head layers. + head_use_batch_norm : bool, default=False + Whether to use batch normalization in the head layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding layers. + pooling_method : str, default="avg" + Pooling method to be used ('cls', 'avg', etc.). + use_cls : bool, default=False + Whether to use a CLS token for pooling. + norm_first : bool, default=False + Whether to apply normalization before other operations in each transformer block. + bias : bool, default=True + Whether to use bias in linear layers. 
+ transformer_activation : callable, default=ReGLU() + Activation function for the transformer feed-forward layers. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization to improve numerical stability. + transformer_dim_feedforward : int, default=256 + Dimensionality of the feed-forward layers in the transformer. + cat_encoding : str, default="int" + Method for encoding categorical features ('int', 'one-hot', or 'linear'). + """ + + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 128 + n_layers: int = 4 + n_heads: int = 8 + attn_dropout: float = 0.3 + ff_dropout: float = 0.5 + norm: str = "LayerNorm" + activation: callable = nn.SELU() + embedding_activation: callable = nn.ReLU() + embedding_type: str = "linear" + embedding_bias: bool = False + layer_norm_after_embedding: bool = False + pooling_method: str = "avg" + use_cls: bool = False + norm_first: bool = False + bias: bool = True + transformer_activation: str = "relu" + layer_norm_eps: float = 1e-05 + transformer_dim_feedforward: int = 256 + cat_encoding: str = "int" + + # Batch ensembling specific configurations + ensemble_size: int = 32 + ensemble_scaling_in: bool = True + ensemble_scaling_out: bool = True + ensemble_bias: bool = True + scaling_init: Literal["ones", "random-signs", "normal"] = "normal" + average_ensembles: bool = False + model_type: Literal["mini", "full"] = "full" + batch_ensemble_projections: list = ("query", "key", "value", "out_proj") + abtch_ensemble_ffn: bool = False diff --git a/mambular/configs/fttransformer_config.py b/mambular/configs/fttransformer_config.py index 19f6641..d154c24 100644 --- a/mambular/configs/fttransformer_config.py +++ b/mambular/configs/fttransformer_config.py @@ -34,6 +34,8 @@ class DefaultFTTransformerConfig: Activation function for the transformer layers. embedding_activation : callable, default=nn.Identity() Activation function for embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). embedding_bias : bool, default=False Whether to use bias in embedding layers. 
head_layer_sizes : list, default=() @@ -78,6 +80,7 @@ class DefaultFTTransformerConfig: norm: str = "LayerNorm" activation: callable = nn.SELU() embedding_activation: callable = nn.Identity() + embedding_type: str = "linear" embedding_bias: bool = False head_layer_sizes: list = () head_dropout: float = 0.5 diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index e39d26e..2c967c2 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -82,9 +82,9 @@ class DefaultTabMConfig: lr_factor: float = 0.1 # arch params - layer_sizes: list = (512, 512, 128) + layer_sizes: list = (512, 512, 256) activation: callable = nn.ReLU() - dropout: float = 0.3 + dropout: float = 0.2 norm: str = None use_glu: bool = False batch_norm: bool = False @@ -97,15 +97,15 @@ class DefaultTabMConfig: embedding_bias = False plr_lite: bool = False average_embeddings: bool = False - embedding_activation: callable = nn.ReLU() + embedding_activation: callable = nn.Identity() layer_norm_after_embedding: bool = False - d_model: int = 64 + d_model: int = 128 # Batch ensembling specific configurations ensemble_size: int = 32 ensemble_scaling_in: bool = True ensemble_scaling_out: bool = True ensemble_bias: bool = True - scaling_init: Literal["ones", "random-signs", "normal"] = "normal" + scaling_init: Literal["ones", "random-signs", "normal"] = "ones" average_ensembles: bool = False model_type: Literal["mini", "full"] = "mini" diff --git a/mambular/configs/tabtransformer_config.py b/mambular/configs/tabtransformer_config.py index a1131c9..91fcb5e 100644 --- a/mambular/configs/tabtransformer_config.py +++ b/mambular/configs/tabtransformer_config.py @@ -74,6 +74,8 @@ class DefaultTabTransformerConfig: norm: str = "LayerNorm" activation: callable = nn.SELU() embedding_activation: callable = nn.Identity() + embedding_type: str = "linear" + embedding_bias: bool = False head_layer_sizes: list = () head_dropout: float = 0.5 head_skip_layers: bool = False diff --git a/mambular/configs/batchtabrnn_config.py b/mambular/configs/trem_config.py similarity index 97% rename from mambular/configs/batchtabrnn_config.py rename to mambular/configs/trem_config.py index 3b41218..1479a2d 100644 --- a/mambular/configs/batchtabrnn_config.py +++ b/mambular/configs/trem_config.py @@ -4,9 +4,9 @@ @dataclass -class DefaultBatchTabRNNConfig: +class DefaultTREMConfig: """ - Configuration class for the TabulaRNN model with predefined hyperparameters. 
+ Configuration class for the Tabular Recurrent Ensemble Model (TREM) Attributes ---------- From 4f915253e8aed7c6b641298ec07bd665932ec8a5 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:55:36 +0000 Subject: [PATCH 113/132] including new models and adjusting sklearn classes to hpo --- mambular/models/__init__.py | 16 +- mambular/models/cnn.py | 344 ++++++++++++++++++++ mambular/models/ftet.py | 21 ++ mambular/models/fttransformer.py | 158 +++++---- mambular/models/mambular.py | 293 +++++++++-------- mambular/models/sklearn_base_classifier.py | 77 ++--- mambular/models/sklearn_base_lss.py | 85 ++--- mambular/models/sklearn_base_regressor.py | 77 ++--- mambular/models/{batchtabrnn.py => trem.py} | 34 +- 9 files changed, 728 insertions(+), 377 deletions(-) create mode 100644 mambular/models/cnn.py create mode 100644 mambular/models/ftet.py rename mambular/models/{batchtabrnn.py => trem.py} (91%) diff --git a/mambular/models/__init__.py b/mambular/models/__init__.py index 51eac90..48b153c 100644 --- a/mambular/models/__init__.py +++ b/mambular/models/__init__.py @@ -26,7 +26,9 @@ from .ndtf import NDTFClassifier, NDTFRegressor, NDTFLSS from .node import NODEClassifier, NODERegressor, NODELSS from .tabm import TabMClassifier, TabMRegressor, TabMLSS -from .batchtabrnn import BatchTabRNNRegressor, BatchTabRNNClassifier, BatchTabRNNLSS +from .trem import TREMRegressor, TREMClassifier, TREMLSS +from .cnn import CNNRegressor, CNNClassifier, CNNLSS +from .ftet import FTETRegressor, FTETClassifier, FTETLSS __all__ = [ @@ -66,7 +68,13 @@ "TabMClassifier", "TabMRegressor", "TabMLSS", - "BatchTabRNNRegressor", - "BatchTabRNNClassifier", - "BatchTabRNNLSS", + "TREMRegressor", + "TREMClassifier", + "TREMLSS", + "CNNRegressor", + "CNNClassifier", + "CNNLSS", + "FTETRegressor", + "FTETClassifier", + "FTETLSS", ] diff --git a/mambular/models/cnn.py b/mambular/models/cnn.py new file mode 100644 index 0000000..caaf0f7 --- /dev/null +++ b/mambular/models/cnn.py @@ -0,0 +1,344 @@ +from .sklearn_base_regressor import SklearnBaseRegressor +from .sklearn_base_lss import SklearnBaseLSS +from .sklearn_base_classifier import SklearnBaseClassifier +from ..base_models import CNN +from ..configs.cnn_config import DefaultCNNConfig + + +class CNNRegressor(SklearnBaseRegressor): + """ + CNN regressor. This class extends the SklearnBaseRegressor class and uses the CNN model + with the default CNN configuration. + + The accepted arguments to the CNNRegressor class include both the attributes in the DefaultCNNConfig dataclass + and the parameters for the Preprocessor class. + + Optimizer Parameters + -------------------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which the learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 regularization) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + + Embedding Parameters + --------------------- + use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. 
+ d_model : int, default=32 + Dimensionality of the embeddings. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). + + CNN Parameters + -------------------- + input_channels : int, default=1 + Number of input channels (e.g., 1 for grayscale images). + num_layers : int, default=4 + Number of convolutional layers. + out_channels_list : list, default=(64, 64, 128, 128) + List of output channels for each convolutional layer. + kernel_size_list : list, default=(3, 3, 3, 3) + List of kernel sizes for each convolutional layer. + stride_list : list, default=(1, 1, 1, 1) + List of stride values for each convolutional layer. + padding_list : list, default=(1, 1, 1, 1) + List of padding values for each convolutional layer. + pooling_method : str, default="max" + Pooling method ('max' or 'avg'). + pooling_kernel_size_list : list, default=(2, 2, 1, 1) + List of kernel sizes for pooling layers for each convolutional layer. + pooling_stride_list : list, default=(2, 2, 1, 1) + List of stride values for pooling layers for each convolutional layer. + + Dropout Parameters + ------------------- + dropout_rate : float, default=0.5 + Probability of dropping neurons during training. + dropout_positions : list, default=None + List of indices of layers after which dropout should be applied. If None, no dropout is applied. + + Preprocessing Params + --------------------- + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the CNNRegressor class are the same as the attributes in the DefaultCNNConfig dataclass. + - CNNRegressor uses SklearnBaseRegressor as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseRegressor : The parent class for CNNRegressor. 
+ + Examples + -------- + >>> from mambular.models import CNNRegressor + >>> model = CNNRegressor(d_model=64, n_layers=8) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=CNN, config=DefaultCNNConfig, **kwargs) + + +class CNNLSS(SklearnBaseLSS): + """ + CNN regressor. This class extends the SklearnBaseLSS class and uses the CNN model + with the default CNN configuration. + + The accepted arguments to the CNNLSS class include both the attributes in the DefaultCNNConfig dataclass + and the parameters for the Preprocessor class. + + Optimizer Parameters + -------------------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which the learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 regularization) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + + Embedding Parameters + --------------------- + use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + d_model : int, default=32 + Dimensionality of the embeddings. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). + + CNN Parameters + -------------------- + input_channels : int, default=1 + Number of input channels (e.g., 1 for grayscale images). + num_layers : int, default=4 + Number of convolutional layers. + out_channels_list : list, default=(64, 64, 128, 128) + List of output channels for each convolutional layer. + kernel_size_list : list, default=(3, 3, 3, 3) + List of kernel sizes for each convolutional layer. + stride_list : list, default=(1, 1, 1, 1) + List of stride values for each convolutional layer. + padding_list : list, default=(1, 1, 1, 1) + List of padding values for each convolutional layer. + pooling_method : str, default="max" + Pooling method ('max' or 'avg'). + pooling_kernel_size_list : list, default=(2, 2, 1, 1) + List of kernel sizes for pooling layers for each convolutional layer. + pooling_stride_list : list, default=(2, 2, 1, 1) + List of stride values for pooling layers for each convolutional layer. + + Dropout Parameters + ------------------- + dropout_rate : float, default=0.5 + Probability of dropping neurons during training. + dropout_positions : list, default=None + List of indices of layers after which dropout should be applied. If None, no dropout is applied. + + Preprocessing Params + --------------------- + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. 
This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the CNNLSS class are the same as the attributes in the DefaultCNNConfig dataclass. + - CNNLSS uses SklearnBaseLSS as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseLSS : The parent class for CNNLSS. + + Examples + -------- + >>> from mambular.models import CNNLSS + >>> model = CNNLSS(d_model=64, n_layers=8) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=CNN, config=DefaultCNNConfig, **kwargs) + + +class CNNClassifier(SklearnBaseClassifier): + """ + CNN classifier. This class extends the SklearnBaseClassifier class and uses the CNN model + with the default CNN configuration. + + The accepted arguments to the CNNClassifier class include both the attributes in the DefaultCNNConfig dataclass + and the parameters for the Preprocessor class. + + Optimizer Parameters + -------------------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement after which the learning rate will be reduced. + weight_decay : float, default=1e-06 + Weight decay (L2 regularization) for the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate will be reduced. + + Embedding Parameters + --------------------- + use_embeddings : bool, default=False + Whether to use embedding layers for all features. + embedding_activation : callable, default=nn.Identity() + Activation function for embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + d_model : int, default=32 + Dimensionality of the embeddings. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). + + CNN Parameters + -------------------- + input_channels : int, default=1 + Number of input channels (e.g., 1 for grayscale images). + num_layers : int, default=4 + Number of convolutional layers. + out_channels_list : list, default=(64, 64, 128, 128) + List of output channels for each convolutional layer. + kernel_size_list : list, default=(3, 3, 3, 3) + List of kernel sizes for each convolutional layer.
+ stride_list : list, default=(1, 1, 1, 1) + List of stride values for each convolutional layer. + padding_list : list, default=(1, 1, 1, 1) + List of padding values for each convolutional layer. + pooling_method : str, default="max" + Pooling method ('max' or 'avg'). + pooling_kernel_size_list : list, default=(2, 2, 1, 1) + List of kernel sizes for pooling layers for each convolutional layer. + pooling_stride_list : list, default=(2, 2, 1, 1) + List of stride values for pooling layers for each convolutional layer. + + Dropout Parameters + ------------------- + dropout_rate : float, default=0.5 + Probability of dropping neurons during training. + dropout_positions : list, default=None + List of indices of layers after which dropout should be applied. If None, no dropout is applied. + + Preprocessing Params + --------------------- + n_bins : int, default=50 + The number of bins to use for numerical feature binning. This parameter is relevant + only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + numerical_preprocessing : str, default="ple" + The preprocessing strategy for numerical features. Valid options are + 'binning', 'one_hot', 'standardization', and 'normalization'. + use_decision_tree_bins : bool, default=False + If True, uses decision tree regression/classification to determine + optimal bin edges for numerical feature binning. This parameter is + relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy : str, default="uniform" + Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + cat_cutoff : float or int, default=0.03 + Indicates the cutoff after which integer values are treated as categorical. + If float, it's treated as a percentage. If int, it's the maximum number of + unique values for a column to be considered categorical. + treat_all_integers_as_numerical : bool, default=False + If True, all integer columns will be treated as numerical, regardless + of their unique value count or proportion. + degree : int, default=3 + The degree of the polynomial features to be used in preprocessing. + knots : int, default=12 + The number of knots to be used in spline transformations. + + Notes + ----- + - The accepted arguments to the CNNClassifier class are the same as the attributes in the DefaultCNNConfig dataclass. + - CNNClassifier uses SklearnBaseClassifier as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. + + See Also + -------- + mambular.models.SklearnBaseClassifier : The parent class for CNNClassifier.
+ + Examples + -------- + >>> from mambular.models import CNNClassifier + >>> model = CNNClassifier(d_model=64, n_layers=8) + >>> model.fit(X_train, y_train) + >>> preds = model.predict(X_test) + >>> model.evaluate(X_test, y_test) + """ + + def __init__(self, **kwargs): + super().__init__(model=CNN, config=DefaultCNNConfig, **kwargs) diff --git a/mambular/models/ftet.py b/mambular/models/ftet.py new file mode 100644 index 0000000..297e41f --- /dev/null +++ b/mambular/models/ftet.py @@ -0,0 +1,21 @@ +from .sklearn_base_regressor import SklearnBaseRegressor +from .sklearn_base_classifier import SklearnBaseClassifier +from .sklearn_base_lss import SklearnBaseLSS + +from ..base_models.ftet import FTET +from ..configs.ftet_config import DefaultFTETConfig + + +class FTETRegressor(SklearnBaseRegressor): + def __init__(self, **kwargs): + super().__init__(model=FTET, config=DefaultFTETConfig, **kwargs) + + +class FTETClassifier(SklearnBaseClassifier): + def __init__(self, **kwargs): + super().__init__(model=FTET, config=DefaultFTETConfig, **kwargs) + + +class FTETLSS(SklearnBaseLSS): + def __init__(self, **kwargs): + super().__init__(model=FTET, config=DefaultFTETConfig, **kwargs) diff --git a/mambular/models/fttransformer.py b/mambular/models/fttransformer.py index a84e448..6e0ee74 100644 --- a/mambular/models/fttransformer.py +++ b/mambular/models/fttransformer.py @@ -19,55 +19,59 @@ class FTTransformerRegressor(SklearnBaseRegressor): lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - family : str, default=None - Distributional family to be used for the model. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Weight decay (L2 regularization) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. - d_model : int, default=64 - Dimensionality of the model. - n_layers : int, default=8 - Number of layers in the transformer. - n_heads : int, default=4 + d_model : int, default=128 + Dimensionality of the transformer model. + n_layers : int, default=4 + Number of transformer layers. + n_heads : int, default=8 Number of attention heads in the transformer. - attn_dropout : float, default=0.3 + attn_dropout : float, default=0.2 Dropout rate for the attention mechanism. - ff_dropout : float, default=0.3 + ff_dropout : float, default=0.1 Dropout rate for the feed-forward layers. - norm : str, default="RMSNorm" - Normalization method to be used. + norm : str, default="LayerNorm" + Type of normalization to be used ('LayerNorm', 'RMSNorm', etc.). activation : callable, default=nn.SELU() - Activation function for the transformer. + Activation function for the transformer layers. embedding_activation : callable, default=nn.Identity() Activation function for embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in embedding layers. + head_layer_sizes : list, default=() + Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False - Whether to skip layers in the head. + Whether to use skip connections in the head layers.
head_activation : callable, default=nn.SELU() Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - pooling_method : str, default="cls" + Whether to apply layer normalization after embedding layers. + pooling_method : str, default="avg" Pooling method to be used ('cls', 'avg', etc.). + use_cls : bool, default=False + Whether to use a CLS token for pooling. norm_first : bool, default=False Whether to apply normalization before other operations in each transformer block. bias : bool, default=True - Whether to use bias in the linear layers. - transformer_activation : callable, default=nn.SELU() - Activation function for the transformer layers. + Whether to use bias in linear layers. + transformer_activation : callable, default=ReGLU() + Activation function for the transformer feed-forward layers. layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - transformer_dim_feedforward : int, default=512 + Epsilon value for layer normalization to improve numerical stability. + transformer_dim_feedforward : int, default=256 Dimensionality of the feed-forward layers in the transformer. cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Method for encoding categorical features ('int', 'one-hot', or 'linear'). n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -130,53 +134,59 @@ class FTTransformerClassifier(SklearnBaseClassifier): lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Weight decay (L2 regularization) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. - d_model : int, default=64 - Dimensionality of the model. - n_layers : int, default=8 - Number of layers in the transformer. - n_heads : int, default=4 + d_model : int, default=128 + Dimensionality of the transformer model. + n_layers : int, default=4 + Number of transformer layers. + n_heads : int, default=8 Number of attention heads in the transformer. - attn_dropout : float, default=0.3 + attn_dropout : float, default=0.2 Dropout rate for the attention mechanism. - ff_dropout : float, default=0.3 + ff_dropout : float, default=0.1 Dropout rate for the feed-forward layers. - norm : str, default="RMSNorm" - Normalization method to be used. + norm : str, default="LayerNorm" + Type of normalization to be used ('LayerNorm', 'RMSNorm', etc.). activation : callable, default=nn.SELU() - Activation function for the transformer. + Activation function for the transformer layers. embedding_activation : callable, default=nn.Identity() Activation function for embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in embedding layers. 
+ head_layer_sizes : list, default=() + Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False - Whether to skip layers in the head. + Whether to use skip connections in the head layers. head_activation : callable, default=nn.SELU() Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - pooling_method : str, default="cls" + Whether to apply layer normalization after embedding layers. + pooling_method : str, default="avg" Pooling method to be used ('cls', 'avg', etc.). + use_cls : bool, default=False + Whether to use a CLS token for pooling. norm_first : bool, default=False Whether to apply normalization before other operations in each transformer block. bias : bool, default=True - Whether to use bias in the linear layers. - transformer_activation : callable, default=nn.SELU() - Activation function for the transformer layers. + Whether to use bias in linear layers. + transformer_activation : callable, default=ReGLU() + Activation function for the transformer feed-forward layers. layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - transformer_dim_feedforward : int, default=512 + Epsilon value for layer normalization to improve numerical stability. + transformer_dim_feedforward : int, default=256 Dimensionality of the feed-forward layers in the transformer. cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Method for encoding categorical features ('int', 'one-hot', or 'linear'). n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -239,53 +249,59 @@ class FTTransformerLSS(SklearnBaseLSS): lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. + Weight decay (L2 regularization) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. - d_model : int, default=64 - Dimensionality of the model. - n_layers : int, default=8 - Number of layers in the transformer. - n_heads : int, default=4 + d_model : int, default=128 + Dimensionality of the transformer model. + n_layers : int, default=4 + Number of transformer layers. + n_heads : int, default=8 Number of attention heads in the transformer. - attn_dropout : float, default=0.3 + attn_dropout : float, default=0.2 Dropout rate for the attention mechanism. - ff_dropout : float, default=0.3 + ff_dropout : float, default=0.1 Dropout rate for the feed-forward layers. - norm : str, default="RMSNorm" - Normalization method to be used. + norm : str, default="LayerNorm" + Type of normalization to be used ('LayerNorm', 'RMSNorm', etc.). activation : callable, default=nn.SELU() - Activation function for the transformer. + Activation function for the transformer layers. embedding_activation : callable, default=nn.Identity() Activation function for embeddings. 
- head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in embedding layers. + head_layer_sizes : list, default=() + Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False - Whether to skip layers in the head. + Whether to use skip connections in the head layers. head_activation : callable, default=nn.SELU() Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - pooling_method : str, default="cls" + Whether to apply layer normalization after embedding layers. + pooling_method : str, default="avg" Pooling method to be used ('cls', 'avg', etc.). + use_cls : bool, default=False + Whether to use a CLS token for pooling. norm_first : bool, default=False Whether to apply normalization before other operations in each transformer block. bias : bool, default=True - Whether to use bias in the linear layers. - transformer_activation : callable, default=nn.SELU() - Activation function for the transformer layers. + Whether to use bias in linear layers. + transformer_activation : callable, default=ReGLU() + Activation function for the transformer feed-forward layers. layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - transformer_dim_feedforward : int, default=512 + Epsilon value for layer normalization to improve numerical stability. + transformer_dim_feedforward : int, default=256 Dimensionality of the feed-forward layers in the transformer. cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Method for encoding categorical features ('int', 'one-hot', or 'linear'). n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. diff --git a/mambular/models/mambular.py b/mambular/models/mambular.py index 1d4c7ae..4fbd4b0 100644 --- a/mambular/models/mambular.py +++ b/mambular/models/mambular.py @@ -13,52 +13,73 @@ class MambularRegressor(SklearnBaseRegressor): The accepted arguments to the MambularRegressor class include both the attributes in the DefaultMambularConfig dataclass and the parameters for the Preprocessor class. - Parameters - ---------- + Optimizer Parameters + -------------------- lr : float, default=1e-04 Learning rate for the optimizer. lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. + Number of epochs with no improvement after which the learning rate will be reduced. weight_decay : float, default=1e-06 Weight decay (L2 penalty) for the optimizer. lr_factor : float, default=0.1 Factor by which the learning rate will be reduced. + + Mambular Model Parameters + ----------------------- d_model : int, default=64 Dimensionality of the model. - n_layers : int, default=8 + n_layers : int, default=4 Number of layers in the model. expand_factor : int, default=2 Expansion factor for the feed-forward layers. bias : bool, default=False Whether to use bias in the linear layers. - d_conv : int, default=16 - Dimensionality of the convolutional layers. 
- conv_bias : bool, default=True - Whether to use bias in the convolutional layers. - dropout : float, default=0.05 + dropout : float, default=0.0 Dropout rate for regularization. dt_rank : str, default="auto" - Rank of the decision tree. - d_state : int, default=32 + Rank of the decision tree used in the model. + d_state : int, default=128 Dimensionality of the state in recurrent layers. dt_scale : float, default=1.0 - Scaling factor for decision tree. + Scaling factor for decision tree parameters. dt_init : str, default="random" - Initialization method for decision tree. + Initialization method for decision tree parameters. dt_max : float, default=0.1 Maximum value for decision tree initialization. dt_min : float, default=1e-04 Minimum value for decision tree initialization. dt_init_floor : float, default=1e-04 Floor value for decision tree initialization. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() + norm : str, default="LayerNorm" + Type of normalization used ('LayerNorm', 'RMSNorm', etc.). + activation : callable, default=nn.SiLU() Activation function for the model. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + Whether weight decay is applied to A-D matrices. + BC_layer_norm : bool, default=False + Whether to apply layer normalization to B-C matrices. + + Embedding Parameters + --------------------- embedding_activation : callable, default=nn.Identity() Activation function for embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + embedding_type : str, default="linear" + Type of embedding to use ('linear', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + shuffle_embeddings : bool, default=False + Whether to shuffle embeddings before being passed to Mamba layers. + cat_encoding : str, default="int" + Encoding method for categorical features ('int', 'one-hot', etc.). + + Head Parameters + --------------- + head_layer_sizes : list, default=() + Sizes of the layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False @@ -67,30 +88,27 @@ class MambularRegressor(SklearnBaseRegressor): Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. + + Additional Features + -------------------- pooling_method : str, default="avg" - Pooling method to be used ('avg', 'max', etc.). + Pooling method to use ('avg', 'max', etc.). bidirectional : bool, default=False - Whether to use bidirectional processing of the input sequences. + Whether to process data bidirectionally. use_learnable_interaction : bool, default=False - Whether to use learnable feature interactions before passing through mamba blocks. - use_cls : bool, default=True - Whether to append a cls to the end of each 'sequence'. - shuffle_embeddings : bool, default=False. - Whether to shuffle the embeddings before being passed to the Mamba layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - AD_weight_decay : bool, default=True - whether weight decay is also applied to A-D matrices. 
- BC_layer_norm: bool, default=False - whether to apply layer normalization to B-C matrices. - cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Whether to use learnable feature interactions before passing through Mamba blocks. + use_cls : bool, default=False + Whether to append a CLS token to the input sequences. use_pscan : bool, default=False - whether to use pscan for the ssm + Whether to use PSCAN for the state-space model. + + Mamba Version + ------------- mamba_version : str, default="mamba-torch" - options are "mamba-torch", "mamba1" and "mamba2" + Version of the Mamba model to use ('mamba-torch', 'mamba1', 'mamba2'). + + Preprocessing Params + --------------------- n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -146,52 +164,62 @@ class MambularClassifier(SklearnBaseClassifier): The accepted arguments to the MambularClassifier class include both the attributes in the DefaultMambularConfig dataclass and the parameters for the Preprocessor class. - Parameters - ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. + Mambular Model Parameters + ----------------------- d_model : int, default=64 Dimensionality of the model. - n_layers : int, default=8 + n_layers : int, default=4 Number of layers in the model. expand_factor : int, default=2 Expansion factor for the feed-forward layers. bias : bool, default=False Whether to use bias in the linear layers. - d_conv : int, default=16 - Dimensionality of the convolutional layers. - conv_bias : bool, default=True - Whether to use bias in the convolutional layers. - dropout : float, default=0.05 + dropout : float, default=0.0 Dropout rate for regularization. dt_rank : str, default="auto" - Rank of the decision tree. - d_state : int, default=32 + Rank of the decision tree used in the model. + d_state : int, default=128 Dimensionality of the state in recurrent layers. dt_scale : float, default=1.0 - Scaling factor for decision tree. + Scaling factor for decision tree parameters. dt_init : str, default="random" - Initialization method for decision tree. + Initialization method for decision tree parameters. dt_max : float, default=0.1 Maximum value for decision tree initialization. dt_min : float, default=1e-04 Minimum value for decision tree initialization. dt_init_floor : float, default=1e-04 Floor value for decision tree initialization. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() + norm : str, default="LayerNorm" + Type of normalization used ('LayerNorm', 'RMSNorm', etc.). + activation : callable, default=nn.SiLU() Activation function for the model. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + Whether weight decay is applied to A-D matrices. + BC_layer_norm : bool, default=False + Whether to apply layer normalization to B-C matrices. + + Embedding Parameters + --------------------- embedding_activation : callable, default=nn.Identity() Activation function for embeddings. 
- head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + embedding_type : str, default="linear" + Type of embedding to use ('linear', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + shuffle_embeddings : bool, default=False + Whether to shuffle embeddings before being passed to Mamba layers. + cat_encoding : str, default="int" + Encoding method for categorical features ('int', 'one-hot', etc.). + + Head Parameters + --------------- + head_layer_sizes : list, default=() + Sizes of the layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False @@ -200,28 +228,27 @@ class MambularClassifier(SklearnBaseClassifier): Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. + + Additional Features + -------------------- pooling_method : str, default="avg" - Pooling method to be used ('avg', 'max', etc.). + Pooling method to use ('avg', 'max', etc.). bidirectional : bool, default=False - Whether to use bidirectional processing of the input sequences. + Whether to process data bidirectionally. use_learnable_interaction : bool, default=False - Whether to use learnable feature interactions before passing through mamba blocks. - shuffle_embeddings : bool, default=False. - Whether to shuffle the embeddings before being passed to the Mamba layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - AD_weight_decay : bool, default=True - whether weight decay is also applied to A-D matrices. - BC_layer_norm: bool, default=False - whether to apply layer normalization to B-C matrices. - cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Whether to use learnable feature interactions before passing through Mamba blocks. + use_cls : bool, default=False + Whether to append a CLS token to the input sequences. use_pscan : bool, default=False - whether to use pscan for the ssm + Whether to use PSCAN for the state-space model. + + Mamba Version + ------------- mamba_version : str, default="mamba-torch" - options are "mamba-torch", "mamba1" and "mamba2" + Version of the Mamba model to use ('mamba-torch', 'mamba1', 'mamba2'). + + Preprocessing Params + --------------------- n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. @@ -277,54 +304,62 @@ class MambularLSS(SklearnBaseLSS): The accepted arguments to the MambularLSS class include both the attributes in the DefaultMambularConfig dataclass and the parameters for the Preprocessor class. - Parameters - ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - family : str, default=None - Distributional family to be used for the model. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. 
+ Mambular Model Parameters + ----------------------- d_model : int, default=64 Dimensionality of the model. - n_layers : int, default=8 + n_layers : int, default=4 Number of layers in the model. expand_factor : int, default=2 Expansion factor for the feed-forward layers. bias : bool, default=False Whether to use bias in the linear layers. - d_conv : int, default=16 - Dimensionality of the convolutional layers. - conv_bias : bool, default=True - Whether to use bias in the convolutional layers. - dropout : float, default=0.05 + dropout : float, default=0.0 Dropout rate for regularization. dt_rank : str, default="auto" - Rank of the decision tree. - d_state : int, default=32 + Rank of the decision tree used in the model. + d_state : int, default=128 Dimensionality of the state in recurrent layers. dt_scale : float, default=1.0 - Scaling factor for decision tree. + Scaling factor for decision tree parameters. dt_init : str, default="random" - Initialization method for decision tree. + Initialization method for decision tree parameters. dt_max : float, default=0.1 Maximum value for decision tree initialization. dt_min : float, default=1e-04 Minimum value for decision tree initialization. dt_init_floor : float, default=1e-04 Floor value for decision tree initialization. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() + norm : str, default="LayerNorm" + Type of normalization used ('LayerNorm', 'RMSNorm', etc.). + activation : callable, default=nn.SiLU() Activation function for the model. + layer_norm_eps : float, default=1e-05 + Epsilon value for layer normalization. + AD_weight_decay : bool, default=True + Whether weight decay is applied to A-D matrices. + BC_layer_norm : bool, default=False + Whether to apply layer normalization to B-C matrices. + + Embedding Parameters + --------------------- embedding_activation : callable, default=nn.Identity() Activation function for embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. + embedding_type : str, default="linear" + Type of embedding to use ('linear', etc.). + embedding_bias : bool, default=False + Whether to use bias in the embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding. + shuffle_embeddings : bool, default=False + Whether to shuffle embeddings before being passed to Mamba layers. + cat_encoding : str, default="int" + Encoding method for categorical features ('int', 'one-hot', etc.). + + Head Parameters + --------------- + head_layer_sizes : list, default=() + Sizes of the layers in the model's head. head_dropout : float, default=0.5 Dropout rate for the head layers. head_skip_layers : bool, default=False @@ -333,41 +368,30 @@ class MambularLSS(SklearnBaseLSS): Activation function for the head layers. head_use_batch_norm : bool, default=False Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. + + Additional Features + -------------------- pooling_method : str, default="avg" - Pooling method to be used ('avg', 'max', etc.). + Pooling method to use ('avg', 'max', etc.). bidirectional : bool, default=False - Whether to use bidirectional processing of the input sequences. + Whether to process data bidirectionally. use_learnable_interaction : bool, default=False - Whether to use learnable feature interactions before passing through mamba blocks. 
- shuffle_embeddings : bool, default=False. - Whether to shuffle the embeddings before being passed to the Mamba layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - AD_weight_decay : bool, default=True - whether weight decay is also applied to A-D matrices. - BC_layer_norm: bool, default=False - whether to apply layer normalization to B-C matrices. - cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. + Whether to use learnable feature interactions before passing through Mamba blocks. + use_cls : bool, default=False + Whether to append a CLS token to the input sequences. use_pscan : bool, default=False - whether to use pscan for the ssm + Whether to use PSCAN for the state-space model. + + Mamba Version + ------------- mamba_version : str, default="mamba-torch" - options are "mamba-torch", "mamba1" and "mamba2" + Version of the Mamba model to use ('mamba-torch', 'mamba1', 'mamba2'). + + Preprocessing Params + --------------------- n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - shuffle_embeddings : bool, default=False. - Whether to shuffle the embeddings before being passed to the Mamba layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - AD_weight_decay : bool, default=True - whether weight decay is also applied to A-D matrices. - BC_layer_norm: bool, default=False - whether to apply layer normalization to B-C matrices. - cat_encoding : str, default="int" - whether to use integer encoding or one-hot encoding for cat features. numerical_preprocessing : str, default="ple" The preprocessing strategy for numerical features. Valid options are 'binning', 'one_hot', 'standardization', and 'normalization'. @@ -378,9 +402,6 @@ class MambularLSS(SklearnBaseLSS): binning_strategy : str, default="uniform" Defines the strategy for binning numerical features. Options include 'uniform', 'quantile', or other sklearn-compatible strategies. - task : str, default="regression" - Indicates the type of machine learning task ('regression' or 'classification'). This can - influence certain preprocessing behaviors, especially when using decision tree-based binning as ple. cat_cutoff : float or int, default=0.03 Indicates the cutoff after which integer values are treated as categorical. If float, it's treated as a percentage. If int, it's the maximum number of diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 022a072..1fd64be 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -341,53 +341,26 @@ def fit( The fitted classifier. 
""" if rebuild: - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - if isinstance(y, pd.Series): - y = y.values - if X_val is not None: - if not isinstance(X_val, pd.DataFrame): - X_val = pd.DataFrame(X_val) - if isinstance(y_val, pd.Series): - y_val = y_val.values - - self.data_module = MambularDataModule( - preprocessor=self.preprocessor, - batch_size=batch_size, - shuffle=shuffle, + self.build_model( + X=X, + y=y, + val_size=val_size, X_val=X_val, y_val=y_val, - val_size=val_size, random_state=random_state, - regression=False, - **dataloader_kwargs, - ) - - self.data_module.preprocess_data( - X, y, X_val, y_val, val_size=val_size, random_state=random_state + batch_size=batch_size, + shuffle=shuffle, + lr=lr, + lr_patience=lr_patience, + lr_factor=lr_factor, + weight_decay=weight_decay, + dataloader_kwargs=dataloader_kwargs, ) - num_classes = len(np.unique(y)) - - self.task_model = TaskModel( - model_class=self.base_model, - num_classes=num_classes, - config=self.config, - cat_feature_info=self.data_module.cat_feature_info, - num_feature_info=self.data_module.num_feature_info, - lr_patience=( - lr_patience if lr_patience is not None else self.config.lr_patience - ), - lr=lr if lr is not None else self.config.lr, - lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=( - weight_decay - if weight_decay is not None - else self.config.weight_decay - ), - optimizer_type=self.optimizer_type, - optimizer_args=self.optimizer_kwargs, - ) + else: + assert ( + self.built + ), "The model must be built before calling the fit method. Either call .build_model() or set rebuild=True" early_stop_callback = EarlyStopping( monitor=monitor, min_delta=0.00, patience=patience, verbose=False, mode=mode @@ -705,6 +678,7 @@ def _objective(hyperparams): nonlocal best_val_loss, best_epoch_val_loss # Access across trials head_layer_sizes = [] + head_layer_size_length = None for key, param_value in zip(param_names, hyperparams): if key == "head_layer_size_length": @@ -733,8 +707,6 @@ def _objective(hyperparams): head_layer_sizes[:head_layer_size_length], ) - print(head_layer_sizes) - # Build the model with updated hyperparameters self.build_model( X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs @@ -787,14 +759,19 @@ def _objective(hyperparams): # Update the model with the best-found hyperparameters best_hparams = result.x - if "head_layer_sizes" in self.config.__dataclass_fields__: - head_layer_sizes = [] + head_layer_sizes = ( + [] if "head_layer_sizes" in self.config.__dataclass_fields__ else None + ) + layer_sizes = [] if "layer_sizes" in self.config.__dataclass_fields__ else None # Iterate over the best hyperparameters found by optimization for key, param_value in zip(param_names, best_hparams): - if key.startswith("head_layer_size_"): + if key.startswith("head_layer_size_") and head_layer_sizes is not None: # These are the individual head layer sizes head_layer_sizes.append(round_to_nearest_16(param_value)) + elif key.startswith("layer_size_") and layer_sizes is not None: + # These are the individual layer sizes + layer_sizes.append(round_to_nearest_16(param_value)) else: # For all other config values, update normally field_type = self.config.__dataclass_fields__[key].type @@ -803,9 +780,11 @@ def _objective(hyperparams): else: setattr(self.config, key, param_value) - # After the loop, set head_layer_sizes in the config - if head_layer_sizes: + # After the loop, set head_layer_sizes or layer_sizes in the config + if head_layer_sizes is not None 
and head_layer_sizes: setattr(self.config, "head_layer_sizes", head_layer_sizes) + if layer_sizes is not None and layer_sizes: + setattr(self.config, "layer_sizes", layer_sizes) print("Best hyperparameters found:", best_hparams) diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index 6631eed..5c9494a 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -308,6 +308,7 @@ def fit( checkpoint_path="model_checkpoints", distributional_kwargs=None, dataloader_kwargs={}, + rebuild=True, **trainer_kwargs, ): """ @@ -384,51 +385,27 @@ def fit( else: raise ValueError("Unsupported family: {}".format(family)) - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - if isinstance(y, pd.Series): - y = y.values - if X_val is not None: - if not isinstance(X_val, pd.DataFrame): - X_val = pd.DataFrame(X_val) - if isinstance(y_val, pd.Series): - y_val = y_val.values - - self.data_module = MambularDataModule( - preprocessor=self.preprocessor, - batch_size=batch_size, - shuffle=shuffle, - X_val=X_val, - y_val=y_val, - val_size=val_size, - random_state=random_state, - regression=True, - **dataloader_kwargs, - ) - - self.data_module.preprocess_data( - X, y, X_val, y_val, val_size=val_size, random_state=random_state - ) + if rebuild: + self.build_model( + X=X, + y=y, + val_size=val_size, + X_val=X_val, + y_val=y_val, + random_state=random_state, + batch_size=batch_size, + shuffle=shuffle, + lr=lr, + lr_patience=lr_patience, + lr_factor=lr_factor, + weight_decay=weight_decay, + dataloader_kwargs=dataloader_kwargs, + ) - self.task_model = TaskModel( - model_class=self.base_model, - num_classes=self.family.param_count, - family=self.family, - config=self.config, - cat_feature_info=self.data_module.cat_feature_info, - num_feature_info=self.data_module.num_feature_info, - lr=lr if lr is not None else self.config.lr, - lr_patience=( - lr_patience if lr_patience is not None else self.config.lr_patience - ), - lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=( - weight_decay if weight_decay is not None else self.config.weight_decay - ), - lss=True, - optimizer_type=self.optimizer_type, - optimizer_args=self.optimizer_kwargs, - ) + else: + assert ( + self.built + ), "The model must be built before calling the fit method. 
Either call .build_model() or set rebuild=True" early_stop_callback = EarlyStopping( monitor=monitor, min_delta=0.00, patience=patience, verbose=False, mode=mode @@ -689,6 +666,7 @@ def _objective(hyperparams): nonlocal best_val_loss, best_epoch_val_loss # Access across trials head_layer_sizes = [] + head_layer_size_length = None for key, param_value in zip(param_names, hyperparams): if key == "head_layer_size_length": @@ -717,8 +695,6 @@ def _objective(hyperparams): head_layer_sizes[:head_layer_size_length], ) - print(head_layer_sizes) - # Build the model with updated hyperparameters self.build_model( X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs @@ -769,14 +745,19 @@ def _objective(hyperparams): # Update the model with the best-found hyperparameters best_hparams = result.x - if "head_layer_sizes" in self.config.__dataclass_fields__: - head_layer_sizes = [] + head_layer_sizes = ( + [] if "head_layer_sizes" in self.config.__dataclass_fields__ else None + ) + layer_sizes = [] if "layer_sizes" in self.config.__dataclass_fields__ else None # Iterate over the best hyperparameters found by optimization for key, param_value in zip(param_names, best_hparams): - if key.startswith("head_layer_size_"): + if key.startswith("head_layer_size_") and head_layer_sizes is not None: # These are the individual head layer sizes head_layer_sizes.append(round_to_nearest_16(param_value)) + elif key.startswith("layer_size_") and layer_sizes is not None: + # These are the individual layer sizes + layer_sizes.append(round_to_nearest_16(param_value)) else: # For all other config values, update normally field_type = self.config.__dataclass_fields__[key].type @@ -785,9 +766,11 @@ def _objective(hyperparams): else: setattr(self.config, key, param_value) - # After the loop, set head_layer_sizes in the config - if head_layer_sizes: + # After the loop, set head_layer_sizes or layer_sizes in the config + if head_layer_sizes is not None and head_layer_sizes: setattr(self.config, "head_layer_sizes", head_layer_sizes) + if layer_sizes is not None and layer_sizes: + setattr(self.config, "layer_sizes", layer_sizes) print("Best hyperparameters found:", best_hparams) diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 8bc3048..b2072e4 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -337,53 +337,26 @@ def fit( The fitted regressor. 
""" if rebuild: - if not isinstance(X, pd.DataFrame): - X = pd.DataFrame(X) - if isinstance(y, pd.Series): - y = y.values - if X_val is not None: - if not isinstance(X_val, pd.DataFrame): - X_val = pd.DataFrame(X_val) - if isinstance(y_val, pd.Series): - y_val = y_val.values - - self.data_module = MambularDataModule( - preprocessor=self.preprocessor, - batch_size=batch_size, - shuffle=shuffle, + self.build_model( + X=X, + y=y, + val_size=val_size, X_val=X_val, y_val=y_val, - val_size=val_size, random_state=random_state, - regression=True, - **dataloader_kwargs, - ) - - self.data_module.preprocess_data( - X, y, X_val, y_val, val_size=val_size, random_state=random_state - ) - - self.task_model = TaskModel( - model_class=self.base_model, - config=self.config, - cat_feature_info=self.data_module.cat_feature_info, - num_feature_info=self.data_module.num_feature_info, - lr=lr if lr is not None else self.config.lr, - lr_patience=( - lr_patience if lr_patience is not None else self.config.lr_patience - ), - lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=( - weight_decay - if weight_decay is not None - else self.config.weight_decay - ), - optimizer_type=self.optimizer_type, - optimizer_args=self.optimizer_kwargs, + batch_size=batch_size, + shuffle=shuffle, + lr=lr, + lr_patience=lr_patience, + lr_factor=lr_factor, + weight_decay=weight_decay, + dataloader_kwargs=dataloader_kwargs, ) else: - assert self.built, "The model must be built before calling the fit method." + assert ( + self.built + ), "The model must be built before calling the fit method. Either call .build_model() or set rebuild=True" early_stop_callback = EarlyStopping( monitor=monitor, min_delta=0.00, patience=patience, verbose=False, mode=mode @@ -440,7 +413,7 @@ def predict(self, X, device=None): # Move tensors to appropriate device if device is None: - qdevice = next(self.task_model.parameters()).device + device = next(self.task_model.parameters()).device if isinstance(cat_tensors, list): cat_tensors = [tensor.to(device) for tensor in cat_tensors] else: @@ -605,6 +578,7 @@ def _objective(hyperparams): nonlocal best_val_loss, best_epoch_val_loss # Access across trials head_layer_sizes = [] + head_layer_size_length = None for key, param_value in zip(param_names, hyperparams): if key == "head_layer_size_length": @@ -633,8 +607,6 @@ def _objective(hyperparams): head_layer_sizes[:head_layer_size_length], ) - print(head_layer_sizes) - # Build the model with updated hyperparameters self.build_model( X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs @@ -687,14 +659,19 @@ def _objective(hyperparams): # Update the model with the best-found hyperparameters best_hparams = result.x - if "head_layer_sizes" in self.config.__dataclass_fields__: - head_layer_sizes = [] + head_layer_sizes = ( + [] if "head_layer_sizes" in self.config.__dataclass_fields__ else None + ) + layer_sizes = [] if "layer_sizes" in self.config.__dataclass_fields__ else None # Iterate over the best hyperparameters found by optimization for key, param_value in zip(param_names, best_hparams): - if key.startswith("head_layer_size_"): + if key.startswith("head_layer_size_") and head_layer_sizes is not None: # These are the individual head layer sizes head_layer_sizes.append(round_to_nearest_16(param_value)) + elif key.startswith("layer_size_") and layer_sizes is not None: + # These are the individual layer sizes + layer_sizes.append(round_to_nearest_16(param_value)) else: # For all other config values, update normally 
field_type = self.config.__dataclass_fields__[key].type @@ -703,9 +680,11 @@ def _objective(hyperparams): else: setattr(self.config, key, param_value) - # After the loop, set head_layer_sizes in the config - if head_layer_sizes: + # After the loop, set head_layer_sizes or layer_sizes in the config + if head_layer_sizes is not None and head_layer_sizes: setattr(self.config, "head_layer_sizes", head_layer_sizes) + if layer_sizes is not None and layer_sizes: + setattr(self.config, "layer_sizes", layer_sizes) print("Best hyperparameters found:", best_hparams) diff --git a/mambular/models/batchtabrnn.py b/mambular/models/trem.py similarity index 91% rename from mambular/models/batchtabrnn.py rename to mambular/models/trem.py index 2f3b711..af182ab 100644 --- a/mambular/models/batchtabrnn.py +++ b/mambular/models/trem.py @@ -2,16 +2,16 @@ from .sklearn_base_classifier import SklearnBaseClassifier from .sklearn_base_lss import SklearnBaseLSS -from ..base_models.batch_tabrnn import BatchTabRNN -from ..configs.batchtabrnn_config import DefaultBatchTabRNNConfig +from ..base_models.trem import TREM +from ..configs.trem_config import DefaultTREMConfig -class BatchTabRNNRegressor(SklearnBaseRegressor): +class TREMRegressor(SklearnBaseRegressor): """ - RNN regressor. This class extends the SklearnBaseRegressor class and uses the BatchTabRNN model - with the default BatchTabRNN configuration. + RNN regressor. This class extends the SklearnBaseRegressor class and uses the TREM model + with the default TREM configuration. - The accepted arguments to the BatchTabRNNRegressor class include both the attributes in the DefaultBatchTabRNNConfig dataclass + The accepted arguments to the TREMRegressor class include both the attributes in the DefaultTREMConfig dataclass and the parameters for the Preprocessor class. Parameters @@ -87,15 +87,15 @@ class BatchTabRNNRegressor(SklearnBaseRegressor): """ def __init__(self, **kwargs): - super().__init__(model=BatchTabRNN, config=DefaultBatchTabRNNConfig, **kwargs) + super().__init__(model=TREM, config=DefaultTREMConfig, **kwargs) -class BatchTabRNNClassifier(SklearnBaseClassifier): +class TREMClassifier(SklearnBaseClassifier): """ - RNN classifier. This class extends the SklearnBaseClassifier class and uses the BatchTabRNN model - with the default BatchTabRNN configuration. + RNN classifier. This class extends the SklearnBaseClassifier class and uses the TREM model + with the default TREM configuration. - The accepted arguments to the BatchTabRNNClassifier class include both the attributes in the DefaultBatchTabRNNConfig dataclass + The accepted arguments to the TREMClassifier class include both the attributes in the DefaultTREMConfig dataclass and the parameters for the Preprocessor class. Parameters @@ -171,15 +171,15 @@ class BatchTabRNNClassifier(SklearnBaseClassifier): """ def __init__(self, **kwargs): - super().__init__(model=BatchTabRNN, config=DefaultBatchTabRNNConfig, **kwargs) + super().__init__(model=TREM, config=DefaultTREMConfig, **kwargs) -class BatchTabRNNLSS(SklearnBaseLSS): +class TREMLSS(SklearnBaseLSS): """ - RNN LSS. This class extends the SklearnBaseLSS class and uses the BatchTabRNN model - with the default BatchTabRNN configuration. + RNN LSS. This class extends the SklearnBaseLSS class and uses the TREM model + with the default TREM configuration. 
- The accepted arguments to the BatchTabRNNLSS class include both the attributes in the DefaultBatchTabRNNConfig dataclass + The accepted arguments to the TREMLSS class include both the attributes in the DefaultTREMConfig dataclass and the parameters for the Preprocessor class. Parameters @@ -257,4 +257,4 @@ class BatchTabRNNLSS(SklearnBaseLSS): """ def __init__(self, **kwargs): - super().__init__(model=BatchTabRNN, config=DefaultBatchTabRNNConfig, **kwargs) + super().__init__(model=TREM, config=DefaultTREMConfig, **kwargs) From fe3c1d76d5686253484ada777511c555b85dd5e6 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:55:48 +0000 Subject: [PATCH 114/132] add verbose to preprocessing --- mambular/preprocessing/preprocessor.py | 33 +++++++++++++++----------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py index c485f88..1787c2e 100644 --- a/mambular/preprocessing/preprocessor.py +++ b/mambular/preprocessing/preprocessor.py @@ -417,7 +417,7 @@ def fit_transform(self, X, y=None): self.fitted = True return self.transform(X) - def get_feature_info(self): + def get_feature_info(self, verbose=True): """ Retrieves information about how features are encoded within the model's preprocessor. This method identifies the type of encoding applied to each feature, categorizing them into binned or ordinal @@ -467,24 +467,27 @@ def get_feature_info(self): other_encoding_info[ feature_name ] = n_bins # Number of bins before one-hot encoding - print( - f"Numerical Feature (Discretized & One-Hot Encoded): {feature_name}, Number of bins before one-hot encoding: {n_bins}" - ) + if verbose: + print( + f"Numerical Feature (Discretized & One-Hot Encoded): {feature_name}, Number of bins before one-hot encoding: {n_bins}" + ) else: # Only discretization without subsequent one-hot encoding binned_or_ordinal_info[feature_name] = n_bins - print( - f"Numerical Feature (Binned): {feature_name}, Number of bins: {n_bins}" - ) + if verbose: + print( + f"Numerical Feature (Binned): {feature_name}, Number of bins: {n_bins}" + ) # Handle features processed with continuous ordinal encoding elif "continuous_ordinal" in steps: step = transformer_pipeline.named_steps["continuous_ordinal"] n_categories = len(step.mapping_[columns.index(feature_name)]) binned_or_ordinal_info[feature_name] = n_categories - print( - f"Categorical Feature (Ordinal Encoded): {feature_name}, Number of unique categories: {n_categories}" - ) + if verbose: + print( + f"Categorical Feature (Ordinal Encoded): {feature_name}, Number of unique categories: {n_categories}" + ) # Handle other numerical feature encodings else: @@ -496,10 +499,12 @@ def get_feature_info(self): np.zeros((1, len(columns))) ) other_encoding_info[feature_name] = transformed_feature.shape[1] - print( - f"Feature: {feature_name} ({step_descriptions}), Encoded feature dimension: {transformed_feature.shape[1]}" - ) + if verbose: + print( + f"Feature: {feature_name} ({step_descriptions}), Encoded feature dimension: {transformed_feature.shape[1]}" + ) - print("-" * 50) + if verbose: + print("-" * 50) return binned_or_ordinal_info, other_encoding_info From c0a62ca7627d122b8080bff6b178df393f2b1ee6 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 09:55:55 +0000 Subject: [PATCH 115/132] . 
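
This patch adds the TREM base model (mambular/base_models/trem.py) that backs the
sklearn-style TREM estimators renamed from BatchTabRNN earlier in this series. A
minimal usage sketch of the wrapper API follows; it is illustrative only: X_train,
y_train and X_test are placeholder arrays, the keyword values are examples, and the
import path assumes the module layout shown in the diffs below.

    # Sketch, not part of the patch: relies on the sklearn-style fit/predict API
    # provided by SklearnBaseRegressor and the TREMRegressor wrapper.
    from mambular.models.trem import TREMRegressor

    model = TREMRegressor()                     # DefaultTREMConfig attributes and
                                                # Preprocessor parameters are accepted as kwargs
    model.fit(X_train, y_train, max_epochs=50)  # preprocesses the data, builds the task model, trains
    preds = model.predict(X_test)               # predictions from the fitted model

The same pattern applies to TREMClassifier and TREMLSS.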
--- mambular/base_models/trem.py | 122 +++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 mambular/base_models/trem.py diff --git a/mambular/base_models/trem.py b/mambular/base_models/trem.py new file mode 100644 index 0000000..5330e0c --- /dev/null +++ b/mambular/base_models/trem.py @@ -0,0 +1,122 @@ +import torch +import torch.nn as nn +from ..arch_utils.layer_utils.sn_linear import SNLinear +from ..configs import DefaultTREMConfig +from .basemodel import BaseModel +from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..arch_utils.rnn_utils import EnsembleConvRNN +from ..arch_utils.get_norm_fn import get_normalization_layer +from dataclasses import replace + + +class TREM(BaseModel): + """ + Tabular Recurrent Ensemble Model (TREM) + A batch ensemble model combining RNN and tabular data handling for multivariate time series or sequential tabular data. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features, including their names and dimensions. + num_feature_info : dict + Dictionary containing information about numerical features, including their names and dimensions. + num_classes : int, optional + The number of output classes or target dimensions for regression, by default 1. + config : DefaultTREMConfig, optional + Configuration object containing model hyperparameters such as dropout rates, hidden layer sizes, ensemble settings, + and other architectural configurations, by default DefaultTREMConfig(). + **kwargs : dict + Additional keyword arguments for the BaseModel class. + + Attributes + ---------- + cat_feature_info : dict + Stores categorical feature information. + num_feature_info : dict + Stores numerical feature information. + pooling_method : str + The pooling method to aggregate sequence or ensemble features, specified in config. + ensemble_first : bool + Flag indicating if ensembles should be processed before pooling over the sequence. + embedding_layer : EmbeddingLayer + Layer for embedding categorical and numerical features. + rnn : EnsembleConvRNN + Ensemble RNN layer for processing sequential data. + tabular_head : MLPhead + MLPhead layer to produce the final prediction based on the output of the RNN and pooling layers. + linear : nn.Linear + Linear transformation layer for projecting features into a different dimension. + norm_f : nn.Module + Normalization layer. + ensemble_linear : nn.Linear, optional + Linear layer to learn a weighted combination of ensemble outputs, if configured. + + Methods + ------- + forward(num_features, cat_features) + Perform a forward pass through the model, including embedding, RNN, pooling, and prediction steps. 
+ + """ + + def __init__( + self, + cat_feature_info, + num_feature_info, + num_classes=1, + config: DefaultTREMConfig = DefaultTREMConfig(), + **kwargs, + ): + super().__init__(config=config, **kwargs) + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + if not self.hparams.average_ensembles: + self.returns_ensemble = True # Directly set ensemble flag + else: + self.returns_ensemble = False + + self.cat_feature_info = cat_feature_info + self.num_feature_info = num_feature_info + + self.embedding_layer = EmbeddingLayer( + num_feature_info=num_feature_info, + cat_feature_info=cat_feature_info, + config=config, + ) + self.rnn = EnsembleConvRNN(config=config) + + temp_config = replace(config, d_model=config.dim_feedforward) + self.norm_f = get_normalization_layer(temp_config) + + if self.hparams.average_ensembles: + self.final_layer = nn.Linear(self.hparams.dim_feedforward, num_classes) + else: + self.final_layer = SNLinear( + self.hparams.ensemble_size, + self.hparams.dim_feedforward, + num_classes, + ) + + n_inputs = len(num_feature_info) + len(cat_feature_info) + self.initialize_pooling_layers(config=config, n_inputs=n_inputs) + + def forward(self, num_features, cat_features): + x = self.embedding_layer(num_features, cat_features) + + # RNN forward pass + out, _ = self.rnn( + x + ) # Shape: (batch_size, sequence_length, ensemble_size, hidden_size) + + out = self.pool_sequence(out) # Shape: (batch_size, ensemble_size, hidden_size) + + if self.hparams.average_ensembles: + x = out.mean(axis=1) # Shape (batch_size, num_classes) + + x = self.final_layer( + out + ) # Shape (batch_size, (ensemble_size), num_classes) if not averaged + + if not self.hparams.average_ensembles: + x = x.squeeze(-1) + + return x From 3bbe703252f47dceec2ad6a09ed0f18d4a4800ed Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 11:17:43 +0000 Subject: [PATCH 116/132] adjust import --- mambular/models/cnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/models/cnn.py b/mambular/models/cnn.py index caaf0f7..b205fbf 100644 --- a/mambular/models/cnn.py +++ b/mambular/models/cnn.py @@ -1,7 +1,7 @@ from .sklearn_base_regressor import SklearnBaseRegressor from .sklearn_base_lss import SklearnBaseLSS from .sklearn_base_classifier import SklearnBaseClassifier -from ..base_models import CNN +from ..base_models.cnn import CNN from ..configs.cnn_config import DefaultCNNConfig From 0709050d2a18f47f6aa1a7b3178e71dd04cd2177 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 17:39:30 +0000 Subject: [PATCH 117/132] remove normalization layer --- mambular/arch_utils/mamba_utils/mamba_arch.py | 6 ------ mambular/base_models/mambular.py | 2 -- 2 files changed, 8 deletions(-) diff --git a/mambular/arch_utils/mamba_utils/mamba_arch.py b/mambular/arch_utils/mamba_utils/mamba_arch.py index 51da1cf..ace6dd4 100644 --- a/mambular/arch_utils/mamba_utils/mamba_arch.py +++ b/mambular/arch_utils/mamba_utils/mamba_arch.py @@ -6,9 +6,6 @@ RMSNorm, LayerNorm, LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, ) from ..get_norm_fn import get_normalization_layer @@ -165,9 +162,6 @@ def __init__( "RMSNorm": RMSNorm, "LayerNorm": LayerNorm, "LearnableLayerScaling": LearnableLayerScaling, - "BatchNorm": BatchNorm, - "InstanceNorm": InstanceNorm, - "GroupNorm": GroupNorm, } # Check if the provided normalization layer is valid diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index 7a01ddf..747045b 100644 --- 
a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -76,7 +76,6 @@ def __init__( self.mamba = Mamba(config) else: self.mamba = MambaOriginal(config) - self.norm_f = get_normalization_layer(config) self.tabular_head = MLPhead( input_dim=self.hparams.d_model, @@ -116,7 +115,6 @@ def forward(self, num_features, cat_features): x = self.pool_sequence(x) - x = self.norm_f(x) preds = self.tabular_head(x) return preds From ca9dff0b757f1de0bc84c10c4a031e4b99570bc4 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 17:39:54 +0000 Subject: [PATCH 118/132] adjust mambular config to new params --- mambular/configs/mambular_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index b38a605..1411951 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -120,7 +120,7 @@ class DefaultMambularConfig: dt_max: float = 0.1 dt_min: float = 1e-04 dt_init_floor: float = 1e-04 - norm: str = "LayerNorm" + norm: str = "RMSNorm" activation: callable = nn.SiLU() embedding_activation: callable = nn.Identity() embedding_type: str = "linear" From 6d5f843b6730016e6c97a9ad7966bdd604b1f4a0 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 17:40:04 +0000 Subject: [PATCH 119/132] fix hpo bug --- mambular/models/sklearn_base_classifier.py | 66 ++++++++++++++------- mambular/models/sklearn_base_lss.py | 65 ++++++++++++++------- mambular/models/sklearn_base_regressor.py | 67 ++++++++++++++-------- 3 files changed, 132 insertions(+), 66 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 1fd64be..10e6bc2 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -625,6 +625,15 @@ def optimize_hparams( max_epochs=200, prune_by_epoch=True, prune_epoch=5, + fixed_params={ + "pooling_method": "avg", + "head_skip_layers": False, + "head_layer_size_length": 0, + "cat_encoding": "int", + "head_skip_layer": False, + "use_cls": False, + }, + custom_search_space=None, **optimize_kwargs, ): """ @@ -656,7 +665,11 @@ def optimize_hparams( """ # Define the hyperparameter search space from the model config - param_names, param_space = get_search_space(self.config) + param_names, param_space = get_search_space( + self.config, + fixed_params=fixed_params, + custom_search_space=custom_search_space, + ) # Initial model fitting to get the baseline validation loss self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) @@ -727,32 +740,41 @@ def _objective(hyperparams): self.task_model.pruning_epoch = prune_epoch # Fit the model (limit epochs for faster optimization) - self.fit( - X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False - ) + try: + # Wrap the risky operation (model fitting) in a try-except block + self.fit( + X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False + ) - # Retrieve the current validation loss - if X_val is not None and y_val is not None: - val_loss = self.evaluate( - X_val, y_val, metrics={"Accuracy": (accuracy_score, False)} - )["Accuracy"] - else: - val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ - "val_loss" - ] + # Evaluate validation loss + if X_val is not None and y_val is not None: + val_loss = self.evaluate( + X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} + )["Mean Squared Error"] + else: + val_loss = self.trainer.validate(self.task_model, 
self.data_module)[ + 0 + ]["val_loss"] + + # Pruning based on validation loss at specific epoch + epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) - # Retrieve validation loss at the specified epoch (e.g., epoch 5) - epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: + best_epoch_val_loss = epoch_val_loss - # Update the best validation loss at the specified epoch - if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: - best_epoch_val_loss = epoch_val_loss + if val_loss < best_val_loss: + best_val_loss = val_loss - # Update the best overall validation loss - if val_loss < best_val_loss: - best_val_loss = val_loss + return val_loss - return val_loss + except Exception as e: + # Penalize the hyperparameter configuration with a large value + print( + f"Error encountered during fit with hyperparameters {hyperparams}: {e}" + ) + return ( + best_val_loss * 100 + ) # Large value to discourage this configuration # Perform Bayesian optimization using scikit-optimize result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index 5c9494a..e36877e 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -612,6 +612,15 @@ def optimize_hparams( max_epochs=200, prune_by_epoch=True, prune_epoch=5, + fixed_params={ + "pooling_method": "avg", + "head_skip_layers": False, + "head_layer_size_length": 0, + "cat_encoding": "int", + "head_skip_layer": False, + "use_cls": False, + }, + custom_search_space=None, **optimize_kwargs, ): """ @@ -643,7 +652,11 @@ def optimize_hparams( """ # Define the hyperparameter search space from the model config - param_names, param_space = get_search_space(self.config) + param_names, param_space = get_search_space( + self.config, + fixed_params=fixed_params, + custom_search_space=custom_search_space, + ) # Initial model fitting to get the baseline validation loss self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) @@ -714,31 +727,41 @@ def _objective(hyperparams): self.task_model.early_pruning_threshold = early_pruning_threshold self.task_model.pruning_epoch = prune_epoch - # Fit the model (limit epochs for faster optimization) - self.fit( - X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False - ) + try: + # Wrap the risky operation (model fitting) in a try-except block + self.fit( + X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False + ) - # Retrieve the current validation loss - if X_val is not None and y_val is not None: - val_loss = self.score(X_val, y_val) - else: - val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ - "val_loss" - ] + # Evaluate validation loss + if X_val is not None and y_val is not None: + val_loss = self.evaluate( + X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} + )["Mean Squared Error"] + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[ + 0 + ]["val_loss"] + + # Pruning based on validation loss at specific epoch + epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) - # Retrieve validation loss at the specified epoch (e.g., epoch 5) - epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: + best_epoch_val_loss = epoch_val_loss - # Update the best validation loss at the specified epoch - if prune_by_epoch and epoch_val_loss < 
best_epoch_val_loss: - best_epoch_val_loss = epoch_val_loss + if val_loss < best_val_loss: + best_val_loss = val_loss - # Update the best overall validation loss - if val_loss < best_val_loss: - best_val_loss = val_loss + return val_loss - return val_loss + except Exception as e: + # Penalize the hyperparameter configuration with a large value + print( + f"Error encountered during fit with hyperparameters {hyperparams}: {e}" + ) + return ( + best_val_loss * 100 + ) # Large value to discourage this configuration # Perform Bayesian optimization using scikit-optimize result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index b2072e4..414449d 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -525,6 +525,15 @@ def optimize_hparams( max_epochs=200, prune_by_epoch=True, prune_epoch=5, + fixed_params={ + "pooling_method": "avg", + "head_skip_layers": False, + "head_layer_size_length": 0, + "cat_encoding": "int", + "head_skip_layer": False, + "use_cls": False, + }, + custom_search_space=None, **optimize_kwargs, ): """ @@ -556,7 +565,11 @@ def optimize_hparams( """ # Define the hyperparameter search space from the model config - param_names, param_space = get_search_space(self.config) + param_names, param_space = get_search_space( + self.config, + fixed_params=fixed_params, + custom_search_space=custom_search_space, + ) # Initial model fitting to get the baseline validation loss self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) @@ -626,33 +639,41 @@ def _objective(hyperparams): self.task_model.early_pruning_threshold = early_pruning_threshold self.task_model.pruning_epoch = prune_epoch - # Fit the model (limit epochs for faster optimization) - self.fit( - X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False - ) + try: + # Wrap the risky operation (model fitting) in a try-except block + self.fit( + X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False + ) - # Retrieve the current validation loss - if X_val is not None and y_val is not None: - val_loss = self.evaluate( - X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} - )["Mean Squared Error"] - else: - val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ - "val_loss" - ] + # Evaluate validation loss + if X_val is not None and y_val is not None: + val_loss = self.evaluate( + X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} + )["Mean Squared Error"] + else: + val_loss = self.trainer.validate(self.task_model, self.data_module)[ + 0 + ]["val_loss"] + + # Pruning based on validation loss at specific epoch + epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) - # Retrieve validation loss at the specified epoch (e.g., epoch 5) - epoch_val_loss = self.task_model.epoch_val_loss_at(prune_epoch) + if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: + best_epoch_val_loss = epoch_val_loss - # Update the best validation loss at the specified epoch - if prune_by_epoch and epoch_val_loss < best_epoch_val_loss: - best_epoch_val_loss = epoch_val_loss + if val_loss < best_val_loss: + best_val_loss = val_loss - # Update the best overall validation loss - if val_loss < best_val_loss: - best_val_loss = val_loss + return val_loss - return val_loss + except Exception as e: + # Penalize the hyperparameter configuration with a large value + print( + f"Error encountered during fit with 
hyperparameters {hyperparams}: {e}" + ) + return ( + best_val_loss * 100 + ) # Large value to discourage this configuration # Perform Bayesian optimization using scikit-optimize result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) From c6b266f64f9d8c0ee2c1a8336c507347d4d59cf8 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Mon, 2 Dec 2024 17:40:23 +0000 Subject: [PATCH 120/132] imrpove hpo config mapper defaults --- mambular/utils/config_mapper.py | 73 ++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/mambular/utils/config_mapper.py b/mambular/utils/config_mapper.py index d7f4b85..2b4d349 100644 --- a/mambular/utils/config_mapper.py +++ b/mambular/utils/config_mapper.py @@ -8,7 +8,18 @@ def round_to_nearest_16(x): return int(round(x / 16) * 16) -def get_search_space(config): +def get_search_space( + config, + fixed_params={ + "pooling_method": "avg", + "head_skip_layers": False, + "head_layer_size_length": 0, + "cat_encoding": "int", + "head_skip_layer": False, + "use_cls": False, + }, + custom_search_space=None, +): """ Given a model configuration, return the hyperparameter search space based on the config attributes. @@ -17,6 +28,12 @@ def get_search_space(config): ---------- config : dataclass The configuration object for the model. + fixed_params : dict, optional + Dictionary of fixed parameters and their values. Defaults to + {"pooling_method": "avg", "head_skip_layers": False, "head_layer_size_length": 0}. + custom_search_space : dict, optional + Dictionary defining custom search spaces for parameters. + Overrides the default `search_space_mapping` for the specified parameters. Returns ------- @@ -26,6 +43,11 @@ def get_search_space(config): A list of hyperparameter ranges for Bayesian optimization. 
""" + # Handle the custom search space + if custom_search_space is None: + custom_search_space = {} + + # Base search space mapping search_space_mapping = { # Learning rate-related parameters "lr": Real(1e-6, 1e-2, prior="log-uniform"), @@ -33,20 +55,19 @@ def get_search_space(config): "lr_factor": Real(0.1, 0.5), # Model architecture parameters "n_layers": Integer(1, 8), - "d_model": Integer(16, 512), # Dimension of the model + "d_model": Categorical([32, 64, 128, 256, 512, 1024]), "dropout": Real(0.0, 0.5), "expand_factor": Integer(1, 4), - "d_state": Integer(16, 512), + "d_state": Categorical([32, 64, 128, 256]), "ff_dropout": Real(0.0, 0.5), "rnn_dropout": Real(0.0, 0.5), "attn_dropout": Real(0.0, 0.5), - "n_heads": Integer(1, 8), + "n_heads": Categorical([2, 4, 8]), "transformer_dim_feedforward": Integer(16, 512), # Convolution-related parameters - "d_conv": Integer(4, 128), # Dimension of convolution layers "conv_bias": Categorical([True, False]), # Normalization and regularization - "norm": Categorical(["LayerNorm", "BatchNorm", "RMSNorm"]), + "norm": Categorical(["LayerNorm", "RMSNorm"]), "weight_decay": Real(1e-8, 1e-2, prior="log-uniform"), "layer_norm_eps": Real(1e-7, 1e-4), "head_dropout": Real(0.0, 0.5), @@ -74,29 +95,43 @@ def get_search_space(config): "cat_encoding": Categorical(["int", "one-hot"]), } - layer_size_min, layer_size_max = 16, 512 # Dynamic layer sizes - max_head_layers = 5 # Set a maximum number of layers for optimization + # Apply custom search space overrides + search_space_mapping.update(custom_search_space) param_names = [] param_space = [] + # Iterate through config fields for field in config.__dataclass_fields__: + if field in fixed_params: + # Fix the parameter value directly in the config + setattr(config, field, fixed_params[field]) + continue # Skip optimization for this parameter + if field in search_space_mapping: + # Add to search space if not fixed param_names.append(field) param_space.append(search_space_mapping[field]) - # Handle head_layer_sizes dynamically by setting the length and individual sizes + # Handle dynamic head_layer_sizes based on head_layer_size_length if "head_layer_sizes" in config.__dataclass_fields__: - param_names.append("head_layer_size_length") - param_space.append( - Integer(1, max_head_layers) - ) # Optimize the length of the list - - # Optimize individual layer sizes based on max_head_layers - for i in range(max_head_layers): - # Optimize over integers and multiply by 16 to ensure divisibility by 16 - param_names.append(f"head_layer_size_{i+1}") - param_space.append(Integer(layer_size_min, layer_size_max)) + head_layer_size_length = fixed_params.get("head_layer_size_length", 0) + + # If no layers are desired, set head_layer_sizes to [] + if head_layer_size_length == 0: + setattr(config, "head_layer_sizes", []) + else: + # Optimize the number of head layers + param_names.append("head_layer_size_length") + param_space.append(Integer(1, max_head_layers)) + + # Optimize individual layer sizes + max_head_layers = 5 + layer_size_min, layer_size_max = 16, 512 + for i in range(max_head_layers): + layer_key = f"head_layer_size_{i+1}" + param_names.append(layer_key) + param_space.append(Integer(layer_size_min, layer_size_max)) return param_names, param_space From a35f5c0b3e9a3984929ae20d636a2e183e813edd Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 16:13:55 +0000 Subject: [PATCH 121/132] adapt models to new preprocessor --- mambular/base_models/__init__.py | 8 ++++++++ mambular/base_models/mambatab.py | 2 -- 
mambular/base_models/mlp.py | 7 ++----- mambular/base_models/ndtf.py | 7 ++----- mambular/base_models/node.py | 7 ++----- mambular/base_models/resnet.py | 7 ++----- mambular/base_models/tabm.py | 5 ++--- 7 files changed, 18 insertions(+), 25 deletions(-) diff --git a/mambular/base_models/__init__.py b/mambular/base_models/__init__.py index 895881b..87246ad 100644 --- a/mambular/base_models/__init__.py +++ b/mambular/base_models/__init__.py @@ -7,6 +7,10 @@ from .tabtransformer import TabTransformer from .mambatab import MambaTab from .mambattn import MambAttn +from .cnn import CNN +from .node import NODE +from .trem import TREM +from .tabm import TabM __all__ = [ "TaskModel", @@ -18,4 +22,8 @@ "BaseModel", "MambaTab", "MambAttn", + "CNN", + "TabM", + "NODE", + "TREM", ] diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index 2a87352..5f04dec 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -84,8 +84,6 @@ def __init__( self.axis = config.axis - head_activation = self.hparams.head_activation - self.tabular_head = MLPhead( input_dim=self.hparams.d_model, config=config, diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index 8ac0970..97001ed 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -3,6 +3,7 @@ from ..configs.mlp_config import DefaultMLPConfig from .basemodel import BaseModel from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..utils.get_feature_dimensions import get_feature_dimensions class MLP(BaseModel): @@ -73,11 +74,7 @@ def __init__( # Initialize layers self.layers = nn.ModuleList() - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += input_shape - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 + input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) if self.hparams.use_embeddings: self.embedding_layer = EmbeddingLayer( diff --git a/mambular/base_models/ndtf.py b/mambular/base_models/ndtf.py index 794f2ac..fdebd03 100644 --- a/mambular/base_models/ndtf.py +++ b/mambular/base_models/ndtf.py @@ -4,6 +4,7 @@ from .basemodel import BaseModel from ..arch_utils.neural_decision_tree import NeuralDecisionTree import numpy as np +from ..utils.get_feature_dimensions import get_feature_dimensions class NDTF(BaseModel): @@ -66,11 +67,7 @@ def __init__( self.num_feature_info = num_feature_info self.returns_ensemble = False - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += input_shape - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 + input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) self.input_dimensions = [input_dim] diff --git a/mambular/base_models/node.py b/mambular/base_models/node.py index 0ed3f4e..bd3d284 100644 --- a/mambular/base_models/node.py +++ b/mambular/base_models/node.py @@ -4,6 +4,7 @@ from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.node_utils import DenseBlock from ..arch_utils.mlp_utils import MLPhead +from ..utils.get_feature_dimensions import get_feature_dimensions class NODE(BaseModel): @@ -75,11 +76,7 @@ def __init__( self.embedding_layer = EmbeddingLayer(config) else: - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += input_shape - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 + input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) 
self.d_out = num_classes self.block = DenseBlock( diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index 14851f6..69e47d6 100644 --- a/mambular/base_models/resnet.py +++ b/mambular/base_models/resnet.py @@ -5,6 +5,7 @@ from .basemodel import BaseModel from ..arch_utils.resnet_utils import ResidualBlock from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer +from ..utils.get_feature_dimensions import get_feature_dimensions class ResNet(BaseModel): @@ -83,11 +84,7 @@ def __init__( ) else: - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += input_shape - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 + input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) self.initial_layer = nn.Linear(input_dim, self.hparams.layer_sizes[0]) diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py index ead84ca..3bb9801 100644 --- a/mambular/base_models/tabm.py +++ b/mambular/base_models/tabm.py @@ -6,6 +6,7 @@ from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.layer_utils.batch_ensemble_layer import LinearBatchEnsembleLayer from ..arch_utils.layer_utils.sn_linear import SNLinear +from ..utils.get_feature_dimensions import get_feature_dimensions class TabM(BaseModel): @@ -46,9 +47,7 @@ def __init__( ) * config.d_model else: - # Calculate input dimension - input_dim = sum(input_shape for input_shape in num_feature_info.values()) - input_dim += len(cat_feature_info) + input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) # Input layer with batch ensembling self.layers.append( From f3c218d1007745701ae3bbd32f658144d2adc298 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 16:14:15 +0000 Subject: [PATCH 122/132] include new preprocessor funcs and adapt get_info logic --- mambular/preprocessing/prepro_utils.py | 54 ++++- mambular/preprocessing/preprocessor.py | 238 ++++++++++++++++------- mambular/utils/get_feature_dimensions.py | 8 + 3 files changed, 233 insertions(+), 67 deletions(-) create mode 100644 mambular/utils/get_feature_dimensions.py diff --git a/mambular/preprocessing/prepro_utils.py b/mambular/preprocessing/prepro_utils.py index 60858b6..74fa6d6 100644 --- a/mambular/preprocessing/prepro_utils.py +++ b/mambular/preprocessing/prepro_utils.py @@ -27,7 +27,6 @@ def transform(self, X): labels=False, include_lowest=True, ) - print(binned_data) return np.expand_dims(np.array(binned_data), 1) @@ -168,3 +167,56 @@ def get_feature_names_out(self, input_features=None): [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] ) return np.array(feature_names) + + +class NoTransformer(TransformerMixin, BaseEstimator): + """ + A transformer that does not preprocess the data but retains compatibility with the sklearn pipeline API. + It simply returns the input data as is. + + Methods: + fit(X, y=None): Fits the transformer to the data (no operation). + transform(X): Returns the input data unprocessed. + get_feature_names_out(input_features=None): Returns the original feature names. + """ + + def fit(self, X, y=None): + """ + Fits the transformer to the data. No operation is performed. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit. + y (ignored): Not used, present for API consistency by convention. + + Returns: + self: Returns the instance itself. + """ + return self + + def transform(self, X): + """ + Returns the input data unprocessed. 
+ + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform. + + Returns: + X (array-like): The same input data, unmodified. + """ + return X + + def get_feature_names_out(self, input_features=None): + """ + Returns the original feature names. + + Parameters: + input_features (list of str or None): The names of the input features. + + Returns: + feature_names (array of shape (n_features,)): The original feature names. + """ + if input_features is None: + raise ValueError( + "input_features must be provided to generate feature names." + ) + return np.array(input_features) diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py index 1787c2e..919474b 100644 --- a/mambular/preprocessing/preprocessor.py +++ b/mambular/preprocessing/preprocessor.py @@ -11,18 +11,25 @@ QuantileTransformer, PolynomialFeatures, SplineTransformer, + PowerTransformer, + OneHotEncoder, ) from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from .ple_encoding import PLE -from .prepro_utils import ContinuousOrdinalEncoder, CustomBinner, OneHotFromOrdinal +from .prepro_utils import ( + ContinuousOrdinalEncoder, + CustomBinner, + OneHotFromOrdinal, + NoTransformer, +) class Preprocessor: """ A comprehensive preprocessor for structured data, capable of handling both numerical and categorical features. It supports various preprocessing strategies for numerical data, including binning, one-hot encoding, - standardization, and normalization. Categorical features can be transformed using continuous ordinal encoding. + standardization, and minmax. Categorical features can be transformed using continuous ordinal encoding. Additionally, it allows for the use of decision tree-derived bin edges for numerical feature binning. The class is designed to work seamlessly with pandas DataFrames, facilitating easy integration into @@ -32,14 +39,14 @@ class Preprocessor: ---------- n_bins : int, default=50 The number of bins to use for numerical feature binning. This parameter is relevant - only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + only if `numerical_preprocessing` is set to 'binning' or 'one-hot'. numerical_preprocessing : str, default="ple" The preprocessing strategy for numerical features. Valid options are - 'binning', 'one_hot', 'standardization', and 'normalization'. + 'binning', 'one-hot', 'standardization', and 'minmax'. use_decision_tree_bins : bool, default=False If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. This parameter is - relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + relevant only if `numerical_preprocessing` is set to 'binning' or 'one-hot'. binning_strategy : str, default="uniform" Defines the strategy for binning numerical features. Options include 'uniform', 'quantile', or other sklearn-compatible strategies. 
@@ -71,6 +78,7 @@ def __init__( self, n_bins=50, numerical_preprocessing="ple", + categorical_preprocessing="int", use_decision_tree_bins=False, binning_strategy="uniform", task="regression", @@ -80,19 +88,36 @@ def __init__( knots=12, ): self.n_bins = n_bins - self.numerical_preprocessing = numerical_preprocessing.lower() + self.numerical_preprocessing = ( + numerical_preprocessing.lower() + if numerical_preprocessing is not None + else "none" + ) + self.categorical_preprocessing = ( + categorical_preprocessing.lower() + if categorical_preprocessing is not None + else "none" + ) if self.numerical_preprocessing not in [ "ple", "binning", - "one_hot", + "one-hot", "standardization", - "normalization", + "min-max", "quantile", "polynomial", "splines", + "box-cox", + "yeo-johnson", + "none", ]: raise ValueError( - "Invalid numerical_preprocessing value. Supported values are 'ple', 'binning', 'one_hot', 'standardization', 'quantile', 'polynomial', 'splines' and 'normalization'." + "Invalid numerical_preprocessing value. Supported values are 'ple', 'binning', 'box-cox', 'one-hot', 'standardization', 'quantile', 'polynomial', 'splines', 'minmax' or 'None'." + ) + + if self.categorical_preprocessing not in ["int", "one-hot", "none"]: + raise ValueError( + "invalid categorical_preprocessing value. Supported values are 'int' and 'one-hot'" ) self.use_decision_tree_bins = use_decision_tree_bins @@ -184,7 +209,7 @@ def fit(self, X, y=None): ("imputer", SimpleImputer(strategy="mean")) ] - if self.numerical_preprocessing in ["binning", "one_hot"]: + if self.numerical_preprocessing in ["binning", "one-hot"]: bins = ( self._get_decision_tree_bins(X[[feature]], y, [feature]) if self.use_decision_tree_bins @@ -216,7 +241,7 @@ def fit(self, X, y=None): ] ) - if self.numerical_preprocessing == "one_hot": + if self.numerical_preprocessing == "one-hot": numeric_transformer_steps.extend( [ ("onehot_from_ordinal", OneHotFromOrdinal()), @@ -226,9 +251,9 @@ def fit(self, X, y=None): elif self.numerical_preprocessing == "standardization": numeric_transformer_steps.append(("scaler", StandardScaler())) - elif self.numerical_preprocessing == "normalization": + elif self.numerical_preprocessing == "minmax": numeric_transformer_steps.append( - ("normalizer", MinMaxScaler(feature_range=(-1, 1))) + ("minmax", MinMaxScaler(feature_range=(-1, 1))) ) elif self.numerical_preprocessing == "quantile": @@ -249,8 +274,6 @@ def fit(self, X, y=None): PolynomialFeatures(self.degree, include_bias=False), ) ) - # if self.degree > 10: - # numeric_transformer_steps.append(("normalizer", MinMaxScaler())) elif self.numerical_preprocessing == "splines": numeric_transformer_steps.append( @@ -266,28 +289,72 @@ def fit(self, X, y=None): elif self.numerical_preprocessing == "ple": numeric_transformer_steps.append( - ("normalizer", MinMaxScaler(feature_range=(-1, 1))) + ("minmax", MinMaxScaler(feature_range=(-1, 1))) ) numeric_transformer_steps.append( ("ple", PLE(n_bins=self.n_bins, task=self.task)) ) + elif self.numerical_preprocessing == "box-cox": + numeric_transformer_steps.append( + ( + "box-cox", + PowerTransformer(method="box-cox", standardize=True), + ) + ) + + elif self.numerical_preprocessing == "yeo-johnson": + numeric_transformer_steps.append( + ( + "yeo-johnson", + PowerTransformer(method="yeo-johnson", standardize=True), + ) + ) + + elif self.numerical_preprocessing == "none": + numeric_transformer_steps.append( + ( + "none", + NoTransformer(), + ) + ) + numeric_transformer = Pipeline(numeric_transformer_steps) 
transformers.append((f"num_{feature}", numeric_transformer, [feature])) if categorical_features: for feature in categorical_features: - # Create a pipeline for each categorical feature - categorical_transformer = Pipeline( - [ - ("imputer", SimpleImputer(strategy="most_frequent")), - ( - "continuous_ordinal", - ContinuousOrdinalEncoder(), - ), - ] - ) + if self.categorical_preprocessing == "int": + # Use ContinuousOrdinalEncoder for "int" + categorical_transformer = Pipeline( + [ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("continuous_ordinal", ContinuousOrdinalEncoder()), + ] + ) + elif self.categorical_preprocessing == "one-hot": + # Use OneHotEncoder for "one-hot" + categorical_transformer = Pipeline( + [ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("onehot", OneHotEncoder()), + ] + ) + + elif self.categorical_preprocessing == "none": + # Use OneHotEncoder for "one-hot" + categorical_transformer = Pipeline( + [ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("none", NoTransformer()), + ] + ) + else: + raise ValueError( + f"Unknown categorical_preprocessing type: {self.categorical_preprocessing}" + ) + # Append the transformer for the current categorical feature transformers.append( (f"cat_{feature}", categorical_transformer, [feature]) @@ -442,8 +509,8 @@ def get_feature_info(self, verbose=True): features after encoding transformations (e.g., one-hot encoding dimensions). """ - binned_or_ordinal_info = {} - other_encoding_info = {} + numerical_feature_info = {} + categorical_feature_info = {} if not self.column_transformer: raise RuntimeError("The preprocessor has not been fitted yet.") @@ -456,55 +523,94 @@ def get_feature_info(self, verbose=True): steps = [step[0] for step in transformer_pipeline.steps] for feature_name in columns: - # Handle features processed with discretization - if "discretizer" in steps: - step = transformer_pipeline.named_steps["discretizer"] - n_bins = step.n_bins_[0] if hasattr(step, "n_bins_") else None - - # Check if discretization is followed by one-hot encoding - if "onehot_from_ordinal" in steps: - # Classify as other encoding due to the expanded feature dimensions from one-hot encoding - other_encoding_info[ - feature_name - ] = n_bins # Number of bins before one-hot encoding - if verbose: - print( - f"Numerical Feature (Discretized & One-Hot Encoded): {feature_name}, Number of bins before one-hot encoding: {n_bins}" - ) - else: - # Only discretization without subsequent one-hot encoding - binned_or_ordinal_info[feature_name] = n_bins - if verbose: - print( - f"Numerical Feature (Binned): {feature_name}, Number of bins: {n_bins}" - ) - - # Handle features processed with continuous ordinal encoding + # Initialize common fields + preprocessing_type = " -> ".join(steps) + dimension = None + categories = None + + # Numerical features + if "discretizer" in steps or any( + step in steps + for step in [ + "standardization", + "minmax", + "quantile", + "polynomial", + "splines", + ] + ): + last_step = transformer_pipeline.steps[-1][1] + if hasattr(last_step, "transform"): + dummy_input = np.zeros( + (1, 1) + ) # Single-column input for dimension check + transformed_feature = last_step.transform(dummy_input) + dimension = transformed_feature.shape[1] + numerical_feature_info[feature_name] = { + "preprocessing": preprocessing_type, + "dimension": dimension, + "categories": None, # Numerical features don't have categories + } + if verbose: + print( + f"Numerical Feature: {feature_name}, Info: 
{numerical_feature_info[feature_name]}" + ) + + # Categorical features elif "continuous_ordinal" in steps: step = transformer_pipeline.named_steps["continuous_ordinal"] - n_categories = len(step.mapping_[columns.index(feature_name)]) - binned_or_ordinal_info[feature_name] = n_categories + categories = len(step.mapping_[columns.index(feature_name)]) + dimension = 1 # Ordinal encoding always outputs one dimension + categorical_feature_info[feature_name] = { + "preprocessing": preprocessing_type, + "dimension": dimension, + "categories": categories, + } + if verbose: + print( + f"Categorical Feature (Ordinal): {feature_name}, Info: {categorical_feature_info[feature_name]}" + ) + + elif "onehot" in steps: + step = transformer_pipeline.named_steps["onehot"] + if hasattr(step, "categories_"): + categories = sum(len(cat) for cat in step.categories_) + dimension = categories # One-hot encoding expands into multiple dimensions + categorical_feature_info[feature_name] = { + "preprocessing": preprocessing_type, + "dimension": dimension, + "categories": categories, + } if verbose: print( - f"Categorical Feature (Ordinal Encoded): {feature_name}, Number of unique categories: {n_categories}" + f"Categorical Feature (One-Hot): {feature_name}, Info: {categorical_feature_info[feature_name]}" ) - # Handle other numerical feature encodings + # Fallback for other transformations else: last_step = transformer_pipeline.steps[-1][1] - step_names = [step[0] for step in transformer_pipeline.steps] - step_descriptions = " -> ".join(step_names) if hasattr(last_step, "transform"): - transformed_feature = last_step.transform( - np.zeros((1, len(columns))) + dummy_input = np.zeros((1, 1)) + transformed_feature = last_step.transform(dummy_input) + dimension = transformed_feature.shape[1] + if "cat" in name: + categorical_feature_info[feature_name] = { + "preprocessing": preprocessing_type, + "dimension": dimension, + "categories": None, # Categories not defined for unknown categorical transformations + } + else: + numerical_feature_info[feature_name] = { + "preprocessing": preprocessing_type, + "dimension": dimension, + "categories": None, # Numerical features don't have categories + } + if verbose: + print( + f"Feature: {feature_name}, Info: {preprocessing_type}, Dimension: {dimension}" ) - other_encoding_info[feature_name] = transformed_feature.shape[1] - if verbose: - print( - f"Feature: {feature_name} ({step_descriptions}), Encoded feature dimension: {transformed_feature.shape[1]}" - ) if verbose: print("-" * 50) - return binned_or_ordinal_info, other_encoding_info + return numerical_feature_info, categorical_feature_info diff --git a/mambular/utils/get_feature_dimensions.py b/mambular/utils/get_feature_dimensions.py new file mode 100644 index 0000000..5e1e61b --- /dev/null +++ b/mambular/utils/get_feature_dimensions.py @@ -0,0 +1,8 @@ +def get_feature_dimensions(num_feature_info, cat_feature_info): + input_dim = 0 + for feature_name, feature_info in num_feature_info.items(): + input_dim += feature_info["dimension"] + for feature_name, eature_info in cat_feature_info.items(): + input_dim += feature_info["dimension"] + + return input_dim From 112cd965932048899fbc4af6ce01137524d05e3f Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 19:05:39 +0000 Subject: [PATCH 123/132] adjust embedding layer to new preprocessing --- .../arch_utils/layer_utils/embedding_layer.py | 65 ++++++++----------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py 
b/mambular/arch_utils/layer_utils/embedding_layer.py index 72d5ec6..478a70f 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -29,7 +29,6 @@ def __init__(self, num_feature_info, cat_feature_info, config): ) self.use_cls = getattr(config, "use_cls", False) self.cls_position = getattr(config, "cls_position", 0) - self.cat_encoding = getattr(config, "cat_encoding", "int") self.embedding_dropout = ( nn.Dropout(getattr(config, "embedding_dropout", 0.0)) if getattr(config, "embedding_dropout", None) is not None @@ -45,8 +44,8 @@ def __init__(self, num_feature_info, cat_feature_info, config): if self.embedding_type == "ndt": self.num_embeddings = nn.ModuleList( [ - NeuralEmbeddingTree(input_shape, self.d_model) - for feature_name, input_shape in num_feature_info.items() + NeuralEmbeddingTree(feature_info["dimension"], self.d_model) + for feature_name, feature_info in num_feature_info.items() ] ) elif self.embedding_type == "plr": @@ -62,10 +61,14 @@ def __init__(self, num_feature_info, cat_feature_info, config): self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.d_model, bias=self.embedding_bias), + nn.Linear( + feature_info["dimension"], + self.d_model, + bias=self.embedding_bias, + ), self.embedding_activation, ) - for feature_name, input_shape in num_feature_info.items() + for feature_name, feature_info in num_feature_info.items() ] ) else: @@ -73,39 +76,24 @@ def __init__(self, num_feature_info, cat_feature_info, config): "Invalid embedding_type. Choose from 'linear', 'ndt', or 'plr'." ) - if self.cat_encoding == "int": - self.cat_embeddings = nn.ModuleList( - [ - nn.Sequential( - nn.Embedding(num_categories + 1, self.d_model), - self.embedding_activation, - ) - for feature_name, num_categories in cat_feature_info.items() - ] - ) - elif self.cat_encoding == "one-hot": - self.cat_embeddings = nn.ModuleList( - [ - nn.Sequential( - OneHotEncoding(num_categories), - nn.Linear( - num_categories, self.d_model, bias=self.embedding_bias - ), - self.embedding_activation, - ) - for feature_name, num_categories in cat_feature_info.items() - ] - ) - elif self.cat_encoding == "linear": - self.cat_embeddings = nn.ModuleList( - [ - nn.Sequential( - nn.Linear(input_shape, self.d_model, bias=self.embedding_bias), - self.embedding_activation, - ) - for feature_name, input_shape in cat_feature_info.items() - ] - ) + self.cat_embeddings = nn.ModuleList( + [ + nn.Sequential( + nn.Embedding(feature_info["categories"] + 1, self.d_model), + self.embedding_activation, + ) + if feature_info["dimension"] == 1 + else nn.Sequential( + nn.Linear( + feature_info["dimension"], + self.d_model, + bias=self.embedding_bias, + ), + self.embedding_activation, + ) + for feature_name, feature_info in cat_feature_info.items() + ] + ) # Class token if required if self.use_cls: @@ -136,6 +124,7 @@ def forward(self, num_features=None, cat_features=None): ValueError If no features are provided to the model. 
""" + # Class token initialization if self.use_cls: batch_size = ( From 142d4b27686fadac7c5c38655bc7cf4e49dc4f6e Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 19:05:45 +0000 Subject: [PATCH 124/132] delete model class --- mambular/base_models/batch_tabrnn.py | 127 --------------------------- 1 file changed, 127 deletions(-) delete mode 100644 mambular/base_models/batch_tabrnn.py diff --git a/mambular/base_models/batch_tabrnn.py b/mambular/base_models/batch_tabrnn.py deleted file mode 100644 index 5c6ae27..0000000 --- a/mambular/base_models/batch_tabrnn.py +++ /dev/null @@ -1,127 +0,0 @@ -import torch -import torch.nn as nn -from ..arch_utils.layer_utils.sn_linear import SNLinear -from ..configs.batchtabrnn_config import DefaultBatchTabRNNConfig -from .basemodel import BaseModel -from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer -from ..arch_utils.rnn_utils import EnsembleConvRNN -from ..arch_utils.get_norm_fn import get_normalization_layer -from dataclasses import replace -from ..arch_utils.layer_utils.sn_linear import SNLinear - - -class BatchTabRNN(BaseModel): - """ - A batch ensemble model combining RNN and tabular data handling for multivariate time series or sequential tabular data. - - Parameters - ---------- - cat_feature_info : dict - Dictionary containing information about categorical features, including their names and dimensions. - num_feature_info : dict - Dictionary containing information about numerical features, including their names and dimensions. - num_classes : int, optional - The number of output classes or target dimensions for regression, by default 1. - config : DefaultBatchTabRNNConfig, optional - Configuration object containing model hyperparameters such as dropout rates, hidden layer sizes, ensemble settings, - and other architectural configurations, by default DefaultBatchTabRNNConfig(). - **kwargs : dict - Additional keyword arguments for the BaseModel class. - - Attributes - ---------- - cat_feature_info : dict - Stores categorical feature information. - num_feature_info : dict - Stores numerical feature information. - pooling_method : str - The pooling method to aggregate sequence or ensemble features, specified in config. - ensemble_first : bool - Flag indicating if ensembles should be processed before pooling over the sequence. - embedding_layer : EmbeddingLayer - Layer for embedding categorical and numerical features. - rnn : EnsembleConvRNN - Ensemble RNN layer for processing sequential data. - tabular_head : MLPhead - MLPhead layer to produce the final prediction based on the output of the RNN and pooling layers. - linear : nn.Linear - Linear transformation layer for projecting features into a different dimension. - norm_f : nn.Module - Normalization layer. - ensemble_linear : nn.Linear, optional - Linear layer to learn a weighted combination of ensemble outputs, if configured. - - Methods - ------- - forward(num_features, cat_features) - Perform a forward pass through the model, including embedding, RNN, pooling, and prediction steps. 
- - """ - - def __init__( - self, - cat_feature_info, - num_feature_info, - num_classes=1, - config: DefaultBatchTabRNNConfig = DefaultBatchTabRNNConfig(), - **kwargs, - ): - super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - - if not self.hparams.average_ensembles: - self.returns_ensemble = True # Directly set ensemble flag - else: - self.returns_ensemble = False - - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info - - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - config=config, - ) - self.rnn = EnsembleConvRNN(config=config) - - self.linear = nn.Linear( - self.hparams.d_model, - self.hparams.dim_feedforward, - ) - - temp_config = replace(config, d_model=config.dim_feedforward) - self.norm_f = get_normalization_layer(temp_config) - - if self.hparams.average_ensembles: - self.final_layer = nn.Linear(self.hparams.dim_feedforward, num_classes) - else: - self.final_layer = SNLinear( - self.hparams.ensemble_size, - self.hparams.dim_feedforward, - num_classes, - ) - - n_inputs = len(num_feature_info) + len(cat_feature_info) - self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - - def forward(self, num_features, cat_features): - x = self.embedding_layer(num_features, cat_features) - - # RNN forward pass - out, _ = self.rnn( - x - ) # Shape: (batch_size, sequence_length, ensemble_size, hidden_size) - - out = self.pool_sequence(out) # Shape: (batch_size, ensemble_size, hidden_size) - - if self.hparams.average_ensembles: - x = out.mean(axis=1) # Shape (batch_size, num_classes) - - x = self.final_layer( - out - ) # Shape (batch_size, (ensemble_size), num_classes) if not averaged - - if not self.hparams.average_ensembles: - x = x.squeeze(-1) - - return x From 627d48ae96c38e9bbcceb2ebef476a9059d45de4 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 19:05:56 +0000 Subject: [PATCH 125/132] adjust configs --- mambular/configs/mlp_config.py | 2 +- mambular/configs/tabm_config.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index ee83c00..fd1fe09 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -64,7 +64,7 @@ class DefaultMLPConfig: layer_sizes: list = (256, 128, 32) activation: callable = nn.SELU() skip_layers: bool = False - dropout: float = 0.5 + dropout: float = 0.2 use_glu: bool = False skip_connections: bool = False batch_norm: bool = False diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index 2c967c2..f9e0d37 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -78,13 +78,13 @@ class DefaultTabMConfig: # lr params lr: float = 1e-04 lr_patience: int = 10 - weight_decay: float = 1e-06 + weight_decay: float = 1e-05 lr_factor: float = 0.1 # arch params - layer_sizes: list = (512, 512, 256) + layer_sizes: list = (256, 256, 128) activation: callable = nn.ReLU() - dropout: float = 0.2 + dropout: float = 0.5 norm: str = None use_glu: bool = False batch_norm: bool = False @@ -99,7 +99,7 @@ class DefaultTabMConfig: average_embeddings: bool = False embedding_activation: callable = nn.Identity() layer_norm_after_embedding: bool = False - d_model: int = 128 + d_model: int = 32 # Batch ensembling specific configurations ensemble_size: int = 32 From a696987483431f2be3b52eb29a129cb529dd17db Mon Sep 17 00:00:00 2001 From: AnFreTh 
Date: Tue, 3 Dec 2024 19:06:16 +0000 Subject: [PATCH 126/132] include cat_preprocessing in preprocessor arg names --- mambular/models/sklearn_base_classifier.py | 1 + mambular/models/sklearn_base_lss.py | 1 + mambular/models/sklearn_base_regressor.py | 1 + 3 files changed, 3 insertions(+) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 10e6bc2..dc759bb 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -25,6 +25,7 @@ def __init__(self, model, config, **kwargs): self.preprocessor_arg_names = [ "n_bins", "numerical_preprocessing", + "categorical_preprocessing", "use_decision_tree_bins", "binning_strategy", "task", diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index e36877e..d97eab6 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -47,6 +47,7 @@ def __init__(self, model, config, **kwargs): self.preprocessor_arg_names = [ "n_bins", "numerical_preprocessing", + "categorical_preprocessing", "use_decision_tree_bins", "binning_strategy", "task", diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 414449d..b77d11b 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -26,6 +26,7 @@ def __init__(self, model, config, **kwargs): self.preprocessor_arg_names = [ "n_bins", "numerical_preprocessing", + "categorical_preprocessing", "use_decision_tree_bins", "binning_strategy", "task", From c354c0096e9261e2ff234727fc2155830d3a1c6e Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 19:06:30 +0000 Subject: [PATCH 127/132] include float preprocessing --- mambular/preprocessing/prepro_utils.py | 12 ++++++++++++ mambular/preprocessing/preprocessor.py | 19 ++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/mambular/preprocessing/prepro_utils.py b/mambular/preprocessing/prepro_utils.py index 74fa6d6..b75270a 100644 --- a/mambular/preprocessing/prepro_utils.py +++ b/mambular/preprocessing/prepro_utils.py @@ -220,3 +220,15 @@ def get_feature_names_out(self, input_features=None): "input_features must be provided to generate feature names." ) return np.array(input_features) + + +class ToFloatTransformer(TransformerMixin, BaseEstimator): + """ + A transformer that converts input data to float type. 
+ """ + + def fit(self, X, y=None): + return self + + def transform(self, X): + return X.astype(float) diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py index 919474b..3d7e83e 100644 --- a/mambular/preprocessing/preprocessor.py +++ b/mambular/preprocessing/preprocessor.py @@ -22,6 +22,7 @@ CustomBinner, OneHotFromOrdinal, NoTransformer, + ToFloatTransformer, ) @@ -339,6 +340,7 @@ def fit(self, X, y=None): [ ("imputer", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder()), + ("to_float", ToFloatTransformer()), ] ) @@ -453,17 +455,20 @@ def _split_transformed_output(self, X, transformed_X): """ start = 0 transformed_dict = {} - for ( - name, - transformer, - columns, - ) in self.column_transformer.transformers_: + for name, transformer, columns in self.column_transformer.transformers_: if transformer != "drop": end = start + transformer.transform(X[[columns[0]]]).shape[1] - dtype = int if "cat" in name else float + + # Determine dtype based on the transformer steps + transformer_steps = [step[0] for step in transformer.steps] + if "continuous_ordinal" in transformer_steps: + dtype = int # Use int for ordinal/integer encoding + else: + dtype = float # Default to float for other encodings + + # Assign transformed data with the correct dtype transformed_dict[name] = transformed_X[:, start:end].astype(dtype) start = end - return transformed_dict def fit_transform(self, X, y=None): From eadafe15f0657847ce021aa08cf38de4cd618187 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 19:25:39 +0000 Subject: [PATCH 128/132] adapt readme --- README.md | 56 +++-- mambular/base_models/__init__.py | 4 - mambular/base_models/cnn.py | 79 ------- mambular/base_models/ftet.py | 130 ----------- mambular/base_models/trem.py | 122 ----------- mambular/data_utils/datamodule.py | 29 ++- mambular/models/__init__.py | 13 -- mambular/models/cnn.py | 344 ------------------------------ mambular/models/ftet.py | 21 -- mambular/models/trem.py | 260 ---------------------- 10 files changed, 57 insertions(+), 1001 deletions(-) delete mode 100644 mambular/base_models/cnn.py delete mode 100644 mambular/base_models/ftet.py delete mode 100644 mambular/base_models/trem.py delete mode 100644 mambular/models/cnn.py delete mode 100644 mambular/models/ftet.py delete mode 100644 mambular/models/trem.py diff --git a/README.md b/README.md index 5d5e1b0..e78de22 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@

Mambular: Tabular Deep Learning (with Mamba)

-Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291).
+Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291). Also check out our paper introducing (TabulaRNN)[https://arxiv.org/pdf/2411.17207] and analyzing the efficiency of NLP inspired tabular models.

Table of Contents

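For context on the two preprocessing commits above (`categorical_preprocessing` joining the recognized preprocessor arguments, and the new `ToFloatTransformer`): the categorical branch in `preprocessor.py` now imputes, one-hot encodes, and then casts to float. Below is a minimal standalone sketch of that pipeline; the toy column and `sparse_output=False` are assumptions added for readability, the patch itself calls `OneHotEncoder()` with its defaults.

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from mambular.preprocessing.prepro_utils import ToFloatTransformer

# Same step order as the categorical pipeline assembled in preprocessor.py:
# impute missing values -> one-hot encode -> cast the encoded matrix to float,
# so the datamodule can later build float32 tensors for these columns.
pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(sparse_output=False)),  # dense output assumed here for clarity
        ("to_float", ToFloatTransformer()),
    ]
)

X = pd.DataFrame({"color": ["red", "green", np.nan, "red"]})  # illustrative toy column
print(pipe.fit_transform(X).dtype)  # float64
```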
@@ -63,7 +63,7 @@ Mambular is a Python package that brings the power of advanced deep learning arc
 | `ResNet` | An adaptation of the ResNet architecture for tabular data applications. |
 | `TabTransformer` | A transformer-based model for tabular data introduced by [Huang et al.](https://arxiv.org/abs/2012.06678), enhancing feature learning capabilities. |
 | `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867) . Not a sequential model. |
-| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks. Paper Link will follow. |
+| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks introduced [here](https://arxiv.org/pdf/2411.17207). |
 | `MambAttention` | A combination between Mamba and Transformers, similar to Jamba by [Lieber et al.](https://arxiv.org/abs/2403.19887). Not yet included in the benchmarks |
@@ -104,12 +104,18 @@ Mambular simplifies data preprocessing with a range of tools designed for easy t

Data Type Detection and Transformation

-- **Ordinal & One-Hot Encoding**: Automatically transforms categorical data into numerical formats.
-- **Binning**: Discretizes numerical features; can use decision trees for optimal binning.
-- **Normalization & Standardization**: Scales numerical data appropriately.
-- **Piecewise Linear Encodings (PLE)**: Encodes periodicity in numerical data.
-- **Quantile & Spline Transformations**: Applies advanced transformations to handle nonlinearity and distributional shifts.
-- **Polynomial Features**: Generates polynomial and interaction terms to capture complex relationships.
+- **Ordinal & One-Hot Encoding**: Automatically transforms categorical data into numerical formats using continuous ordinal encoding or one-hot encoding. Includes options for transforming outputs to `float` for compatibility with downstream models.
+- **Binning**: Discretizes numerical features into bins, with support for both fixed binning strategies and optimal binning derived from decision tree models.
+- **MinMax**: Scales numerical data to a specific range, such as [-1, 1], using Min-Max scaling or similar techniques.
+- **Standardization**: Centers and scales numerical features to have a mean of zero and unit variance for better compatibility with certain models.
+- **Quantile Transformations**: Normalizes numerical data to follow a uniform or normal distribution, handling distributional shifts effectively.
+- **Spline Transformations**: Captures nonlinearity in numerical features using spline-based transformations, ideal for complex relationships.
+- **Piecewise Linear Encodings (PLE)**: Captures complex numerical patterns by applying piecewise linear encoding, suitable for data with periodic or nonlinear structures.
+- **Polynomial Features**: Automatically generates polynomial and interaction terms for numerical features, enhancing the ability to capture higher-order relationships.
+- **Box-Cox & Yeo-Johnson Transformations**: Performs power transformations to stabilize variance and normalize distributions.
+- **Custom Binning**: Enables user-defined bin edges for precise discretization of numerical data.
+
+
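These options feed the feature-info dictionaries that the preprocessor reports via `get_feature_info()` and that the embedding layer consumes (see the `embedding_layer.py` diff at the start of this section and the dummy-data snippet further down in this README diff). A minimal sketch of that structure; the concrete `preprocessing` strings depend on the chosen options, and `d_model=32` is just an example value.

```python
import torch.nn as nn

# Feature-info entries in the format used after this patch series; the values
# mirror the dummy-data example later in the README.
num_feature_info = {
    "num1": {"preprocessing": "imputer -> scaler", "dimension": 1, "categories": None},
}
cat_feature_info = {
    "cat1": {"preprocessing": "imputer -> continuous_ordinal", "dimension": 1, "categories": 4},
}

# EmbeddingLayer now keys off "dimension": integer-encoded categoricals
# (dimension == 1) get an nn.Embedding over categories + 1 entries, while
# higher-dimensional encodings such as one-hot get an nn.Linear projection.
d_model = 32
for name, info in cat_feature_info.items():
    if info["dimension"] == 1:
        layer = nn.Embedding(info["categories"] + 1, d_model)
    else:
        layer = nn.Linear(info["dimension"], d_model)
    print(name, "->", layer)
```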

Fit a Model

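The fit example in the next hunk configures numerical preprocessing; the `categorical_preprocessing` argument added to the sklearn wrappers earlier in this series is passed the same way and forwarded to the preprocessor. A hedged sketch follows; the accepted option strings are not listed in this patch, so `"one-hot"` below is only an assumption.

```python
from mambular.models import MambularClassifier

# Preprocessing kwargs are intercepted by the sklearn wrapper (see
# preprocessor_arg_names) and handed to the Preprocessor rather than the network.
model = MambularClassifier(
    numerical_preprocessing="ple",
    n_bins=50,
    categorical_preprocessing="one-hot",  # assumed value; exact option names may differ
)
```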
@@ -120,9 +126,10 @@ from mambular.models import MambularClassifier # Initialize and fit your model model = MambularClassifier( d_model=64, - n_layers=8, + n_layers=4, numerical_preprocessing="ple", - n_bins=50 + n_bins=50, + d_conv=8 ) # X can be a dataframe or something that can be easily transformed into a pd.DataFrame as a np.array @@ -221,6 +228,7 @@ Here's how you can implement a custom model with Mambular: ```python from mambular.base_models import BaseModel + from mambular.utils.get_feature_dimensions import get_feature_dimensions import torch import torch.nn @@ -236,11 +244,7 @@ Here's how you can implement a custom model with Mambular: super().__init__(**kwargs) self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += input_shape - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 + input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) self.linear = nn.Linear(input_dim, num_classes) @@ -284,8 +288,16 @@ from mambular.base_models import Mambular from mambular.configs import DefaultMambularConfig # Dummy data and configuration -cat_feature_info = {"cat1": 5, "cat2": 5} # Example categorical feature information -num_feature_info = {"num1": 1, "num2": 1} # Example numerical feature information +cat_feature_info = { + "cat1": { + "preprocessing": "imputer -> continuous_ordinal", + "dimension": 1, + "categories": 4, + } +} # Example categorical feature information +num_feature_info = { + "num1": {"preprocessing": "imputer -> scaler", "dimension": 1, "categories": None} +} # Example numerical feature information num_classes = 1 config = DefaultMambularConfig() # Use the desired configuration @@ -329,6 +341,16 @@ If you find this project useful in your research, please consider cite: } ``` +If you use TabulaRNN please consider to cite: +```BibTeX +@article{thielmann2024efficiency, + title={On the Efficiency of NLP-Inspired Methods for Tabular Deep Learning}, + author={Thielmann, Anton Frederik and Samiee, Soheila}, + journal={arXiv preprint arXiv:2411.17207}, + year={2024} +} +``` + # License The entire codebase is under MIT license. diff --git a/mambular/base_models/__init__.py b/mambular/base_models/__init__.py index 87246ad..b3eda43 100644 --- a/mambular/base_models/__init__.py +++ b/mambular/base_models/__init__.py @@ -7,9 +7,7 @@ from .tabtransformer import TabTransformer from .mambatab import MambaTab from .mambattn import MambAttn -from .cnn import CNN from .node import NODE -from .trem import TREM from .tabm import TabM __all__ = [ @@ -22,8 +20,6 @@ "BaseModel", "MambaTab", "MambAttn", - "CNN", "TabM", "NODE", - "TREM", ] diff --git a/mambular/base_models/cnn.py b/mambular/base_models/cnn.py deleted file mode 100644 index 386e7ac..0000000 --- a/mambular/base_models/cnn.py +++ /dev/null @@ -1,79 +0,0 @@ -import torch -import torch.nn as nn -from ..configs.cnn_config import DefaultCNNConfig -from .basemodel import BaseModel -from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer -from ..arch_utils.cnn_utils import CNNBlock - - -class CNN(BaseModel): - """ - A convolutional neural network (CNN) model designed for tabular data with support for categorical - and numerical features, configurable embeddings, and dynamic flattened size computation. - - Attributes - ---------- - embedding_layer : EmbeddingLayer - A layer that generates embeddings for categorical and numerical features. 
- cnn : CNNBlock - A modular CNN block for feature extraction. - fc : nn.Sequential - A fully connected layer for final predictions. - - Methods - ------- - forward(num_features, cat_features): - Forward pass through the embedding, CNN, and fully connected layers. - """ - - def __init__( - self, - cat_feature_info, - num_feature_info, - num_classes=1, - config: DefaultCNNConfig = DefaultCNNConfig(), - **kwargs, - ): - super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=[]) - - self.returns_ensemble = False - self.n_features = len(num_feature_info) + len(cat_feature_info) - - # Initialize the embedding layer - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - config=config, - ) - - # CNN block - self.cnn = CNNBlock(config) - n_features = len(num_feature_info) + len(cat_feature_info) - - # Dynamically compute flattened size - with torch.no_grad(): - sample_input = torch.zeros( - 1, - config.input_channels, - n_features, - config.d_model, - ) - sample_output = self.cnn(sample_input) - flattened_size = sample_output.view(1, -1).size(1) - print(flattened_size) - - # Fully connected layers - self.fc = nn.Sequential( - nn.Flatten(), - nn.Linear(flattened_size, num_classes), - ) - - def forward(self, num_features, cat_features): - x = self.embedding_layer(num_features, cat_features) - x = x.unsqueeze(1) - # Generate embeddings (x) with shape (N, J, D) - - x = self.cnn(x) - preds = self.fc(x) - return preds diff --git a/mambular/base_models/ftet.py b/mambular/base_models/ftet.py deleted file mode 100644 index fa9bd23..0000000 --- a/mambular/base_models/ftet.py +++ /dev/null @@ -1,130 +0,0 @@ -import torch -import torch.nn as nn -from ..arch_utils.mlp_utils import MLPhead -from ..arch_utils.get_norm_fn import get_normalization_layer -from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer -from ..arch_utils.transformer_utils import BatchEnsembleTransformerEncoder -from ..configs.ftet_config import DefaultFTETConfig -from .basemodel import BaseModel -from ..arch_utils.layer_utils.sn_linear import SNLinear - - -class FTET(BaseModel): - """ - A Feature Transformer model for tabular data with categorical and numerical features, using embedding, transformer - encoding, and pooling to produce final predictions. - - Parameters - ---------- - cat_feature_info : dict - Dictionary containing information about categorical features, including their names and dimensions. - num_feature_info : dict - Dictionary containing information about numerical features, including their names and dimensions. - num_classes : int, optional - The number of output classes or target dimensions for regression, by default 1. - config : DefaultFTTransformerConfig, optional - Configuration object containing model hyperparameters such as dropout rates, hidden layer sizes, - transformer settings, and other architectural configurations, by default DefaultFTTransformerConfig(). - **kwargs : dict - Additional keyword arguments for the BaseModel class. - - Attributes - ---------- - pooling_method : str - The pooling method to aggregate features after transformer encoding. - cat_feature_info : dict - Stores categorical feature information. - num_feature_info : dict - Stores numerical feature information. - embedding_layer : EmbeddingLayer - Layer for embedding categorical and numerical features. - norm_f : nn.Module - Normalization layer for the transformer output. 
- encoder : nn.TransformerEncoder - Transformer encoder for sequential processing of embedded features. - tabular_head : MLPhead - MLPhead layer to produce the final prediction based on the output of the transformer encoder. - - Methods - ------- - forward(num_features, cat_features) - Perform a forward pass through the model, including embedding, transformer encoding, pooling, and prediction steps. - - """ - - def __init__( - self, - cat_feature_info, - num_feature_info, - num_classes=1, - config: DefaultFTETConfig = DefaultFTETConfig(), - **kwargs, - ): - super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - - if not self.hparams.average_ensembles: - self.returns_ensemble = True # Directly set ensemble flag - else: - self.returns_ensemble = False - - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info - - # embedding layer - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - config=config, - ) - - # transformer encoder - self.norm_f = get_normalization_layer(config) - self.encoder = BatchEnsembleTransformerEncoder(config) - - if self.hparams.average_ensembles: - self.final_layer = nn.Linear(self.hparams.d_model, num_classes) - else: - self.final_layer = SNLinear( - self.hparams.ensemble_size, - self.hparams.d_model, - num_classes, - ) - - # pooling - n_inputs = len(num_feature_info) + len(cat_feature_info) - self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - - def forward(self, num_features, cat_features): - """ - Defines the forward pass of the model. - - Parameters - ---------- - num_features : Tensor - Tensor containing the numerical features. - cat_features : Tensor - Tensor containing the categorical features. - - Returns - ------- - Tensor - The output predictions of the model. - """ - x = self.embedding_layer(num_features, cat_features) - - x = self.encoder(x) - - x = self.pool_sequence(x) # Shape: (batch_size, ensemble_size, hidden_size) - - if self.hparams.average_ensembles: - x = x.mean(axis=1) # Shape (batch_size, num_classes) - - x = self.final_layer( - x - ) # Shape (batch_size, (ensemble_size), num_classes) if not averaged - - if not self.hparams.average_ensembles: - x = x.squeeze(-1) - - return x diff --git a/mambular/base_models/trem.py b/mambular/base_models/trem.py deleted file mode 100644 index 5330e0c..0000000 --- a/mambular/base_models/trem.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn as nn -from ..arch_utils.layer_utils.sn_linear import SNLinear -from ..configs import DefaultTREMConfig -from .basemodel import BaseModel -from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer -from ..arch_utils.rnn_utils import EnsembleConvRNN -from ..arch_utils.get_norm_fn import get_normalization_layer -from dataclasses import replace - - -class TREM(BaseModel): - """ - Tabular Recurrent Ensemble Model (TREM) - A batch ensemble model combining RNN and tabular data handling for multivariate time series or sequential tabular data. - - Parameters - ---------- - cat_feature_info : dict - Dictionary containing information about categorical features, including their names and dimensions. - num_feature_info : dict - Dictionary containing information about numerical features, including their names and dimensions. - num_classes : int, optional - The number of output classes or target dimensions for regression, by default 1. 
- config : DefaultTREMConfig, optional - Configuration object containing model hyperparameters such as dropout rates, hidden layer sizes, ensemble settings, - and other architectural configurations, by default DefaultTREMConfig(). - **kwargs : dict - Additional keyword arguments for the BaseModel class. - - Attributes - ---------- - cat_feature_info : dict - Stores categorical feature information. - num_feature_info : dict - Stores numerical feature information. - pooling_method : str - The pooling method to aggregate sequence or ensemble features, specified in config. - ensemble_first : bool - Flag indicating if ensembles should be processed before pooling over the sequence. - embedding_layer : EmbeddingLayer - Layer for embedding categorical and numerical features. - rnn : EnsembleConvRNN - Ensemble RNN layer for processing sequential data. - tabular_head : MLPhead - MLPhead layer to produce the final prediction based on the output of the RNN and pooling layers. - linear : nn.Linear - Linear transformation layer for projecting features into a different dimension. - norm_f : nn.Module - Normalization layer. - ensemble_linear : nn.Linear, optional - Linear layer to learn a weighted combination of ensemble outputs, if configured. - - Methods - ------- - forward(num_features, cat_features) - Perform a forward pass through the model, including embedding, RNN, pooling, and prediction steps. - - """ - - def __init__( - self, - cat_feature_info, - num_feature_info, - num_classes=1, - config: DefaultTREMConfig = DefaultTREMConfig(), - **kwargs, - ): - super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) - - if not self.hparams.average_ensembles: - self.returns_ensemble = True # Directly set ensemble flag - else: - self.returns_ensemble = False - - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info - - self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, - config=config, - ) - self.rnn = EnsembleConvRNN(config=config) - - temp_config = replace(config, d_model=config.dim_feedforward) - self.norm_f = get_normalization_layer(temp_config) - - if self.hparams.average_ensembles: - self.final_layer = nn.Linear(self.hparams.dim_feedforward, num_classes) - else: - self.final_layer = SNLinear( - self.hparams.ensemble_size, - self.hparams.dim_feedforward, - num_classes, - ) - - n_inputs = len(num_feature_info) + len(cat_feature_info) - self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - - def forward(self, num_features, cat_features): - x = self.embedding_layer(num_features, cat_features) - - # RNN forward pass - out, _ = self.rnn( - x - ) # Shape: (batch_size, sequence_length, ensemble_size, hidden_size) - - out = self.pool_sequence(out) # Shape: (batch_size, ensemble_size, hidden_size) - - if self.hparams.average_ensembles: - x = out.mean(axis=1) # Shape (batch_size, num_classes) - - x = self.final_layer( - out - ) # Shape (batch_size, (ensemble_size), num_classes) if not averaged - - if not self.hparams.average_ensembles: - x = x.squeeze(-1) - - return x diff --git a/mambular/data_utils/datamodule.py b/mambular/data_utils/datamodule.py index 14fc1e3..df46a25 100644 --- a/mambular/data_utils/datamodule.py +++ b/mambular/data_utils/datamodule.py @@ -134,8 +134,8 @@ def preprocess_data( # Update feature info based on the actual processed data ( - self.cat_feature_info, self.num_feature_info, + self.cat_feature_info, ) = 
self.preprocessor.get_feature_info() def setup(self, stage: str): @@ -154,31 +154,33 @@ def setup(self, stage: str): # Populate tensors for categorical features, if present in processed data for key in self.cat_feature_info: + dtype = ( + torch.float32 + if "onehot" in self.cat_feature_info[key]["preprocessing"] + else torch.long + ) + cat_key = ( "cat_" + key ) # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=dtype) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=dtype) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long - ) + torch.tensor(train_preprocessed_data[binned_key], dtype=dtype) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long - ) + torch.tensor(val_preprocessed_data[binned_key], dtype=dtype) ) # Populate tensors for numerical features, if present in processed data @@ -236,16 +238,21 @@ def preprocess_test_data(self, X): # Populate tensors for categorical features, if present in processed data for key in self.cat_feature_info: + dtype = ( + torch.float32 + if "onehot" in self.cat_feature_info[key]["preprocessing"] + else torch.long + ) cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in test_preprocessed_data: self.test_cat_tensors.append( - torch.tensor(test_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(test_preprocessed_data[cat_key], dtype=dtype) ) binned_key = "num_" + key # for binned features if binned_key in test_preprocessed_data: self.test_cat_tensors.append( - torch.tensor(test_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(test_preprocessed_data[binned_key], dtype=dtype) ) # Populate tensors for numerical features, if present in processed data diff --git a/mambular/models/__init__.py b/mambular/models/__init__.py index 48b153c..78cfc88 100644 --- a/mambular/models/__init__.py +++ b/mambular/models/__init__.py @@ -22,13 +22,9 @@ MambAttentionRegressor, MambAttentionLSS, ) - from .ndtf import NDTFClassifier, NDTFRegressor, NDTFLSS from .node import NODEClassifier, NODERegressor, NODELSS from .tabm import TabMClassifier, TabMRegressor, TabMLSS -from .trem import TREMRegressor, TREMClassifier, TREMLSS -from .cnn import CNNRegressor, CNNClassifier, CNNLSS -from .ftet import FTETRegressor, FTETClassifier, FTETLSS __all__ = [ @@ -68,13 +64,4 @@ "TabMClassifier", "TabMRegressor", "TabMLSS", - "TREMRegressor", - "TREMClassifier", - "TREMLSS", - "CNNRegressor", - "CNNClassifier", - "CNNLSS", - "FTETRegressor", - "FTETClassifier", - "FTETLSS", ] diff --git a/mambular/models/cnn.py b/mambular/models/cnn.py deleted file mode 100644 index b205fbf..0000000 --- a/mambular/models/cnn.py +++ /dev/null @@ -1,344 +0,0 @@ -from .sklearn_base_regressor import SklearnBaseRegressor -from .sklearn_base_lss import SklearnBaseLSS -from .sklearn_base_classifier import SklearnBaseClassifier -from ..base_models.cnn import CNN -from ..configs.cnn_config import DefaultCNNConfig - - -class CNNRegressor(SklearnBaseRegressor): - """ - CNN regressor. 
This class extends the SklearnBaseRegressor class and uses the CNN model - with the default CNN configuration. - - The accepted arguments to the CNNRegressor class include both the attributes in the DefaultCNNConfig dataclass - and the parameters for the Preprocessor class. - - Optimizer Parameters - -------------------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - - Embedding Parameters - --------------------- - use_embeddings : bool, default=False - Whether to use embedding layers for all features. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - d_model : int, default=32 - Dimensionality of the embeddings. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - - CNN Parameters - -------------------- - input_channels : int, default=1 - Number of input channels (e.g., 1 for grayscale images). - num_layers : int, default=4 - Number of convolutional layers. - out_channels_list : list, default=(64, 64, 128, 128) - List of output channels for each convolutional layer. - kernel_size_list : list, default=(3, 3, 3, 3) - List of kernel sizes for each convolutional layer. - stride_list : list, default=(1, 1, 1, 1) - List of stride values for each convolutional layer. - padding_list : list, default=(1, 1, 1, 1) - List of padding values for each convolutional layer. - pooling_method : str, default="max" - Pooling method ('max' or 'avg'). - pooling_kernel_size_list : list, default=(2, 2, 1, 1) - List of kernel sizes for pooling layers for each convolutional layer. - pooling_stride_list : list, default=(2, 2, 1, 1) - List of stride values for pooling layers for each convolutional layer. - - Dropout Parameters - ------------------- - dropout_rate : float, default=0.5 - Probability of dropping neurons during training. - dropout_positions : list, default=None - List of indices of layers after which dropout should be applied. If None, no dropout is applied. - - Preprocessing Params - --------------------- - n_bins : int, default=50 - The number of bins to use for numerical feature binning. This parameter is relevant - only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - numerical_preprocessing : str, default="ple" - The preprocessing strategy for numerical features. Valid options are - 'binning', 'one_hot', 'standardization', and 'normalization'. - use_decision_tree_bins : bool, default=False - If True, uses decision tree regression/classification to determine - optimal bin edges for numerical feature binning. This parameter is - relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - binning_strategy : str, default="uniform" - Defines the strategy for binning numerical features. Options include 'uniform', - 'quantile', or other sklearn-compatible strategies. 
- cat_cutoff : float or int, default=0.03 - Indicates the cutoff after which integer values are treated as categorical. - If float, it's treated as a percentage. If int, it's the maximum number of - unique values for a column to be considered categorical. - treat_all_integers_as_numerical : bool, default=False - If True, all integer columns will be treated as numerical, regardless - of their unique value count or proportion. - degree : int, default=3 - The degree of the polynomial features to be used in preprocessing. - knots : int, default=12 - The number of knots to be used in spline transformations. - - Notes - ----- - - The accepted arguments to the CNNRegressor class are the same as the attributes in the DefaultCNNConfig dataclass. - - CNNRegressor uses SklearnBaseRegressor as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. - - See Also - -------- - mambular.models.SklearnBaseRegressor : The parent class for CNNRegressor. - - Examples - -------- - >>> from mambular.models import CNNRegressor - >>> model = CNNRegressor(d_model=64, n_layers=8) - >>> model.fit(X_train, y_train) - >>> preds = model.predict(X_test) - >>> model.evaluate(X_test, y_test) - """ - - def __init__(self, **kwargs): - super().__init__(model=CNN, config=DefaultCNNConfig, **kwargs) - - -class CNNLSS(SklearnBaseLSS): - """ - CNN regressor. This class extends the SklearnBaseLSS class and uses the CNN model - with the default CNN configuration. - - The accepted arguments to the CNNLSS class include both the attributes in the DefaultCNNConfig dataclass - and the parameters for the Preprocessor class. - - Optimizer Parameters - -------------------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - - Embedding Parameters - --------------------- - use_embeddings : bool, default=False - Whether to use embedding layers for all features. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - d_model : int, default=32 - Dimensionality of the embeddings. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - - CNN Parameters - -------------------- - input_channels : int, default=1 - Number of input channels (e.g., 1 for grayscale images). - num_layers : int, default=4 - Number of convolutional layers. - out_channels_list : list, default=(64, 64, 128, 128) - List of output channels for each convolutional layer. - kernel_size_list : list, default=(3, 3, 3, 3) - List of kernel sizes for each convolutional layer. - stride_list : list, default=(1, 1, 1, 1) - List of stride values for each convolutional layer. - padding_list : list, default=(1, 1, 1, 1) - List of padding values for each convolutional layer. - pooling_method : str, default="max" - Pooling method ('max' or 'avg'). 
- pooling_kernel_size_list : list, default=(2, 2, 1, 1) - List of kernel sizes for pooling layers for each convolutional layer. - pooling_stride_list : list, default=(2, 2, 1, 1) - List of stride values for pooling layers for each convolutional layer. - - Dropout Parameters - ------------------- - dropout_rate : float, default=0.5 - Probability of dropping neurons during training. - dropout_positions : list, default=None - List of indices of layers after which dropout should be applied. If None, no dropout is applied. - - Preprocessing Params - --------------------- - n_bins : int, default=50 - The number of bins to use for numerical feature binning. This parameter is relevant - only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - numerical_preprocessing : str, default="ple" - The preprocessing strategy for numerical features. Valid options are - 'binning', 'one_hot', 'standardization', and 'normalization'. - use_decision_tree_bins : bool, default=False - If True, uses decision tree regression/classification to determine - optimal bin edges for numerical feature binning. This parameter is - relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - binning_strategy : str, default="uniform" - Defines the strategy for binning numerical features. Options include 'uniform', - 'quantile', or other sklearn-compatible strategies. - cat_cutoff : float or int, default=0.03 - Indicates the cutoff after which integer values are treated as categorical. - If float, it's treated as a percentage. If int, it's the maximum number of - unique values for a column to be considered categorical. - treat_all_integers_as_numerical : bool, default=False - If True, all integer columns will be treated as numerical, regardless - of their unique value count or proportion. - degree : int, default=3 - The degree of the polynomial features to be used in preprocessing. - knots : int, default=12 - The number of knots to be used in spline transformations. - - Notes - ----- - - The accepted arguments to the CNNLSS class are the same as the attributes in the DefaultCNNConfig dataclass. - - CNNLSS uses SklearnBaseLSS as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. - - See Also - -------- - mambular.models.SklearnBaseLSS : The parent class for CNNLSS. - - Examples - -------- - >>> from mambular.models import CNNLSS - >>> model = CNNLSS(d_model=64, n_layers=8) - >>> model.fit(X_train, y_train) - >>> preds = model.predict(X_test) - >>> model.evaluate(X_test, y_test) - """ - - def __init__(self, **kwargs): - super().__init__(model=CNN, config=DefaultCNNConfig, **kwargs) - - -class CNNClassifier(SklearnBaseClassifier): - """ - CNN regressor. This class extends the SklearnBaseCLassifier class and uses the CNN model - with the default CNN configuration. - - The accepted arguments to the CNNCLassifier class include both the attributes in the DefaultCNNConfig dataclass - and the parameters for the Preprocessor class. - - Optimizer Parameters - -------------------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. 
- - Embedding Parameters - --------------------- - use_embeddings : bool, default=False - Whether to use embedding layers for all features. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - d_model : int, default=32 - Dimensionality of the embeddings. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - - CNN Parameters - -------------------- - input_channels : int, default=1 - Number of input channels (e.g., 1 for grayscale images). - num_layers : int, default=4 - Number of convolutional layers. - out_channels_list : list, default=(64, 64, 128, 128) - List of output channels for each convolutional layer. - kernel_size_list : list, default=(3, 3, 3, 3) - List of kernel sizes for each convolutional layer. - stride_list : list, default=(1, 1, 1, 1) - List of stride values for each convolutional layer. - padding_list : list, default=(1, 1, 1, 1) - List of padding values for each convolutional layer. - pooling_method : str, default="max" - Pooling method ('max' or 'avg'). - pooling_kernel_size_list : list, default=(2, 2, 1, 1) - List of kernel sizes for pooling layers for each convolutional layer. - pooling_stride_list : list, default=(2, 2, 1, 1) - List of stride values for pooling layers for each convolutional layer. - - Dropout Parameters - ------------------- - dropout_rate : float, default=0.5 - Probability of dropping neurons during training. - dropout_positions : list, default=None - List of indices of layers after which dropout should be applied. If None, no dropout is applied. - - Preprocessing Params - --------------------- - n_bins : int, default=50 - The number of bins to use for numerical feature binning. This parameter is relevant - only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - numerical_preprocessing : str, default="ple" - The preprocessing strategy for numerical features. Valid options are - 'binning', 'one_hot', 'standardization', and 'normalization'. - use_decision_tree_bins : bool, default=False - If True, uses decision tree regression/classification to determine - optimal bin edges for numerical feature binning. This parameter is - relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - binning_strategy : str, default="uniform" - Defines the strategy for binning numerical features. Options include 'uniform', - 'quantile', or other sklearn-compatible strategies. - cat_cutoff : float or int, default=0.03 - Indicates the cutoff after which integer values are treated as categorical. - If float, it's treated as a percentage. If int, it's the maximum number of - unique values for a column to be considered categorical. - treat_all_integers_as_numerical : bool, default=False - If True, all integer columns will be treated as numerical, regardless - of their unique value count or proportion. - degree : int, default=3 - The degree of the polynomial features to be used in preprocessing. - knots : int, default=12 - The number of knots to be used in spline transformations. - - Notes - ----- - - The accepted arguments to the CNNCLassifier class are the same as the attributes in the DefaultCNNConfig dataclass. 
- - CNNCLassifier uses SklearnBaseCLassifier as the parent class. The methods for fitting, predicting, and evaluating the model are inherited from the parent class. Please refer to the parent class for more information. - - See Also - -------- - mambular.models.SklearnBaseCLassifier : The parent class for CNNCLassifier. - - Examples - -------- - >>> from mambular.models import CNNCLassifier - >>> model = CNNCLassifier(d_model=64, n_layers=8) - >>> model.fit(X_train, y_train) - >>> preds = model.predict(X_test) - >>> model.evaluate(X_test, y_test) - """ - - def __init__(self, **kwargs): - super().__init__(model=CNN, config=DefaultCNNConfig, **kwargs) diff --git a/mambular/models/ftet.py b/mambular/models/ftet.py deleted file mode 100644 index 297e41f..0000000 --- a/mambular/models/ftet.py +++ /dev/null @@ -1,21 +0,0 @@ -from .sklearn_base_regressor import SklearnBaseRegressor -from .sklearn_base_classifier import SklearnBaseClassifier -from .sklearn_base_lss import SklearnBaseLSS - -from ..base_models.ftet import FTET -from ..configs.ftet_config import DefaultFTETConfig - - -class FTETRegressor(SklearnBaseRegressor): - def __init__(self, **kwargs): - super().__init__(model=FTET, config=DefaultFTETConfig, **kwargs) - - -class FTETClassifier(SklearnBaseClassifier): - def __init__(self, **kwargs): - super().__init__(model=FTET, config=DefaultFTETConfig, **kwargs) - - -class FTETLSS(SklearnBaseLSS): - def __init__(self, **kwargs): - super().__init__(model=FTET, config=DefaultFTETConfig, **kwargs) diff --git a/mambular/models/trem.py b/mambular/models/trem.py deleted file mode 100644 index af182ab..0000000 --- a/mambular/models/trem.py +++ /dev/null @@ -1,260 +0,0 @@ -from .sklearn_base_regressor import SklearnBaseRegressor -from .sklearn_base_classifier import SklearnBaseClassifier -from .sklearn_base_lss import SklearnBaseLSS - -from ..base_models.trem import TREM -from ..configs.trem_config import DefaultTREMConfig - - -class TREMRegressor(SklearnBaseRegressor): - """ - RNN regressor. This class extends the SklearnBaseRegressor class and uses the TREM model - with the default TREM configuration. - - The accepted arguments to the TREMRegressor class include both the attributes in the DefaultTREMConfig dataclass - and the parameters for the Preprocessor class. - - Parameters - ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - model_type : str, default="RNN" - type of model, one of "RNN", "LSTM", "GRU" - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - d_model : int, default=64 - Dimensionality of the model. - n_layers : int, default=8 - Number of layers in the transformer. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() - Activation function for the transformer. - embedding_activation : callable, default=nn.Identity() - Activation function for numerical embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. - head_dropout : float, default=0.5 - Dropout rate for the head layers. - head_skip_layers : bool, default=False - Whether to skip layers in the head. - head_activation : callable, default=nn.SELU() - Activation function for the head layers. 
- head_use_batch_norm : bool, default=False - Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - pooling_method : str, default="cls" - Pooling method to be used ('cls', 'avg', etc.). - norm_first : bool, default=False - Whether to apply normalization before other operations in each transformer block. - bias : bool, default=True - Whether to use bias in the linear layers. - rnn_activation : callable, default=nn.SELU() - Activation function for the transformer layers. - bidirectional : bool, default=False. - Whether to process data bidirectionally - cat_encoding : str, default="int" - Encoding method for categorical features. - n_bins : int, default=50 - The number of bins to use for numerical feature binning. This parameter is relevant - only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - numerical_preprocessing : str, default="ple" - The preprocessing strategy for numerical features. Valid options are - 'binning', 'one_hot', 'standardization', and 'normalization'. - use_decision_tree_bins : bool, default=False - If True, uses decision tree regression/classification to determine - optimal bin edges for numerical feature binning. This parameter is - relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - binning_strategy : str, default="uniform" - Defines the strategy for binning numerical features. Options include 'uniform', - 'quantile', or other sklearn-compatible strategies. - cat_cutoff : float or int, default=0.03 - Indicates the cutoff after which integer values are treated as categorical. - If float, it's treated as a percentage. If int, it's the maximum number of - unique values for a column to be considered categorical. - treat_all_integers_as_numerical : bool, default=False - If True, all integer columns will be treated as numerical, regardless - of their unique value count or proportion. - degree : int, default=3 - The degree of the polynomial features to be used in preprocessing. - knots : int, default=12 - The number of knots to be used in spline transformations. - """ - - def __init__(self, **kwargs): - super().__init__(model=TREM, config=DefaultTREMConfig, **kwargs) - - -class TREMClassifier(SklearnBaseClassifier): - """ - RNN classifier. This class extends the SklearnBaseClassifier class and uses the TREM model - with the default TREM configuration. - - The accepted arguments to the TREMClassifier class include both the attributes in the DefaultTREMConfig dataclass - and the parameters for the Preprocessor class. - - Parameters - ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - model_type : str, default="RNN" - type of model, one of "RNN", "LSTM", "GRU" - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - d_model : int, default=64 - Dimensionality of the model. - n_layers : int, default=8 - Number of layers in the transformer. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() - Activation function for the transformer. - embedding_activation : callable, default=nn.Identity() - Activation function for numerical embeddings. 
- head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. - head_dropout : float, default=0.5 - Dropout rate for the head layers. - head_skip_layers : bool, default=False - Whether to skip layers in the head. - head_activation : callable, default=nn.SELU() - Activation function for the head layers. - head_use_batch_norm : bool, default=False - Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - pooling_method : str, default="cls" - Pooling method to be used ('cls', 'avg', etc.). - norm_first : bool, default=False - Whether to apply normalization before other operations in each transformer block. - bias : bool, default=True - Whether to use bias in the linear layers. - rnn_activation : callable, default=nn.SELU() - Activation function for the transformer layers. - bidirectional : bool, default=False. - Whether to process data bidirectionally - cat_encoding : str, default="int" - Encoding method for categorical features. - n_bins : int, default=50 - The number of bins to use for numerical feature binning. This parameter is relevant - only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - numerical_preprocessing : str, default="ple" - The preprocessing strategy for numerical features. Valid options are - 'binning', 'one_hot', 'standardization', and 'normalization'. - use_decision_tree_bins : bool, default=False - If True, uses decision tree regression/classification to determine - optimal bin edges for numerical feature binning. This parameter is - relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - binning_strategy : str, default="uniform" - Defines the strategy for binning numerical features. Options include 'uniform', - 'quantile', or other sklearn-compatible strategies. - cat_cutoff : float or int, default=0.03 - Indicates the cutoff after which integer values are treated as categorical. - If float, it's treated as a percentage. If int, it's the maximum number of - unique values for a column to be considered categorical. - treat_all_integers_as_numerical : bool, default=False - If True, all integer columns will be treated as numerical, regardless - of their unique value count or proportion. - degree : int, default=3 - The degree of the polynomial features to be used in preprocessing. - knots : int, default=12 - The number of knots to be used in spline transformations. - """ - - def __init__(self, **kwargs): - super().__init__(model=TREM, config=DefaultTREMConfig, **kwargs) - - -class TREMLSS(SklearnBaseLSS): - """ - RNN LSS. This class extends the SklearnBaseLSS class and uses the TREM model - with the default TREM configuration. - - The accepted arguments to the TREMLSS class include both the attributes in the DefaultTREMConfig dataclass - and the parameters for the Preprocessor class. - - Parameters - ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - model_type : str, default="RNN" - type of model, one of "RNN", "LSTM", "GRU" - family : str, default=None - Distributional family to be used for the model. - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - d_model : int, default=64 - Dimensionality of the model. 
- n_layers : int, default=8 - Number of layers in the transformer. - norm : str, default="RMSNorm" - Normalization method to be used. - activation : callable, default=nn.SELU() - Activation function for the transformer. - embedding_activation : callable, default=nn.Identity() - Activation function for numerical embeddings. - head_layer_sizes : list, default=(128, 64, 32) - Sizes of the layers in the head of the model. - head_dropout : float, default=0.5 - Dropout rate for the head layers. - head_skip_layers : bool, default=False - Whether to skip layers in the head. - head_activation : callable, default=nn.SELU() - Activation function for the head layers. - head_use_batch_norm : bool, default=False - Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - pooling_method : str, default="cls" - Pooling method to be used ('cls', 'avg', etc.). - norm_first : bool, default=False - Whether to apply normalization before other operations in each transformer block. - bias : bool, default=True - Whether to use bias in the linear layers. - rnn_activation : callable, default=nn.SELU() - Activation function for the transformer layers. - bidirectional : bool, default=False. - Whether to process data bidirectionally - cat_encoding : str, default="int" - Encoding method for categorical features. - n_bins : int, default=50 - The number of bins to use for numerical feature binning. This parameter is relevant - only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - numerical_preprocessing : str, default="ple" - The preprocessing strategy for numerical features. Valid options are - 'binning', 'one_hot', 'standardization', and 'normalization'. - use_decision_tree_bins : bool, default=False - If True, uses decision tree regression/classification to determine - optimal bin edges for numerical feature binning. This parameter is - relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. - binning_strategy : str, default="uniform" - Defines the strategy for binning numerical features. Options include 'uniform', - 'quantile', or other sklearn-compatible strategies. - cat_cutoff : float or int, default=0.03 - Indicates the cutoff after which integer values are treated as categorical. - If float, it's treated as a percentage. If int, it's the maximum number of - unique values for a column to be considered categorical. - treat_all_integers_as_numerical : bool, default=False - If True, all integer columns will be treated as numerical, regardless - of their unique value count or proportion. - degree : int, default=3 - The degree of the polynomial features to be used in preprocessing. - knots : int, default=12 - The number of knots to be used in spline transformations. 
- """ - - def __init__(self, **kwargs): - super().__init__(model=TREM, config=DefaultTREMConfig, **kwargs) From cf5f64b37e7589dd47fe9744de68da3acc9f71a9 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Tue, 3 Dec 2024 19:44:18 +0000 Subject: [PATCH 129/132] adapt paper link in readme --- README.md | 2 +- mambular/configs/__init__.py | 4 -- mambular/configs/cnn_config.py | 95 --------------------------- mambular/configs/ftet_config.py | 105 ------------------------------ mambular/configs/trem_config.py | 112 -------------------------------- 5 files changed, 1 insertion(+), 317 deletions(-) delete mode 100644 mambular/configs/cnn_config.py delete mode 100644 mambular/configs/ftet_config.py delete mode 100644 mambular/configs/trem_config.py diff --git a/README.md b/README.md index e78de22..cbe205e 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@

Mambular: Tabular Deep Learning (with Mamba)

-Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291). Also check out our paper introducing (TabulaRNN)[https://arxiv.org/pdf/2411.17207] and analyzing the efficiency of NLP inspired tabular models. +Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291). Also check out our paper introducing [TabulaRNN](https://arxiv.org/pdf/2411.17207) and analyzing the efficiency of NLP inspired tabular models.

Table of Contents

diff --git a/mambular/configs/__init__.py b/mambular/configs/__init__.py index bf9b7b3..08c5cc9 100644 --- a/mambular/configs/__init__.py +++ b/mambular/configs/__init__.py @@ -9,8 +9,6 @@ from .ndtf_config import DefaultNDTFConfig from .node_config import DefaultNODEConfig from .tabm_config import DefaultTabMConfig -from .trem_config import DefaultTREMConfig -from .cnn_config import DefaultCNNConfig __all__ = [ @@ -25,6 +23,4 @@ "DefaultNDTFConfig", "DefaultNODEConfig", "DefaultTabMConfig", - "DefaultTREMConfig", - "DefaultCNNConfig", ] diff --git a/mambular/configs/cnn_config.py b/mambular/configs/cnn_config.py deleted file mode 100644 index 825c9cd..0000000 --- a/mambular/configs/cnn_config.py +++ /dev/null @@ -1,95 +0,0 @@ -from dataclasses import dataclass -import torch.nn as nn - - -@dataclass -class DefaultCNNConfig: - """ - Configuration class for the default CNN model with predefined hyperparameters. - - Optimizer Parameters - -------------------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - - Embedding Parameters - --------------------- - use_embeddings : bool, default=False - Whether to use embedding layers for all features. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - d_model : int, default=32 - Dimensionality of the embeddings. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - - CNN Parameters - -------------------- - input_channels : int, default=1 - Number of input channels (e.g., 1 for grayscale images). - num_layers : int, default=4 - Number of convolutional layers. - out_channels_list : list, default=(64, 64, 128, 128) - List of output channels for each convolutional layer. - kernel_size_list : list, default=(3, 3, 3, 3) - List of kernel sizes for each convolutional layer. - stride_list : list, default=(1, 1, 1, 1) - List of stride values for each convolutional layer. - padding_list : list, default=(1, 1, 1, 1) - List of padding values for each convolutional layer. - pooling_method : str, default="max" - Pooling method ('max' or 'avg'). - pooling_kernel_size_list : list, default=(2, 2, 1, 1) - List of kernel sizes for pooling layers for each convolutional layer. - pooling_stride_list : list, default=(2, 2, 1, 1) - List of stride values for pooling layers for each convolutional layer. - - Dropout Parameters - ------------------- - dropout_rate : float, default=0.5 - Probability of dropping neurons during training. - dropout_positions : list, default=None - List of indices of layers after which dropout should be applied. If None, no dropout is applied. 
- """ - - # Optimizer parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - - # Embedding parameters - layer_norm: bool = False - layer_norm_eps: float = 1e-05 - use_embeddings: bool = False - embedding_activation: callable = nn.Identity() - embedding_type: str = "linear" - embedding_bias: bool = False - layer_norm_after_embedding: bool = False - d_model: int = 32 - plr_lite: bool = False - - # CNN parameters - input_channels: int = 1 - num_layers: int = 4 - out_channels_list: list = (64, 64, 128, 128) - kernel_size_list: list = (3, 3, 3, 3) - stride_list: list = (1, 1, 1, 1) - padding_list: list = (1, 1, 1, 1) - pooling_method: str = "max" - pooling_kernel_size_list: list = (2, 2, 1, 1) - pooling_stride_list: list = (2, 2, 1, 1) - dropout_rate: float = 0.5 # Probability to drop neurons - dropout_positions: list = None diff --git a/mambular/configs/ftet_config.py b/mambular/configs/ftet_config.py deleted file mode 100644 index 538e98a..0000000 --- a/mambular/configs/ftet_config.py +++ /dev/null @@ -1,105 +0,0 @@ -from dataclasses import dataclass -import torch.nn as nn -from ..arch_utils.transformer_utils import ReGLU -from typing import Optional, List, Literal - - -@dataclass -class DefaultFTETConfig: - """ - Configuration class for the FTET model with predefined hyperparameters. - - Attributes - ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - d_model : int, default=128 - Dimensionality of the transformer model. - n_layers : int, default=4 - Number of transformer layers. - n_heads : int, default=8 - Number of attention heads in the transformer. - attn_dropout : float, default=0.2 - Dropout rate for the attention mechanism. - ff_dropout : float, default=0.1 - Dropout rate for the feed-forward layers. - norm : str, default="LayerNorm" - Type of normalization to be used ('LayerNorm', 'RMSNorm', etc.). - activation : callable, default=nn.SELU() - Activation function for the transformer layers. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in embedding layers. - head_layer_sizes : list, default=() - Sizes of the fully connected layers in the model's head. - head_dropout : float, default=0.5 - Dropout rate for the head layers. - head_skip_layers : bool, default=False - Whether to use skip connections in the head layers. - head_activation : callable, default=nn.SELU() - Activation function for the head layers. - head_use_batch_norm : bool, default=False - Whether to use batch normalization in the head layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. - pooling_method : str, default="avg" - Pooling method to be used ('cls', 'avg', etc.). - use_cls : bool, default=False - Whether to use a CLS token for pooling. - norm_first : bool, default=False - Whether to apply normalization before other operations in each transformer block. - bias : bool, default=True - Whether to use bias in linear layers. 
- transformer_activation : callable, default=ReGLU() - Activation function for the transformer feed-forward layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization to improve numerical stability. - transformer_dim_feedforward : int, default=256 - Dimensionality of the feed-forward layers in the transformer. - cat_encoding : str, default="int" - Method for encoding categorical features ('int', 'one-hot', or 'linear'). - """ - - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - d_model: int = 128 - n_layers: int = 4 - n_heads: int = 8 - attn_dropout: float = 0.3 - ff_dropout: float = 0.5 - norm: str = "LayerNorm" - activation: callable = nn.SELU() - embedding_activation: callable = nn.ReLU() - embedding_type: str = "linear" - embedding_bias: bool = False - layer_norm_after_embedding: bool = False - pooling_method: str = "avg" - use_cls: bool = False - norm_first: bool = False - bias: bool = True - transformer_activation: str = "relu" - layer_norm_eps: float = 1e-05 - transformer_dim_feedforward: int = 256 - cat_encoding: str = "int" - - # Batch ensembling specific configurations - ensemble_size: int = 32 - ensemble_scaling_in: bool = True - ensemble_scaling_out: bool = True - ensemble_bias: bool = True - scaling_init: Literal["ones", "random-signs", "normal"] = "normal" - average_ensembles: bool = False - model_type: Literal["mini", "full"] = "full" - batch_ensemble_projections: list = ("query", "key", "value", "out_proj") - abtch_ensemble_ffn: bool = False diff --git a/mambular/configs/trem_config.py b/mambular/configs/trem_config.py deleted file mode 100644 index 1479a2d..0000000 --- a/mambular/configs/trem_config.py +++ /dev/null @@ -1,112 +0,0 @@ -from dataclasses import dataclass -import torch.nn as nn -from typing import Literal - - -@dataclass -class DefaultTREMConfig: - """ - Configuration class for the Tabular Recurrent Ensemble Model (TREM) - - Attributes - ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-05 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - d_model : int, default=128 - Dimensionality of the model. - n_layers : int, default=4 - Number of RNN layers in the model. - rnn_dropout : float, default=0.3 - Dropout rate for RNN layers. - norm : str, default="RMSNorm" - Type of normalization to be used ('RMSNorm', 'LayerNorm', etc.). - activation : callable, default=nn.SELU() - Activation function for the RNN model. - embedding_activation : callable, default=nn.Identity() - Activation function for numerical embeddings. - embedding_dropout : float, optional - Dropout rate applied to embeddings. If None, no dropout is applied. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after the embedding layer. - pooling_method : str, default="avg" - Pooling method to be used ('cls', 'avg', etc.). - norm_first : bool, default=False - Whether to apply normalization before other operations in each RNN block. - bias : bool, default=True - Whether to use bias in the linear layers. - rnn_activation : callable, default=nn.ReLU() - Activation function for RNN layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization to improve numerical stability. 
- dim_feedforward : int, default=256 - Dimensionality of the feed-forward layers. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr'.). - embedding_bias : bool, default=False - Whether to have a bias in the embedding layer - cat_encoding : str, default="int" - Encoding method for categorical features ('int', 'one-hot', 'linear'). - d_conv : int, default=4 - Dimensionality of convolutional layers, if used. - conv_bias : bool, default=True - Whether to use bias in convolutional layers. - residuals : bool, default=False - Whether to include residual connections. - - Batch Ensembling Specific Attributes - ------------------------------------ - ensemble_size : int, default=32 - Number of ensemble members in batch ensembling. - ensemble_scaling_in : bool, default=True - Whether to apply scaling to input features for each ensemble member. - ensemble_scaling_out : bool, default=True - Whether to apply scaling to outputs for each ensemble member. - ensemble_bias : bool, default=True - Whether to include bias for ensemble-specific scaling. - scaling_init : {"ones", "random-signs", "normal"}, default="ones" - Initialization method for ensemble scaling factors. - average_ensembles : bool, default=False - Whether to average predictions across ensemble members. - model_type : {"mini", "full"}, default="mini" - Model type to use ('mini' for reduced version, 'full' for complete model). - """ - - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-05 - lr_factor: float = 0.1 - d_model: int = 128 - n_layers: int = 4 - rnn_dropout: float = 0.3 - norm: str = "RMSNorm" - activation: callable = nn.SELU() - embedding_activation: callable = nn.Identity() - embedding_dropout: float = None - layer_norm_after_embedding: bool = False - pooling_method: str = "avg" - norm_first: bool = False - bias: bool = True - rnn_activation: callable = nn.ReLU() - layer_norm_eps: float = 1e-05 - dim_feedforward: int = 256 - embedding_type: float = "linear" - embedding_bias: bool = False - cat_encoding: str = "int" - d_conv: int = 4 - conv_bias: bool = True - residuals: bool = False - - # Batch ensembling specific configurations - ensemble_size: int = 32 - ensemble_scaling_in: bool = True - ensemble_scaling_out: bool = True - ensemble_bias: bool = True - scaling_init: Literal["ones", "random-signs", "normal"] = "ones" - average_ensembles: bool = False - model_type: Literal["mini", "full"] = "mini" From 99a309bbc1725fce06fca4ea685fc6a51907983a Mon Sep 17 00:00:00 2001 From: AFThielmann Date: Wed, 4 Dec 2024 09:39:18 +0100 Subject: [PATCH 130/132] add prepro-args to sklearn hpo --- README.md | 83 ++++++++++++++++++---- mambular/models/sklearn_base_classifier.py | 6 +- mambular/models/sklearn_base_lss.py | 6 +- mambular/models/sklearn_base_regressor.py | 6 +- mambular/preprocessing/preprocessor.py | 49 ++++++++++++- 5 files changed, 123 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index cbe205e..63ee098 100644 --- a/README.md +++ b/README.md @@ -16,21 +16,21 @@
-

Mambular: Tabular Deep Learning (with Mamba)

+

Mambular: Tabular Deep Learning

-Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291). Also check out our paper introducing [TabulaRNN](https://arxiv.org/pdf/2411.17207) and analyzing the efficiency of NLP inspired tabular models. +Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, TabM and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291). Also check out our paper introducing [TabulaRNN](https://arxiv.org/pdf/2411.17207) and analyzing the efficiency of NLP inspired tabular models.

Table of Contents

- [🏃 Quickstart](#-quickstart) - [📖 Introduction](#-introduction) - [🤖 Models](#-models) -- [🏆 Results](#-results) - [📚 Documentation](#-documentation) - [🛠️ Installation](#️-installation) - [🚀 Usage](#-usage) - [💻 Implement Your Own Model](#-implement-your-own-model) +- [Custom Training](#custom-training) - [🏷️ Citation](#️-citation) - [License](#license) @@ -53,18 +53,18 @@ Mambular is a Python package that brings the power of advanced deep learning arc # 🤖 Models -| Model | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Mambular` | A sequential model using Mamba blocks [Gu and Dao](https://arxiv.org/pdf/2312.00752) specifically designed for various tabular data tasks. | -| `TabM` | Batch Ensembling for a MLP as introduced by [Gorishniy et al.](https://arxiv.org/abs/2410.24210) | -| `NODE` | Neural Oblivious Decision Ensembles as introduced by [Popov et al.](https://arxiv.org/abs/1909.06312) | -| `FTTransformer` | A model leveraging transformer encoders, as introduced by [Gorishniy et al.](https://arxiv.org/abs/2106.11959), for tabular data. | -| `MLP` | A classical Multi-Layer Perceptron (MLP) model for handling tabular data tasks. | -| `ResNet` | An adaptation of the ResNet architecture for tabular data applications. | -| `TabTransformer` | A transformer-based model for tabular data introduced by [Huang et al.](https://arxiv.org/abs/2012.06678), enhancing feature learning capabilities. | -| `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867) . Not a sequential model. | -| `TabulaRNN` | A Recurrent Neural Network for Tabular data. Not yet included in the benchmarks introduced [here](https://arxiv.org/pdf/2411.17207). | -| `MambAttention` | A combination between Mamba and Transformers, similar to Jamba by [Lieber et al.](https://arxiv.org/abs/2403.19887). Not yet included in the benchmarks | +| Model | Description | +| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Mambular` | A sequential model using Mamba blocks specifically designed for various tabular data tasks introduced [here](https://arxiv.org/abs/2408.06291). | +| `TabM` | Batch Ensembling for a MLP as introduced by [Gorishniy et al.](https://arxiv.org/abs/2410.24210) | +| `NODE` | Neural Oblivious Decision Ensembles as introduced by [Popov et al.](https://arxiv.org/abs/1909.06312) | +| `FTTransformer` | A model leveraging transformer encoders, as introduced by [Gorishniy et al.](https://arxiv.org/abs/2106.11959), for tabular data. | +| `MLP` | A classical Multi-Layer Perceptron (MLP) model for handling tabular data tasks. | +| `ResNet` | An adaptation of the ResNet architecture for tabular data applications. | +| `TabTransformer` | A transformer-based model for tabular data introduced by [Huang et al.](https://arxiv.org/abs/2012.06678), enhancing feature learning capabilities. | +| `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867) . Not a sequential model. | +| `TabulaRNN` | A Recurrent Neural Network for Tabular data, introduced [here](https://arxiv.org/pdf/2411.17207). | +| `MambAttention` | A combination between Mamba and Transformers, also introduced [here](https://arxiv.org/pdf/2411.17207). 
| @@ -145,6 +145,59 @@ preds = model.predict(X) preds = model.predict_proba(X) ``` +

Hyperparameter Optimization

+Since all of the models are sklearn base estimators, you can use the built-in hyperparameter optimization from sklearn.
+
+```python
+from sklearn.model_selection import RandomizedSearchCV
+from scipy.stats import randint, uniform  # distributions used in param_dist below
+
+param_dist = {
+    'd_model': randint(32, 128),
+    'n_layers': randint(2, 10),
+    'lr': uniform(1e-5, 1e-3)
+}
+
+random_search = RandomizedSearchCV(
+    estimator=model,
+    param_distributions=param_dist,
+    n_iter=50,  # Number of parameter settings sampled
+    cv=5,  # 5-fold cross-validation
+    scoring='accuracy',  # Metric to optimize
+    random_state=42
+)
+
+fit_params = {"max_epochs": 5, "rebuild": False}
+
+# Fit the model
+random_search.fit(X, y, **fit_params)
+
+# Best parameters and score
+print("Best Parameters:", random_search.best_params_)
+print("Best Score:", random_search.best_score_)
+```
+Note that you can also optimize the preprocessing this way. Just use the prefix ``prepro__`` when specifying the preprocessor arguments you want to optimize:
+```python
+param_dist = {
+    'd_model': randint(32, 128),
+    'n_layers': randint(2, 10),
+    'lr': uniform(1e-5, 1e-3),
+    "prepro__numerical_preprocessing": ["ple", "standardization", "box-cox"]
+}
+```
+
+Since early stopping is integrated and the best model with respect to the validation loss is returned, setting ``max_epochs`` to a large number is sensible.
+
+Or use the built-in Bayesian HPO simply by running:
+
+```python
+best_params = model.optimize_hparams(X, y)
+```
+
+This automatically sets the search space based on the default config from ``mambular.configs``. See the documentation for all parameters of ``optimize_hparams()``. Note, however, that the preprocessor arguments are fixed and cannot be optimized here.
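+
+The routing behind the ``prepro__`` prefix is simple: prefixed keys are stripped and forwarded to the preprocessor, while all remaining keys update the model configuration. A minimal sketch of that split (the ``split_params`` helper is illustrative only, not part of the library):
+
+```python
+def split_params(parameters):
+    # Keys such as "prepro__n_bins" go to the preprocessor, everything else to the model config.
+    config_params = {k: v for k, v in parameters.items() if not k.startswith("prepro__")}
+    prepro_params = {k.split("__", 1)[1]: v for k, v in parameters.items() if k.startswith("prepro__")}
+    return config_params, prepro_params
+
+config_params, prepro_params = split_params({"d_model": 64, "prepro__n_bins": 20})
+# config_params == {"d_model": 64}, prepro_params == {"n_bins": 20}
+```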

⚖️ Distributional Regression with MambularLSS

diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index dc759bb..8ff0aa7 100644 --- a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -87,7 +87,7 @@ def get_params(self, deep=True): if deep: preprocessor_params = { - "preprocessor__" + key: value + "prepro__" + key: value for key, value in self.preprocessor.get_params().items() } params.update(preprocessor_params) @@ -109,12 +109,12 @@ def set_params(self, **parameters): Estimator instance. """ config_params = { - k: v for k, v in parameters.items() if not k.startswith("preprocessor__") + k: v for k, v in parameters.items() if not k.startswith("prepro__") } preprocessor_params = { k.split("__")[1]: v for k, v in parameters.items() - if k.startswith("preprocessor__") + if k.startswith("prepro__") } if config_params: diff --git a/mambular/models/sklearn_base_lss.py b/mambular/models/sklearn_base_lss.py index d97eab6..8178045 100644 --- a/mambular/models/sklearn_base_lss.py +++ b/mambular/models/sklearn_base_lss.py @@ -109,7 +109,7 @@ def get_params(self, deep=True): if deep: preprocessor_params = { - "preprocessor__" + key: value + "prepro__" + key: value for key, value in self.preprocessor.get_params().items() } params.update(preprocessor_params) @@ -131,12 +131,12 @@ def set_params(self, **parameters): Estimator instance. """ config_params = { - k: v for k, v in parameters.items() if not k.startswith("preprocessor__") + k: v for k, v in parameters.items() if not k.startswith("prepro__") } preprocessor_params = { k.split("__")[1]: v for k, v in parameters.items() - if k.startswith("preprocessor__") + if k.startswith("prepro__") } if config_params: diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index b77d11b..61fa01c 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -88,7 +88,7 @@ def get_params(self, deep=True): if deep: preprocessor_params = { - "preprocessor__" + key: value + "prepro__" + key: value for key, value in self.preprocessor.get_params().items() } params.update(preprocessor_params) @@ -110,12 +110,12 @@ def set_params(self, **parameters): Estimator instance. """ config_params = { - k: v for k, v in parameters.items() if not k.startswith("preprocessor__") + k: v for k, v in parameters.items() if not k.startswith("prepro__") } preprocessor_params = { k.split("__")[1]: v for k, v in parameters.items() - if k.startswith("preprocessor__") + if k.startswith("prepro__") } if config_params: diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py index 3d7e83e..095da4b 100644 --- a/mambular/preprocessing/preprocessor.py +++ b/mambular/preprocessing/preprocessor.py @@ -131,7 +131,48 @@ def __init__( self.degree = degree self.n_knots = knots + def get_params(self, deep=True): + """ + Get parameters for the preprocessor. + + Parameters + ---------- + deep : bool, default=True + If True, will return parameters of subobjects that are estimators. + + Returns + ------- + params : dict + Parameter names mapped to their values. 
+ """ + params = { + "n_bins": self.n_bins, + "numerical_preprocessing": self.numerical_preprocessing, + "categorical_preprocessing": self.categorical_preprocessing, + "use_decision_tree_bins": self.use_decision_tree_bins, + "binning_strategy": self.binning_strategy, + "task": self.task, + "cat_cutoff": self.cat_cutoff, + "treat_all_integers_as_numerical": self.treat_all_integers_as_numerical, + "degree": self.degree, + "knots": self.n_knots, + } + return params + def set_params(self, **params): + """ + Set parameters for the preprocessor. + + Parameters + ---------- + **params : dict + Parameter names mapped to their new values. + + Returns + ------- + self : object + Preprocessor instance. + """ for key, value in params.items(): setattr(self, key, value) return self @@ -222,9 +263,11 @@ def fit(self, X, y=None): ( "discretizer", KBinsDiscretizer( - n_bins=bins - if isinstance(bins, int) - else len(bins) - 1, + n_bins=( + bins + if isinstance(bins, int) + else len(bins) - 1 + ), encode="ordinal", strategy=self.binning_strategy, subsample=200_000 if len(X) > 200_000 else None, From 4e4cde8e51e831164314929b6dd714034d24374b Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 4 Dec 2024 11:26:50 +0000 Subject: [PATCH 131/132] version bump --- README.md | 2 +- mambular/__version__.py | 2 +- mambular/arch_utils/layer_utils/embedding_layer.py | 2 +- mambular/configs/mlp_config.py | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 63ee098..5769c92 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@
-

Mambular: Tabular Deep Learning

+

Mambular: Tabular Deep Made Simple

Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, TabM and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291). Also check out our paper introducing [TabulaRNN](https://arxiv.org/pdf/2411.17207) and analyzing the efficiency of NLP inspired tabular models. diff --git a/mambular/__version__.py b/mambular/__version__.py index 095e93c..23b9f4b 100644 --- a/mambular/__version__.py +++ b/mambular/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.2.3" +__version__ = "1.0.0" diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 478a70f..83b84ac 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -54,7 +54,7 @@ def __init__(self, num_feature_info, cat_feature_info, config): d_embedding=self.d_model, n_frequencies=getattr(config, "n_frequencies", 48), frequency_init_scale=getattr(config, "frequency_init_scale", 0.01), - activation=self.embedding_activation, + activation=True, lite=getattr(config, "plr_lite", False), ) elif self.embedding_type == "linear": diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index fd1fe09..dc5e458 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -62,7 +62,7 @@ class DefaultMLPConfig: weight_decay: float = 1e-06 lr_factor: float = 0.1 layer_sizes: list = (256, 128, 32) - activation: callable = nn.SELU() + activation: callable = nn.ReLU() skip_layers: bool = False dropout: float = 0.2 use_glu: bool = False @@ -76,5 +76,4 @@ class DefaultMLPConfig: embedding_bias: bool = False layer_norm_after_embedding: bool = False d_model: int = 32 - embedding_type: float = "plr" plr_lite: bool = False From 64417d12d04845dcad3a029b05e9376d6e93e708 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 4 Dec 2024 12:06:50 +0000 Subject: [PATCH 132/132] add ndtf in readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5769c92..7e1e9fc 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@
-

Mambular: Tabular Deep Made Simple

+

Mambular: Tabular Deep Learning Made Simple

Mambular is a Python library for tabular deep learning. It includes models that leverage the Mamba (State Space Model) architecture, as well as other popular models like TabTransformer, FTTransformer, TabM and tabular ResNets. Check out our paper `Mambular: A Sequential Model for Tabular Deep Learning`, available [here](https://arxiv.org/abs/2408.06291). Also check out our paper introducing [TabulaRNN](https://arxiv.org/pdf/2411.17207) and analyzing the efficiency of NLP inspired tabular models. @@ -65,6 +65,7 @@ Mambular is a Python package that brings the power of advanced deep learning arc | `MambaTab` | A tabular model using a Mamba-Block on a joint input representation described [here](https://arxiv.org/abs/2401.08867) . Not a sequential model. | | `TabulaRNN` | A Recurrent Neural Network for Tabular data, introduced [here](https://arxiv.org/pdf/2411.17207). | | `MambAttention` | A combination between Mamba and Transformers, also introduced [here](https://arxiv.org/pdf/2411.17207). | +| `NDTF` | A neural decision forest using soft decision trees. See [Kontschieder et al.](https://openaccess.thecvf.com/content_iccv_2015/html/Kontschieder_Deep_Neural_Decision_ICCV_2015_paper.html) for inspiration. |
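
The `NDTF` entry builds on soft decision trees: every inner node performs a differentiable, probabilistic split, and the prediction is a mixture of leaf class distributions weighted by the resulting path probabilities. A schematic PyTorch sketch of that routing idea (illustrative only; the class name, shapes, and initialization are assumptions, not Mambular's `NDTF` implementation):

```python
import torch
import torch.nn as nn


class SoftDecisionTree(nn.Module):
    """Depth-d soft tree: each inner node is a sigmoid gate over a linear split."""

    def __init__(self, in_features: int, num_classes: int, depth: int = 3):
        super().__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        # One router output per inner node (a perfect binary tree has 2**depth - 1 inner nodes).
        self.inner = nn.Linear(in_features, self.num_leaves - 1)
        # Learnable class logits stored at each leaf.
        self.leaf_logits = nn.Parameter(torch.zeros(self.num_leaves, num_classes))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gates = torch.sigmoid(self.inner(x))   # (batch, num_inner_nodes)
        mu = x.new_ones(x.size(0), 1)          # path probability of the root = 1
        offset = 0
        for level in range(self.depth):
            n_nodes = 2 ** level
            g = gates[:, offset:offset + n_nodes]  # gates of this level's nodes
            # Children are ordered (left, right) per parent node.
            mu = torch.stack([mu * g, mu * (1.0 - g)], dim=2).flatten(1)
            offset += n_nodes
        # Mixture of leaf class distributions, weighted by the leaf path probabilities.
        return mu @ torch.softmax(self.leaf_logits, dim=-1)  # (batch, num_classes)


tree = SoftDecisionTree(in_features=10, num_classes=3, depth=3)
probs = tree(torch.randn(8, 10))  # each row is a valid probability distribution
```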