typo for module

basf · May 27, 2024 · 123d7f2 · 123d7f2
1 parent 26bef8b
commit 123d7f2
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 33 deletions.
diff --git a/mambular/__init__.py b/mambular/__init__.py
@@ -1,4 +1,5 @@
 from . import base_models, models
 from .__version__ import __version__
+from .utils import Preprocessor
 
-__all__ = ['base_models', 'models', '__version__']
+__all__ = ['base_models', 'models', 'Preprocessor', '__version__']
diff --git a/mambular/utils/__init__.py b/mambular/utils/__init__.py
@@ -0,0 +1,3 @@
+from .preprocessor import Preprocessor
+
+__all__ = ['Preprocessor']
diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py
@@ -1,16 +1,15 @@
-import pandas as pd
 import numpy as np
-from sklearn.base import TransformerMixin, BaseEstimator
-from sklearn.preprocessing import (
-    StandardScaler,
-    KBinsDiscretizer,
-    MinMaxScaler,
-)
-from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
 from sklearn.exceptions import NotFittedError
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import (KBinsDiscretizer, MinMaxScaler,
+                                   StandardScaler)
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+
+__all__ = ['Preprocessor']
 
 
 class CustomBinner(TransformerMixin):
@@ -87,7 +86,8 @@ def transform(self, X):
         # Transform the categories to their mapped integer values
         X_transformed = np.array(
             [
-                [self.mapping_[col].get(value, -1) for col, value in enumerate(row)]
+                [self.mapping_[col].get(value, -1)
+                 for col, value in enumerate(row)]
                 for row in X
             ]
         )
@@ -187,7 +187,8 @@ class Preprocessor:
     The class is designed to work seamlessly with pandas DataFrames, facilitating easy integration into
     machine learning pipelines.
 
-    Parameters:
+    Parameters
+    ----------
         n_bins (int): The number of bins to use for numerical feature binning. This parameter is relevant
                       only if `numerical_preprocessing` is set to 'binning' or 'one_hot'.
         numerical_preprocessing (str): The preprocessing strategy for numerical features. Valid options are
@@ -196,17 +197,17 @@ class Preprocessor:
                                        optimal bin edges for numerical feature binning. This parameter is
                                        relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'.
 
-    Attributes:
-        column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured
-                                                preprocessing pipelines for the different feature types.
-
-    Methods:
-        fit(X, y=None): Fits the preprocessor to the data, identifying feature types and configuring the
-                        appropriate transformations.
-        transform(X): Transforms the data using the fitted preprocessing pipelines.
-        fit_transform(X, y=None): Fits the preprocessor to the data and then transforms the data.
-        get_feature_info(): Returns information about the processed features, including the number of bins for
-                            binned features and the dimensionality of encoded features.
+    # Attributes:
+    #     column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured
+    #                                             preprocessing pipelines for the different feature types.
+
+    # Methods:
+    #     fit(X, y=None): Fits the preprocessor to the data, identifying feature types and configuring the
+    #                     appropriate transformations.
+    #     transform(X): Transforms the data using the fitted preprocessing pipelines.
+    #     fit_transform(X, y=None): Fits the preprocessor to the data and then transforms the data.
+    #     get_feature_info(): Returns information about the processed features, including the number of bins for
+    #                         binned features and the dimensionality of encoded features.
     """
 
     def __init__(
@@ -249,7 +250,8 @@ def _detect_column_types(self, X):
             num_unique_values = X[col].nunique()
             total_samples = len(X[col])
             if X[col].dtype.kind not in "iufc" or (
-                X[col].dtype.kind == "i" and (num_unique_values / total_samples) < 0.05
+                X[col].dtype.kind == "i" and (
+                    num_unique_values / total_samples) < 0.05
             ):
                 categorical_features.append(col)
             else:
@@ -282,7 +284,8 @@ def fit(self, X, y=None):
 
                 if self.numerical_preprocessing in ["binning", "one_hot"]:
                     bins = (
-                        self._get_decision_tree_bins(X[[feature]], y, [feature])
+                        self._get_decision_tree_bins(
+                            X[[feature]], y, [feature])
                         if self.use_decision_tree_bins
                         else self.n_bins
                     )
@@ -297,7 +300,8 @@ def fit(self, X, y=None):
                                         else len(bins) - 1,
                                         encode="ordinal",
                                         strategy=self.binning_strategy,
-                                        subsample=200_000 if len(X) > 200_000 else None,
+                                        subsample=200_000 if len(
+                                            X) > 200_000 else None,
                                     ),
                                 ),
                             ]
@@ -320,14 +324,17 @@ def fit(self, X, y=None):
                         )
 
                 elif self.numerical_preprocessing == "standardization":
-                    numeric_transformer_steps.append(("scaler", StandardScaler()))
+                    numeric_transformer_steps.append(
+                        ("scaler", StandardScaler()))
 
                 elif self.numerical_preprocessing == "normalization":
-                    numeric_transformer_steps.append(("normalizer", MinMaxScaler()))
+                    numeric_transformer_steps.append(
+                        ("normalizer", MinMaxScaler()))
 
                 numeric_transformer = Pipeline(numeric_transformer_steps)
 
-                transformers.append((f"num_{feature}", numeric_transformer, [feature]))
+                transformers.append(
+                    (f"num_{feature}", numeric_transformer, [feature]))
 
         if categorical_features:
             for feature in categorical_features:
@@ -374,7 +381,8 @@ def _get_decision_tree_bins(self, X, y, numerical_features):
             bin_edges = np.sort(np.unique(thresholds))
 
             bins.append(
-                np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()]))
+                np.concatenate(
+                    ([X[feature].min()], bin_edges, [X[feature].max()]))
             )
         return bins
 
@@ -471,7 +479,8 @@ def get_feature_info(self):
                 # Handle features processed with discretization
                 if "discretizer" in steps:
                     step = transformer_pipeline.named_steps["discretizer"]
-                    n_bins = step.n_bins_[0] if hasattr(step, "n_bins_") else None
+                    n_bins = step.n_bins_[0] if hasattr(
+                        step, "n_bins_") else None
 
                     # Check if discretization is followed by one-hot encoding
                     if "onehot_from_ordinal" in steps:
@@ -492,7 +501,8 @@ def get_feature_info(self):
                 # Handle features processed with continuous ordinal encoding
                 elif "continuous_ordinal" in steps:
                     step = transformer_pipeline.named_steps["continuous_ordinal"]
-                    n_categories = len(step.mapping_[columns.index(feature_name)])
+                    n_categories = len(
+                        step.mapping_[columns.index(feature_name)])
                     binned_or_ordinal_info[feature_name] = n_categories
                     print(
                         f"Categorical Feature (Ordinal Encoded): {feature_name}, Number of unique categories: {n_categories}"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .preprocessor import Preprocessor

		__all__ = ['Preprocessor']