Skip to content

Commit

Permalink
typo for module
Browse files (browse the repository at this point in the history)
  • Loading branch information
mkumar73 committed May 27, 2024
1 parent 26bef8b commit 123d7f2
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 33 deletions.
3 changes: 2 additions & 1 deletion mambular/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import base_models, models
from .__version__ import __version__
from .utils import Preprocessor

__all__ = ['base_models', 'models', '__version__']
__all__ = ['base_models', 'models', 'Preprocessor', '__version__']
3 changes: 3 additions & 0 deletions mambular/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .preprocessor import Preprocessor

__all__ = ['Preprocessor']
74 changes: 42 additions & 32 deletions mambular/utils/preprocessor.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import (
StandardScaler,
KBinsDiscretizer,
MinMaxScaler,
)
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (KBinsDiscretizer, MinMaxScaler,
StandardScaler)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

__all__ = ['Preprocessor']


class CustomBinner(TransformerMixin):
@@ -87,7 +86,8 @@ def transform(self, X):
# Transform the categories to their mapped integer values
X_transformed = np.array(
[
[self.mapping_[col].get(value, -1) for col, value in enumerate(row)]
[self.mapping_[col].get(value, -1)
for col, value in enumerate(row)]
for row in X
]
)
@@ -187,7 +187,8 @@ class Preprocessor:
The class is designed to work seamlessly with pandas DataFrames, facilitating easy integration into
machine learning pipelines.
Parameters:
Parameters
----------
n_bins (int): The number of bins to use for numerical feature binning. This parameter is relevant
only if `numerical_preprocessing` is set to 'binning' or 'one_hot'.
numerical_preprocessing (str): The preprocessing strategy for numerical features. Valid options are
@@ -196,17 +197,17 @@ class Preprocessor:
optimal bin edges for numerical feature binning. This parameter is
relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'.
Attributes:
column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured
preprocessing pipelines for the different feature types.
Methods:
fit(X, y=None): Fits the preprocessor to the data, identifying feature types and configuring the
appropriate transformations.
transform(X): Transforms the data using the fitted preprocessing pipelines.
fit_transform(X, y=None): Fits the preprocessor to the data and then transforms the data.
get_feature_info(): Returns information about the processed features, including the number of bins for
binned features and the dimensionality of encoded features.
# Attributes:
# column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured
# preprocessing pipelines for the different feature types.
# Methods:
# fit(X, y=None): Fits the preprocessor to the data, identifying feature types and configuring the
# appropriate transformations.
# transform(X): Transforms the data using the fitted preprocessing pipelines.
# fit_transform(X, y=None): Fits the preprocessor to the data and then transforms the data.
# get_feature_info(): Returns information about the processed features, including the number of bins for
# binned features and the dimensionality of encoded features.
"""

def __init__(
@@ -249,7 +250,8 @@ def _detect_column_types(self, X):
num_unique_values = X[col].nunique()
total_samples = len(X[col])
if X[col].dtype.kind not in "iufc" or (
X[col].dtype.kind == "i" and (num_unique_values / total_samples) < 0.05
X[col].dtype.kind == "i" and (
num_unique_values / total_samples) < 0.05
):
categorical_features.append(col)
else:
@@ -282,7 +284,8 @@ def fit(self, X, y=None):

if self.numerical_preprocessing in ["binning", "one_hot"]:
bins = (
self._get_decision_tree_bins(X[[feature]], y, [feature])
self._get_decision_tree_bins(
X[[feature]], y, [feature])
if self.use_decision_tree_bins
else self.n_bins
)
@@ -297,7 +300,8 @@ def fit(self, X, y=None):
else len(bins) - 1,
encode="ordinal",
strategy=self.binning_strategy,
subsample=200_000 if len(X) > 200_000 else None,
subsample=200_000 if len(
X) > 200_000 else None,
),
),
]
@@ -320,14 +324,17 @@ def fit(self, X, y=None):
)

elif self.numerical_preprocessing == "standardization":
numeric_transformer_steps.append(("scaler", StandardScaler()))
numeric_transformer_steps.append(
("scaler", StandardScaler()))

elif self.numerical_preprocessing == "normalization":
numeric_transformer_steps.append(("normalizer", MinMaxScaler()))
numeric_transformer_steps.append(
("normalizer", MinMaxScaler()))

numeric_transformer = Pipeline(numeric_transformer_steps)

transformers.append((f"num_{feature}", numeric_transformer, [feature]))
transformers.append(
(f"num_{feature}", numeric_transformer, [feature]))

if categorical_features:
for feature in categorical_features:
@@ -374,7 +381,8 @@ def _get_decision_tree_bins(self, X, y, numerical_features):
bin_edges = np.sort(np.unique(thresholds))

bins.append(
np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()]))
np.concatenate(
([X[feature].min()], bin_edges, [X[feature].max()]))
)
return bins

@@ -471,7 +479,8 @@ def get_feature_info(self):
# Handle features processed with discretization
if "discretizer" in steps:
step = transformer_pipeline.named_steps["discretizer"]
n_bins = step.n_bins_[0] if hasattr(step, "n_bins_") else None
n_bins = step.n_bins_[0] if hasattr(
step, "n_bins_") else None

# Check if discretization is followed by one-hot encoding
if "onehot_from_ordinal" in steps:
@@ -492,7 +501,8 @@ def get_feature_info(self):
# Handle features processed with continuous ordinal encoding
elif "continuous_ordinal" in steps:
step = transformer_pipeline.named_steps["continuous_ordinal"]
n_categories = len(step.mapping_[columns.index(feature_name)])
n_categories = len(
step.mapping_[columns.index(feature_name)])
binned_or_ordinal_info[feature_name] = n_categories
print(
f"Categorical Feature (Ordinal Encoded): {feature_name}, Number of unique categories: {n_categories}"
Expand Down

0 comments on commit 123d7f2

Please sign in to comment.