Skip to content

Commit

Permalink
updated other models
Browse files Browse the repository at this point in the history
  • Loading branch information
Ibrahim-Ola committed Feb 19, 2025
1 parent 772191e commit 17b0e65
Showing 1 changed file with 340 additions and 0 deletions.
340 changes: 340 additions & 0 deletions snowmodels/utils/_model_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
import datetime
import numpy as np
import pandas as pd
from typing import Dict, Union
from ._conversions import ConvertData, OutOfBoundsError

## Machine Learning Libraries
import category_encoders as ce
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score

# Ask scikit-learn transformers to return pandas DataFrames from transform /
# fit_transform. This is a process-wide setting applied when the module is imported.
set_config(transform_output="pandas")


# Set the seed for reproducibility
# (used as `random_state` for both train_test_split calls in split_data).
seed = 10


def split_data(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    """
    Split a dataset into training (70%), testing (20%) and tuning (10%) sets.

    The target is the 'Snow_Density' column; every other column is kept as a
    feature. Both splits are stratified on 'Snow_Class' and use the module-level
    `seed` as the random state.

    Parameters:
    -----------
    df : pandas DataFrame
        The data to split. Must contain the 'Snow_Density' and 'Snow_Class' columns.

    Returns:
    --------
    A dictionary with the keys 'X_train', 'X_test', 'X_val', 'X_temp',
    'y_train', 'y_test', 'y_val' and 'y_temp'. The '*_temp' entries hold the
    combined training + tuning rows (everything that is not in the test set).
    """

    features = df.drop('Snow_Density', axis=1)
    target = df['Snow_Density']

    # Stage 1: hold out 20% of all rows as the test set.
    X_temp, X_test, y_temp, y_test = train_test_split(
        features,
        target,
        test_size=0.20,
        stratify=df['Snow_Class'],
        random_state=seed,
    )

    # Stage 2: 1/8 of the remaining 80% is 10% of the full data (the tuning set).
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp,
        y_temp,
        test_size=1/8,
        stratify=X_temp['Snow_Class'],
        random_state=seed,
    )

    splits = {}
    splits['X_train'] = X_train
    splits['X_test'] = X_test
    splits['X_val'] = X_val
    splits['X_temp'] = X_temp
    splits['y_train'] = y_train
    splits['y_test'] = y_test
    splits['y_val'] = y_val
    splits['y_temp'] = y_temp
    return splits


# let's create a function that will help us evaluate the results of the model

def evaluate_model(
    y_true: Union[pd.Series, list],
    y_pred: Union[pd.Series, list],
    model_name: str
) -> pd.DataFrame:
    """
    Evaluate a model's predictions using the RMSE, MBE and R2 metrics.

    Parameters:
    -----------
    y_true : pandas Series or list
        The true values of the target variable.
    y_pred : pandas Series or list
        The predicted values of the target variable, in the same order as `y_true`.
    model_name : str
        The name of the model; used as the column name of the result.

    Returns:
    --------
    A pandas DataFrame with a single column named `model_name` and the rows
    'RMSE', 'MBE' and 'RSQ'. MBE is the mean of (prediction - truth), so a
    positive value means the model over-predicts on average.
    """

    RMSE = root_mean_squared_error(y_true=y_true, y_pred=y_pred)

    # Subtract as plain arrays: Python lists do not support `-`, and two Series
    # with different indexes would be aligned by label rather than by position,
    # which would make MBE inconsistent with the positional RMSE / R2 above.
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    MBE = np.mean(errors)

    RSQ = r2_score(y_true=y_true, y_pred=y_pred)

    score_df = pd.DataFrame(
        {model_name: [RMSE, MBE, RSQ]},
        index=['RMSE', 'MBE', 'RSQ'],
    )

    return score_df


def validate_DOY(x: int | float | str | pd.Timestamp | datetime.datetime, origin: int = None) -> int:

"""
Validates or converts an input to a day of the year (DOY).
Accepts integer, float, string, datetime.datetime, or pd.Timestamp inputs.
If the input is an integer, float, or a string of integer it must be between 1 and 366.
If the input is a string, it must be convertible to a valid date.
"""

try:
float_x = float(x)
except:
pass

else:
if float_x.is_integer():
doy = int(float_x)
if doy >= 1 and doy <= 366:
return doy
else:
raise OutOfBoundsError(f"DOY must be between 1 and 366. Got {doy}.")
else:
raise ValueError(f"DOY must be a whole number. Got {x}.")

if isinstance(x, (str, pd.Timestamp, datetime.datetime)):
try:
timestamp = pd.Timestamp(x) if isinstance(x, str) else x

if origin < 1 or origin > 12:
raise OutOfBoundsError(f"Origin must be between 1 and 12. Got {origin}.")

converter=ConvertData()
return converter.date_to_DOY(date=timestamp, origin=origin, algorithm='default')
except ValueError as e:
raise ValueError(f"Could not convert {x} to a valid DOY. {e}")
else:
raise TypeError(f"Input type is not supported. Expected types are int, float, str, datetime.datetime, or pd.Timestamp, got {type(x).__name__}.")


def preprocessing_pipeline(numeric_features: list, categorical_feature: list, scaler=None, encoder=None) -> ColumnTransformer:
    """
    Build a ColumnTransformer that scales numeric features and encodes categorical features.

    Parameters:
    -----------
    numeric_features : list
        Names of the numeric columns to scale.
    categorical_feature : list
        Names of the categorical columns to encode.
    scaler : object
        Scaler applied to the numeric columns. Defaults to None, in which case
        a StandardScaler is used.
    encoder : object
        Encoder applied to the categorical columns. Defaults to None, in which
        case a TargetEncoder(min_samples_leaf=20, smoothing=10) is used.

    Returns:
    --------
    An unfitted ColumnTransformer. Columns not listed in either argument are dropped.
    """

    # Compare against None explicitly: an estimator object may define its own truthiness.
    numeric_step = scaler if scaler is not None else StandardScaler()
    categorical_step = (
        encoder
        if encoder is not None
        else ce.TargetEncoder(min_samples_leaf=20, smoothing=10)
    )

    transformers = [
        ('numeric', Pipeline(steps=[('scale', numeric_step)]), numeric_features),
        ('categorical', Pipeline(steps=[('encoder', categorical_step)]), categorical_feature),
    ]

    return ColumnTransformer(transformers=transformers, remainder='drop')


def compare_multiple_models(preds_df: pd.DataFrame, y_true: str) -> pd.DataFrame:
    """
    Compare several models side by side using the RMSE, MBE and R2 metrics.

    Parameters:
    -----------
    preds_df : pandas DataFrame
        One column with the true target values plus one column of predictions per model.
    y_true : str
        Name of the column in `preds_df` holding the true target values.

    Returns:
    --------
    A pandas DataFrame with one column per model and the metric rows produced
    by `evaluate_model`.
    """

    model_preds = preds_df.drop(y_true, axis=1)
    observed = preds_df[y_true]

    # One single-column score table per model, joined column-wise.
    score_tables = [
        evaluate_model(y_true=observed, y_pred=model_preds[name], model_name=name)
        for name in model_preds.columns
    ]

    return pd.concat(score_tables, axis=1)


def preprocess_data(data, encoder, numeric_features=None):
    """
    Transforms a dataset by encoding the 'Snow_Class' feature and retaining specified numeric features.

    One of two encoding strategies is applied, chosen by the encoder's class name:

    - If the encoder's class is named `OneHotEncoder`, 'Snow_Class' is one-hot encoded and
      one column per category (named after the category) is appended to the numeric features.
    - For any other encoder, the encoder output replaces 'Snow_Class' as a single column.
      In this case the encoder is fitted with the training target (`data['y_train']`).

    In both cases the encoder is fitted on `X_train` only and then applied to `X_val` and `X_test`.

    Parameters
    ----------
    data : dict of pandas.DataFrame
        A dictionary containing the datasets, expected to have keys 'X_train', 'X_val', 'X_test', 'y_train', 'y_val', and 'y_test'.
        Each DataFrame in `X_train`, `X_val`, and `X_test` must include the 'Snow_Class' column.
    encoder : encoder instance
        The encoder to apply to the 'Snow_Class' column. Must provide `fit_transform` and `transform`;
        a one-hot encoder must also expose `categories_` after fitting.
    numeric_features : list of str, optional
        A list of numeric feature column names to retain in each transformed dataset,
        by default ['Elevation', 'Snow_Depth', 'DOWY'].

    Returns
    -------
    dict of pandas.DataFrame
        A dictionary containing the transformed training, validation, and test sets with encoded 'Snow_Class' values:
        - 'X_train_transformed': Transformed training set with selected numeric features and encoded 'Snow_Class'.
        - 'X_val_transformed': Transformed validation set with selected numeric features and encoded 'Snow_Class'.
        - 'X_test_transformed': Transformed test set with selected numeric features and encoded 'Snow_Class'.
        - 'y_train': Original target values for the training set.
        - 'y_test': Original target values for the test set.
        - 'y_val': Original target values for the validation set.

    Examples
    --------
    >>> from sklearn.preprocessing import OneHotEncoder
    >>> encoder = OneHotEncoder()
    >>> data = {
    ...     'X_train': df_train,
    ...     'X_val': df_val,
    ...     'X_test': df_test,
    ...     'y_train': y_train,
    ...     'y_val': y_val,
    ...     'y_test': y_test
    ... }
    >>> transformed_data = preprocess_data(data, encoder, numeric_features=['Elevation', 'Snow_Depth', 'DOWY'])
    >>> transformed_data['X_train_transformed'].head()
    """

    # Build the default per call instead of sharing one mutable list across calls.
    if numeric_features is None:
        numeric_features = ['Elevation', 'Snow_Depth', 'DOWY']

    one_hot = encoder.__class__.__name__ == 'OneHotEncoder'

    # Fit on the training split only; the other splits reuse the fitted encoder.
    X_train_transformed = _encode_snow_class(
        data['X_train'], encoder, numeric_features, one_hot, fit=True, y_fit=data['y_train']
    )
    X_val_transformed = _encode_snow_class(data['X_val'], encoder, numeric_features, one_hot)
    X_test_transformed = _encode_snow_class(data['X_test'], encoder, numeric_features, one_hot)

    return {
        'X_train_transformed': X_train_transformed,
        'X_val_transformed': X_val_transformed,
        'X_test_transformed': X_test_transformed,
        'y_train': data['y_train'],
        'y_test': data['y_test'],
        'y_val': data['y_val']
    }


def _encode_snow_class(frame, encoder, numeric_features, one_hot, fit=False, y_fit=None):
    """Return the numeric columns of `frame` plus its encoded 'Snow_Class' column(s)."""

    numeric = frame.filter(items=numeric_features)

    if not one_hot:
        if fit:
            encoded = encoder.fit_transform(frame.Snow_Class, y_fit)
        else:
            encoded = encoder.transform(frame.Snow_Class)
        return numeric.assign(Snow_Class=encoded)

    raw = frame.Snow_Class.values.reshape(-1, 1)
    encoded = encoder.fit_transform(raw) if fit else encoder.transform(raw)

    # The encoder may return a sparse matrix, an ndarray, or (with pandas output
    # enabled) a DataFrame with its own column labels and index. Reduce it to a
    # dense array first so the category labels and the frame's index are applied
    # positionally instead of triggering a reindex that would yield NaN.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    dummies = pd.DataFrame(
        np.asarray(encoded),
        columns=list(encoder.categories_[0]),
        index=frame.index,
    )

    return numeric.assign(**dummies.to_dict(orient='list'))

0 comments on commit 17b0e65

Please sign in to comment.