feat: add sklearn compatibility with __sklearn_tags__ method

agentfarmx[bot] · agentfarmx[bot] · commit 959e02bd993e · 2025-02-28T22:13:50.000Z
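This commit adds scikit-learn compatibility by implementing
`__sklearn_tags__` in BaseModel and in the EBMModel and
LinearRegressionModel subclasses. It also adds `coef_` and `intercept_`
property accessors to LinearRegressionModel, makes the legacy `trainModel`
helpers build and fit the underlying estimators directly instead of going
through the wrapper classes, and lets the SQL generation utilities accept
either a wrapper object or a raw estimator.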
diff --git a/ml2sql/utils/modelling/models/base_model.py b/ml2sql/utils/modelling/models/base_model.py
@@ -171,4 +171,44 @@ def get_params(self, deep=True):
             return self.params
         if hasattr(self.model, 'get_params'):
             return self.model.get_params(deep)
-        return self.params
+        return self.params
+    
+    def __sklearn_tags__(self):
+        """
+        Return the scikit-learn estimator tags for this wrapper.
+
+        Resolution order: ``self.model.__sklearn_tags__()`` when the wrapped model
+        defines it, else ``self.model._get_tags()``, else a hard-coded default dict.
+
+        Returns
+        -------
+        object
+            Whatever the wrapped model returns, or a dict of default tags.
+        """
+        if self.model is not None and hasattr(self.model, '__sklearn_tags__'):
+            return self.model.__sklearn_tags__()
+        elif self.model is not None and hasattr(self.model, '_get_tags'):
+            # Fallback for older scikit-learn versions without __sklearn_tags__
+            return self.model._get_tags()
+        else:
+            # No model / no tag hook: dict defaults. NOTE(review): sklearn >= 1.6 expects a Tags object - confirm
+            return {
+                'allow_nan': False,
+                'binary_only': False,
+                'multilabel': False,
+                'multioutput': False,
+                'multioutput_only': False,
+                'no_validation': False,
+                'non_deterministic': False,
+                'pairwise': False,
+                'preserves_dtype': [],
+                'poor_score': False,
+                'requires_fit': True,
+                'requires_positive_X': False,
+                'requires_positive_y': False,
+                'requires_y': True,
+                'stateless': False,
+                'X_types': ['2darray'],
+                '_skip_test': False,
+                '_xfail_checks': False
+            }
diff --git a/ml2sql/utils/modelling/models/ebm.py b/ml2sql/utils/modelling/models/ebm.py
@@ -16,6 +16,46 @@ class EBMModel(BaseModel):
     model from the interpret package.
     """
     
+    def __sklearn_tags__(self):
+        """
+        Return the scikit-learn estimator tags for this wrapper.
+
+        Uses ``self.model.__sklearn_tags__()`` if available, else
+        ``self.model._get_tags()``, else a hard-coded default dict.
+        NOTE(review): body duplicates the ``BaseModel`` one - confirm it is needed.
+
+        Returns
+        -------
+        Tags as returned by the wrapped model, or a dict of default tags.
+        """
+        if self.model is not None and hasattr(self.model, '__sklearn_tags__'):
+            return self.model.__sklearn_tags__()
+        elif self.model is not None and hasattr(self.model, '_get_tags'):
+            # Fallback for older scikit-learn versions without __sklearn_tags__
+            return self.model._get_tags()
+        else:
+            # No model / no tag hook: dict defaults. NOTE(review): sklearn >= 1.6 expects a Tags object - confirm
+            return {
+                'allow_nan': False,
+                'binary_only': False,
+                'multilabel': False,
+                'multioutput': False,
+                'multioutput_only': False,
+                'no_validation': False,
+                'non_deterministic': False,
+                'pairwise': False,
+                'preserves_dtype': [],
+                'poor_score': False,
+                'requires_fit': True,
+                'requires_positive_X': False,
+                'requires_positive_y': False,
+                'requires_y': True,
+                'stateless': False,
+                'X_types': ['2darray'],
+                '_skip_test': False,
+                '_xfail_checks': False
+            }
+    
     def train(self, X_train, y_train, model_type):
         """
         Train an Explainable Boosting Machine (EBM) model on the given training data.
@@ -142,7 +182,7 @@ def trainModel(X_train, y_train, params, model_type):
     """
     Legacy function for backward compatibility.
     
-    Creates and trains an EBMModel instance.
+    Creates and trains an EBM model directly without using the EBMModel wrapper.
     
     Parameters
     ----------
@@ -157,11 +197,26 @@ def trainModel(X_train, y_train, params, model_type):
         
     Returns
     -------
-    clf : EBMModel
+    clf : ExplainableBoostingClassifier or ExplainableBoostingRegressor
         Trained EBM model.
     """
-    model = EBMModel(params)
-    return model.train(X_train, y_train, model_type).model
+    if "feature_names" not in params:
+        params["feature_names"] = X_train.columns
+    # NOTE: the default above is written into the caller's ``params`` dict in place.
+    if model_type == "regression":
+        clf = ExplainableBoostingRegressor(**params)
+    elif model_type == "classification":
+        clf = ExplainableBoostingClassifier(**params)
+    else:
+        logger.warning("Only regression or classification available")
+        raise ValueError("Invalid model_type. Must be 'regression' or 'classification'.")
+
+    clf.fit(X_train, y_train)
+    # Call get_params(): interpolating the bare attribute logs the bound method instead.
+    logger.info(f"Model params:\n {clf.get_params()}")
+    logger.info("Trained explainable boosting machine")
+
+    return clf
 
 
 def featureExplanationSave(clf, given_name, file_type):
diff --git a/ml2sql/utils/modelling/models/l_regression.py b/ml2sql/utils/modelling/models/l_regression.py
@@ -13,6 +13,78 @@ class LinearRegressionModel(BaseModel):
     models from the interpret package.
     """
     
+    def __sklearn_tags__(self):
+        """
+        Return the scikit-learn estimator tags for this wrapper.
+
+        Uses ``self.model.__sklearn_tags__()`` if available, else
+        ``self.model._get_tags()``, else a hard-coded default dict.
+        NOTE(review): body duplicates the ``BaseModel`` one - confirm it is needed.
+
+        Returns
+        -------
+        Tags as returned by the wrapped model, or a dict of default tags.
+        """
+        if self.model is not None and hasattr(self.model, '__sklearn_tags__'):
+            return self.model.__sklearn_tags__()
+        elif self.model is not None and hasattr(self.model, '_get_tags'):
+            # Fallback for older scikit-learn versions without __sklearn_tags__
+            return self.model._get_tags()
+        else:
+            # No model / no tag hook: dict defaults. NOTE(review): sklearn >= 1.6 expects a Tags object - confirm
+            return {
+                'allow_nan': False,
+                'binary_only': False,
+                'multilabel': False,
+                'multioutput': False,
+                'multioutput_only': False,
+                'no_validation': False,
+                'non_deterministic': False,
+                'pairwise': False,
+                'preserves_dtype': [],
+                'poor_score': False,
+                'requires_fit': True,
+                'requires_positive_X': False,
+                'requires_positive_y': False,
+                'requires_y': True,
+                'stateless': False,
+                'X_types': ['2darray'],
+                '_skip_test': False,
+                '_xfail_checks': False
+            }
+    
+    @property
+    def coef_(self):
+        """
+        Return ``self.model.sk_model_.coef_`` (coefficients of the wrapped model).
+
+        Raises
+        ------
+        ValueError : if ``self.model`` is None.
+        AttributeError : if the wrapped model has no ``sk_model_`` attribute.
+        """
+        if self.model is None:
+            raise ValueError("Model has not been trained yet.")
+        if not hasattr(self.model, 'sk_model_'):
+            raise AttributeError("Model does not have sk_model_ attribute.")
+        return self.model.sk_model_.coef_
+
+    @property
+    def intercept_(self):
+        """
+        Return ``self.model.sk_model_.intercept_`` (intercept of the wrapped model).
+
+        Raises
+        ------
+        ValueError : if ``self.model`` is None.
+        AttributeError : if the wrapped model has no ``sk_model_`` attribute.
+        """
+        if self.model is None:
+            raise ValueError("Model has not been trained yet.")
+        if not hasattr(self.model, 'sk_model_'):
+            raise AttributeError("Model does not have sk_model_ attribute.")
+        return self.model.sk_model_.intercept_
+    
     def train(self, X_train, y_train, model_type):
         """
         Train a Linear/Logistic Regression model on the given training data.
@@ -133,7 +205,7 @@ def trainModel(X_train, y_train, params, model_type):
     """
     Legacy function for backward compatibility.
     
-    Creates and trains a LinearRegressionModel instance.
+    Creates and trains a Linear/Logistic Regression model directly without using the LinearRegressionModel wrapper.
     
     Parameters
     ----------
@@ -151,8 +223,24 @@ def trainModel(X_train, y_train, params, model_type):
     clf : LinearRegression or LogisticRegression
         Trained model.
     """
-    model = LinearRegressionModel(params)
-    return model.train(X_train, y_train, model_type).model
+    if model_type == "regression":
+        clf = LinearRegression(**params)
+        clf_name = "Linear regression"
+    elif model_type == "classification":
+        clf = LogisticRegression(**params)
+        # Pre-set classes_ before fit. NOTE(review): set() order is arbitrary; confirm fit() does not rely on it
+        clf.classes_ = list(set(y_train))
+        clf_name = "Logistic regression"
+    else:
+        logger.warning("Only regression or classification available")
+        raise ValueError("Invalid model_type. Must be 'regression' or 'classification'.")
+
+    clf.fit(X_train, y_train)
+    # ``kwargs`` is read off the estimator - presumably its constructor keyword arguments; verify
+    logger.info(f"Model non default params:\n {clf.kwargs}")
+    logger.info(f"Trained {clf_name.lower()}")
+
+    return clf
 
 
 def featureExplanationSave(clf, given_name, file_type):
diff --git a/ml2sql/utils/output_scripts/decision_tree_as_code.py b/ml2sql/utils/output_scripts/decision_tree_as_code.py
@@ -13,15 +13,20 @@ def tree_to_sql(tree, file=sys.stdout):
 
     Parameters:
     -----------
-    tree: sklearn decision tree model
+    tree: sklearn decision tree model or DecisionTreeModel
         The decision tree to represent as an SQL function
     file: file object, optional (default=sys.stdout)
         The file to write the output to. If not specified, prints to console.
     """
+    # Check if this is our custom model wrapper
+    if hasattr(tree, 'model'):
+        actual_tree = tree.model
+    else:
+        actual_tree = tree
 
-    tree_ = tree.tree_
+    tree_ = actual_tree.tree_
     feature_name = [
-        tree.feature_names_in_[i] if i != _tree.TREE_UNDEFINED else "undefined!"
+        actual_tree.feature_names_in_[i] if i != _tree.TREE_UNDEFINED else "undefined!"
         for i in tree_.feature
     ]
 
@@ -43,11 +48,11 @@ def recurse(node, depth):
             recurse(tree_.children_right[node], depth + 1)
             print(f"{indent}END", file=file)
         else:
-            if hasattr(tree, "classes_"):
+            if hasattr(actual_tree, "classes_"):
                 class_values = tree_.value[node]
                 samples = tree_.n_node_samples[node]
                 max_value = int(np.max(class_values))
-                predicted_class = tree.classes_[np.argmax(class_values)]
+                predicted_class = actual_tree.classes_[np.argmax(class_values)]
 
                 if np.issubdtype(type(predicted_class), np.integer):
                     print(
@@ -80,4 +85,4 @@ def save_model_and_extras(clf, model_name, post_params):
     logger.info("SQL version of decision tree saved")
 
     # If you want to also print to console, you can call the function again without the file parameter
-    # tree_to_sql(clf)
+    # tree_to_sql(clf)
diff --git a/ml2sql/utils/output_scripts/ebm_as_code.py b/ml2sql/utils/output_scripts/ebm_as_code.py
@@ -840,11 +840,17 @@ def ebm_to_sql(model_name, df, classes, split=True):
 
 
 def save_model_and_extras(ebm, model_name, post_params):
+    # Check if this is our custom model wrapper
+    if hasattr(ebm, 'model'):
+        actual_ebm = ebm.model
+    else:
+        actual_ebm = ebm
+        
     # extract lookup table from EBM
-    lookup_df = extractLookupTable(ebm, post_params)
+    lookup_df = extractLookupTable(actual_ebm, post_params)
     # In case of regression
-    if not hasattr(ebm, "classes_"):
-        ebm.classes_ = [0]
+    if not hasattr(actual_ebm, "classes_"):
+        actual_ebm.classes_ = [0]
         lookup_df["intercept"] = [lookup_df["intercept"]]
 
     # Write printed output to file
@@ -854,7 +860,7 @@ def save_model_and_extras(ebm, model_name, post_params):
     with open(output_path, "w") as f:
         with redirect_stdout(f):
             model_name = Path(model_name).name
-            ebm_to_sql(model_name, lookup_df, ebm.classes_, post_params["sql_split"])
+            ebm_to_sql(model_name, lookup_df, actual_ebm.classes_, post_params["sql_split"])
     logger.info("SQL version of EBM saved")
 
 
@@ -867,4 +873,4 @@ def save_model_and_extras(ebm, model_name, post_params):
         "file_type": "png",
         "sql_decimals": 15,
     }
-    save_model_and_extras(ebm, model_name, post_params)
+    save_model_and_extras(ebm, model_name, post_params)
diff --git a/ml2sql/utils/output_scripts/l_regression_as_code.py b/ml2sql/utils/output_scripts/l_regression_as_code.py
@@ -11,7 +11,7 @@ def extract_parameters(model):
     Extracts model_type, features, coefficients, and intercept from a trained logistic regression model.
 
     Parameters:
-    - trained_model: The trained logistic regression model object.
+    - model: The trained model object, either a direct scikit-learn model or our custom wrapper.
 
     Returns:
     - model_type: String, either regression of classification
@@ -20,30 +20,36 @@ def extract_parameters(model):
     - intercept: Intercept of the logistic regression model.
     """
     try:
+        # Unwrap our custom model wrapper, which exposes the estimator as ``.model``
+        actual_model = getattr(model, "model", model)
+        # interpret-style linear models keep the fitted scikit-learn estimator in
+        # ``sk_model_``; a plain scikit-learn estimator carries coef_/intercept_ itself
+        fitted = getattr(actual_model, "sk_model_", actual_model)
+
         # Extract model type
-        if model.__class__.__name__ == "LinearRegression":
+        if actual_model.__class__.__name__ == "LinearRegression":
             model_type = "regression"
             pclasses = None
-        elif len(model.classes_) > 2:
+        elif len(actual_model.classes_) > 2:
             model_type = "multiclass"
-            pclasses = model.classes_
-        elif len(model.classes_) == 2:
+            pclasses = actual_model.classes_
+        elif len(actual_model.classes_) == 2:
             model_type = "binary"
-            pclasses = model.classes_
+            pclasses = actual_model.classes_
 
         # Extract features
-        features = model.feature_names_in_
+        features = actual_model.feature_names_in_
 
         if model_type == "binary":
-            coefficients = model.sk_model_.coef_[0]
+            coefficients = fitted.coef_[0]
         else:
-            coefficients = model.sk_model_.coef_
+            coefficients = fitted.coef_
 
         # Extract intercept
         if model_type == "binary":
-            intercept = model.sk_model_.intercept_[0]
+            intercept = fitted.intercept_[0]
         else:
-            intercept = model.sk_model_.intercept_
+            intercept = fitted.intercept_
 
         return model_type, pclasses, features, coefficients, intercept
 
@@ -191,4 +197,4 @@ def save_model_and_extras(clf, model_name, post_params):
                 intercept,
                 post_params,
             )
-    logger.info("SQL version of logistic/linear regression saved")
+    logger.info("SQL version of logistic/linear regression saved")