diff --git a/.gitignore b/.gitignore
index 9bac2e8..cdca365 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,4 @@ __pycache__
 docs/build
 .coverage
 poetry.lock
-coverage.xml
+coverage.xml
\ No newline at end of file
diff --git a/README.rst b/README.rst
index 5d56f15..28a4ec1 100644
--- a/README.rst
+++ b/README.rst
@@ -1,22 +1,24 @@
-|Tests|_ |Coverage|_ |ReadTheDocs|_ |PythonVersion|_ |Black|_ |License|_
+|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |PyPI| |Black| |License|
 
 .. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg
-.. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml
+   :target: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml
 
 .. |Coverage| image:: https://codecov.io/gh/GauravPandeyLab/eipy/graph/badge.svg?token=M2AU2XWJB8
-.. _Coverage: https://codecov.io/gh/GauravPandeyLab/eipy
+   :target: https://codecov.io/gh/GauravPandeyLab/eipy
 
 .. |ReadTheDocs| image:: https://readthedocs.org/projects/eipy/badge/?version=latest
-.. _ReadTheDocs: https://eipy.readthedocs.io/en/latest/
+   :target: https://eipy.readthedocs.io/en/latest/
+
+.. |PyPI| image:: https://img.shields.io/pypi/v/ensemble-integration
+   :target: https://pypi.org/project/ensemble-integration/
 
 .. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue
-.. _PythonVersion: https://github.com/GauravPandeyLab/eipy
 
 .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
-.. _Black: https://github.com/psf/black
+   :target: https://github.com/psf/black
 
 .. |License| image:: https://img.shields.io/badge/License-GPLv3-blue
-.. _License: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING
+   :target: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING
 
 ``ensemble-integration``: Integrating multi-modal data for predictive modeling
diff --git a/docs/source/development.rst b/docs/source/development.rst
index f44941f..e543169 100644
--- a/docs/source/development.rst
+++ b/docs/source/development.rst
@@ -1,7 +1,7 @@
 Development
 ===========
 
-We welcome contributions to the development of ``eipy``. To contribute follow the below instructions to submit a pull request:
+We welcome contributions to the development of ``ensemble-integration``. To contribute, follow the instructions below to submit a pull request:
 
 1. **Install Python**. First of all make sure you have a supported version of Python on your local machine (see `GitHub `__ for supported versions).
 2. **Install Poetry**. ``eipy`` uses Poetry to manage dependencies. To install Poetry follow the instructions on their `website `__.
@@ -49,9 +49,9 @@ Note that new test file names must have the prefix `test_`.
 
 9. **Submit pull request**. Updates must be made via a pull request. Internal users should note that pushing to the main branch has been disabled.
 
-10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning `
+10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning `__
 to iterate version numbers in pyproject.toml automatically. You can publish to PyPI by creating
 a new `release `__, which will run the "Publish to PyPI" workflow.
 This workflow determines the PyPI version number from the GitHub release tag, which you should manually iterate.
 
-Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow. 
\ No newline at end of file
+Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
diff --git a/eipy/additional_ensembles.py b/eipy/additional_ensembles.py
index c62a4a9..b803264 100644
--- a/eipy/additional_ensembles.py
+++ b/eipy/additional_ensembles.py
@@ -14,7 +14,18 @@
 
 
 class MeanAggregation(BaseEstimator, ClassifierMixin):
     """
+    Mean Aggregation
+    Trivially takes the mean of X.
+
+    Attributes
+    ----------
+    classes : array
+        Ordered array of unique class labels.
+    X_ : array of (n_samples, n_features)
+        Base predictor data for computing the mean.
+    y_ : array of (n_samples,)
+        True labels of X_.
     """
 
     def __init__(self):
@@ -36,7 +47,18 @@ def predict_proba(self, X):
 
 
 class MedianAggregation(BaseEstimator, ClassifierMixin):
     """
+    Median Aggregation
+    Trivially takes the median of X.
+
+    Attributes
+    ----------
+    classes : array
+        Ordered array of unique class labels.
+    X_ : array of (n_samples, n_features)
+        Base predictor data for computing the median.
+    y_ : array of (n_samples,)
+        True labels of X_.
     """
 
     def __init__(self):
@@ -63,6 +85,28 @@ class CES(BaseEstimator, ClassifierMixin):
     Caruana R. et al. (2006) Getting the most out of ensemble selection.
     In: Sixth International Conference on Data Mining (ICDM'06), 2006 IEEE,
     Piscataway, NJ, USA, pp. 828-833.
+
+    Parameters
+    ----------
+    scoring : callable
+        Scoring function used to evaluate candidate ensembles.
+    max_ensemble_size : int
+        Maximum number of base models to ensemble.
+    random_state : int
+        Random seed for reproducibility.
+    greater_is_better : bool
+        Whether a higher metric score indicates better performance.
+
+    Attributes
+    ----------
+    selected_ensemble : list
+        List of models selected for the ensemble.
+    train_performance : list
+        Record of model performances.
+    argbest : bool
+        True if the metric of interest is to be maximized. Used for model selection.
+    best : bool
+        True if the metric of interest is to be maximized. Used for selecting maximum scorers.
     """
 
     def __init__(
diff --git a/eipy/datasets.py b/eipy/datasets.py
index 7e79315..7bba460 100644
--- a/eipy/datasets.py
+++ b/eipy/datasets.py
@@ -25,7 +25,7 @@ def load_diabetes():
     """
     zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1"
     # Get data path
-    data_path = get_data_home()
+    data_path = _get_data_home()
    folder_ext = "diabetes"
     data_ext_path = join(data_path, folder_ext)
     # check data downloaded before
@@ -66,7 +66,7 @@ def _load_csv(file_path, fn, suffix):
     return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0)
 
 
-def get_data_home(data_home=None):
+def _get_data_home(data_home=None):
     """Return the path of the eipy data directory.
 
     This function is referring from scikit-learn.
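The docstrings above frame MeanAggregation and MedianAggregation as scikit-learn-style classifiers whose predictions reduce to the mean or median of the base predictor probabilities in X. A minimal usage sketch, assuming only the fit/predict_proba interface implied by BaseEstimator and ClassifierMixin (the data values are illustrative):

import numpy as np
from eipy.additional_ensembles import MeanAggregation, MedianAggregation

# Each column holds one base predictor's probability for the positive class.
X = np.array([[0.2, 0.8, 0.5],
              [0.9, 0.7, 0.6],
              [0.1, 0.3, 0.2]])
y = np.array([0, 1, 0])

mean_model = MeanAggregation().fit(X, y)
median_model = MedianAggregation().fit(X, y)

# Per the docstrings, predictions should reduce to row-wise means/medians of X.
print(mean_model.predict_proba(X))
print(median_model.predict_proba(X))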
diff --git a/eipy/ei.py b/eipy/ei.py
index 8bc13fc..a8b45d6 100755
--- a/eipy/ei.py
+++ b/eipy/ei.py
@@ -17,21 +17,21 @@ from joblib import Parallel, delayed
 import warnings
 
 from eipy.utils import (
-    X_is_dict,
-    X_to_numpy,
-    y_to_numpy,
-    set_predictor_seeds,
-    random_integers,
-    sample,
-    retrieve_X_y,
-    append_modality,
-    safe_predict_proba,
+    _X_is_dict,
+    _X_to_numpy,
+    _y_to_numpy,
+    _set_predictor_seeds,
+    _random_integers,
+    _sample,
+    _retrieve_X_y,
+    _append_modality,
+    _safe_predict_proba,
     dummy_cv,
     bar_format,
 )
 from eipy.metrics import (
-    base_summary,
-    ensemble_summary,
+    _base_summary,
+    _ensemble_summary,
 )
 
 warnings.filterwarnings("ignore", category=DeprecationWarning)
@@ -181,7 +181,7 @@ def __init__(
         self.modality_names = []
         self.n_features_per_modality = []
 
-        self.random_numbers_for_samples = random_integers(
+        self.random_numbers_for_samples = _random_integers(
             n_integers=n_samples, seed=self.random_state
         )
         self.feature_names = {}
@@ -210,17 +210,17 @@ def fit_base(self, X, y, base_predictors=None, modality_name=None):
             \n... for ensemble performance analysis..."""
         )
         # convert y to a numpy array
-        y = y_to_numpy(y)
+        y = _y_to_numpy(y)
 
         # check if base_predictors are passed here
         if base_predictors is not None:
             self.base_predictors = base_predictors  # update base predictors
 
         # set random_states in base_predictors
-        set_predictor_seeds(self.base_predictors, self.random_state)
+        _set_predictor_seeds(self.base_predictors, self.random_state)
 
         # check data format and train accordingly
-        if X_is_dict(X):
+        if _X_is_dict(X):
             for modality_name, modality in X.items():
                 self._fit_base(
                     X=modality,
@@ -252,12 +252,12 @@
         if ensemble_predictors is not None:
             self.ensemble_predictors = ensemble_predictors
 
-        set_predictor_seeds(self.ensemble_predictors, self.random_state)
+        _set_predictor_seeds(self.ensemble_predictors, self.random_state)
 
         y_test_combined = []
 
         for fold_id in range(self.k_outer):
-            _, y_test = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
+            _, y_test = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
             y_test_combined.extend(y_test)
 
         ensemble_predictions = {}
@@ -270,17 +270,17 @@
             y_pred_combined = []
 
             for fold_id in range(self.k_outer):
-                X_train, y_train = retrieve_X_y(
+                X_train, y_train = _retrieve_X_y(
                     labelled_data=self.ensemble_training_data[fold_id]
                 )
-                X_test, _ = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
+                X_test, _ = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
 
                 if self.sampling_aggregation == "mean":
                     X_train = X_train.T.groupby(level=[0, 1]).mean().T
                     X_test = X_test.T.groupby(level=[0, 1]).mean().T
 
                 model.fit(X_train, y_train)
-                y_pred = safe_predict_proba(model, X_test)
+                y_pred = _safe_predict_proba(model, X_test)
                 y_pred_combined.extend(y_pred)
 
             ensemble_predictions[model_name] = y_pred_combined
@@ -288,7 +288,7 @@
         ensemble_predictions["labels"] = y_test_combined
 
         self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions)
-        self.ensemble_summary = ensemble_summary(
+        self.ensemble_summary = _ensemble_summary(
             self.ensemble_predictions, self.metrics
         )
@@ -298,7 +298,7 @@
                 desc="Training final ensemble models",
                 bar_format=bar_format,
             ):
-                X_train, y_train = retrieve_X_y(
+                X_train, y_train = _retrieve_X_y(
                     labelled_data=self.ensemble_training_data_final[0]
                 )
@@ -314,7 +314,7 @@ def fit_ensemble(self, ensemble_predictors=None):
     def predict(self, X_dict, ensemble_model_key):
         """
-        Predict class labels for samples in X
+        Predict class labels for samples in X.
 
         Parameters
         ----------
@@ -336,7 +336,7 @@ def predict(self, X_dict, ensemble_model_key):
             modality_name = self.modality_names[i]
             X = X_dict[modality_name]
 
-            X, _ = X_to_numpy(X)
+            X, _ = _X_to_numpy(X)
 
             base_models = copy.deepcopy(self.final_models["base models"][modality_name])
             self.base_predictors = {}
@@ -345,7 +345,7 @@ def predict(self, X_dict, ensemble_model_key):
                 self.base_predictors[base_model_dict["model name"]] = 0
 
                 base_model = pickle.loads(base_model_dict["pickled model"])
-                y_pred = safe_predict_proba(base_model, X)
+                y_pred = _safe_predict_proba(base_model, X)
 
                 base_model_dict["fold id"] = 0
                 base_model_dict["y_pred"] = y_pred
@@ -353,7 +353,7 @@
             combined_predictions = self._combine_predictions_outer(
                 base_models, modality_name, model_building=True
             )
-            ensemble_prediction_data = append_modality(
+            ensemble_prediction_data = _append_modality(
                 ensemble_prediction_data, combined_predictions, model_building=True
             )
         ensemble_prediction_data = ensemble_prediction_data[0]
@@ -367,12 +367,12 @@
             self.final_models["ensemble models"][ensemble_model_key]
         )
 
-        y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data)
+        y_pred = _safe_predict_proba(ensemble_model, ensemble_prediction_data)
         return y_pred
 
     @ignore_warnings(category=ConvergenceWarning)
     def _fit_base(self, X, y, base_predictors=None, modality_name=None):
-        X, feature_names = X_to_numpy(X)
+        X, feature_names = _X_to_numpy(X)
 
         self.modality_names.append(modality_name)
         self.feature_names[modality_name] = feature_names
@@ -387,7 +387,7 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
             modality_name=modality_name,
         )
 
-        self.ensemble_training_data = append_modality(
+        self.ensemble_training_data = _append_modality(
             self.ensemble_training_data, ensemble_training_data_modality
         )
@@ -399,12 +399,12 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
             modality_name=modality_name,
         )
 
-        self.ensemble_test_data = append_modality(
+        self.ensemble_test_data = _append_modality(
             self.ensemble_test_data, ensemble_test_data_modality
         )  # append data to dataframe
 
         # create a summary of base predictor performance
-        self.base_summary = base_summary(self.ensemble_test_data, self.metrics)
+        self.base_summary = _base_summary(self.ensemble_test_data, self.metrics)
 
         if self.model_building:
             self._fit_base_final(X=X, y=y, modality_name=modality_name)
@@ -428,7 +428,7 @@ def _fit_base_final(self, X, y, modality_name=None):
             modality_name=modality_name,
         )
 
-        self.ensemble_training_data_final = append_modality(
+        self.ensemble_training_data_final = _append_modality(
             self.ensemble_training_data_final, ensemble_training_data_modality
         )
@@ -562,7 +562,7 @@ def _train_predict_single_base_predictor(
         X_train, X_test = X[train_index], X[test_index]
         y_train, y_test = y[train_index], y[test_index]
 
-        X_sample, y_sample = sample(
+        X_sample, y_sample = _sample(
             X_train,
             y_train,
             strategy=self.sampling_strategy,
@@ -581,7 +581,7 @@
             }
 
         else:
-            y_pred = safe_predict_proba(model, X_test)
+            y_pred = _safe_predict_proba(model, X_test)
 
             results_dict = {
                 "model name": model_name,
@@ -677,7 +677,6 @@ def save(self, path=None):
 
         Parameters
         ----------
-
         path : optional, default=None
             Path to save the EnsembleIntegration class object.
""" @@ -695,7 +694,6 @@ def load(cls, path): Parameters ---------- - path : str Path to load the EnsembleIntegration class object. """ diff --git a/eipy/interpretation.py b/eipy/interpretation.py index 8b6025a..fc65c9c 100644 --- a/eipy/interpretation.py +++ b/eipy/interpretation.py @@ -1,5 +1,5 @@ from sklearn.inspection import permutation_importance -from eipy.utils import X_to_numpy, retrieve_X_y, bar_format, y_to_numpy +from eipy.utils import _X_to_numpy, _retrieve_X_y, bar_format, _y_to_numpy import pandas as pd from tqdm import tqdm import numpy as np @@ -102,10 +102,10 @@ def rank_product_score(self, X_dict, y): ensemble_predictor_keys = self.ensemble_predictor_keys if self.LFR is None: - self.local_feature_rank(X_dict, y_to_numpy(y)) + self._local_feature_rank(X_dict, _y_to_numpy(y)) if self.LMR is None: - self.local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) + self._local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) print("Calculating combined rank product score...") @@ -151,7 +151,7 @@ def rank_product_score(self, X_dict, y): return self - def local_feature_rank(self, X_dict, y): + def _local_feature_rank(self, X_dict, y): """ Local Feature Ranks (LFRs) for each base predictor @@ -177,7 +177,7 @@ def local_feature_rank(self, X_dict, y): bar_format=bar_format, ): X = X_dict[modality_name] - X, feature_names = X_to_numpy(X) + X, feature_names = _X_to_numpy(X) # check feature names were seen during training if len(self.EI.feature_names[modality_name]) > 1: @@ -285,7 +285,7 @@ def local_feature_rank(self, X_dict, y): return self - def local_model_rank(self, ensemble_predictor_keys): + def _local_model_rank(self, ensemble_predictor_keys): """ Local Model Ranks (LMRs) @@ -302,7 +302,7 @@ def local_model_rank(self, ensemble_predictor_keys): """ # load ensemble training data from EI training - ensemble_X_train, ensemble_y_train = retrieve_X_y( + ensemble_X_train, ensemble_y_train = _retrieve_X_y( labelled_data=self.EI.ensemble_training_data_final[0] ) diff --git a/eipy/metrics.py b/eipy/metrics.py index 3233277..74e84d5 100644 --- a/eipy/metrics.py +++ b/eipy/metrics.py @@ -1,18 +1,39 @@ import numpy as np import pandas as pd import inspect -from eipy.utils import minority_class +from eipy.utils import _minority_class from sklearn.metrics import roc_auc_score, precision_recall_curve def fmax_score(y_test, y_score, beta=1.0, pos_label=1): - fmax_score, _, _, threshold_fmax = fmax_precision_recall_threshold( + """ + Computes the maximum F-score (the harmonic mean of precision and recall) and the corresponding threshold. + + Parameters + ---------- + y_test : array of shape (n_samples,) + Array of test labels. + y_pred : array of shape (n_samples,) + Array of predicted probabilities on test data. + beta : float + Parameter for weighing precision and recall in F score calculations. + pos_label : bool + Class selection for computing F scores. + + Returns + ------- + fmax_score : float64 + Calculated fmax + threshold_fmax : float64 + Threshold corresponding to returned fmax + """ + fmax_score, _, _, threshold_fmax = _fmax_precision_recall_threshold( y_test, y_score, beta=beta, pos_label=pos_label ) return fmax_score, threshold_fmax -def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): +def _fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): """ Radivojac, P. et al. (2013). A Large-Scale Evaluation of Computational Protein Function Prediction. Nature Methods, 10(3), 221-227. 
@@ -44,7 +65,7 @@ def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1):
     return fmax_score, precision_fmax, recall_fmax, threshold_fmax
 
 
-def try_metric_with_pos_label(y_true, y_pred, metric, pos_label):
+def _try_metric_with_pos_label(y_true, y_pred, metric, pos_label):
     """
     Compute score for a given metric.
     """
@@ -55,7 +76,7 @@ def try_metric_with_pos_label(y_true, y_pred, metric, pos_label):
     return score
 
 
-def scores(y_true, y_pred, metrics):
+def _scores(y_true, y_pred, metrics):
     """
     Compute all metrics for a single set of predictions. Returns a dictionary
     containing metric keys, each paired to a tuple (score, threshold).
@@ -65,7 +86,7 @@
     if metrics is None:
         metrics = {"fmax (minority)": fmax_score, "auc": roc_auc_score}
 
-    pos_label = minority_class(y_true)  # gives value 1 or 0
+    pos_label = _minority_class(y_true)  # gives value 1 or 0
 
     metric_threshold_dict = {}
@@ -75,14 +96,14 @@
         if "y_pred" in inspect.signature(metric).parameters:
             # calculate metric for target vector with threshold=0.5
             metric_threshold_dict[metric_key] = (
-                try_metric_with_pos_label(
+                _try_metric_with_pos_label(
                     y_true, (np.array(y_pred) >= 0.5).astype(int), metric, pos_label
                 ),
                 0.5,
             )
         # if y_score parameter exists in metric function then y should be probability vector
         elif "y_score" in inspect.signature(metric).parameters:
-            metric_results = try_metric_with_pos_label(
+            metric_results = _try_metric_with_pos_label(
                 y_true, y_pred, metric, pos_label
             )
             if isinstance(
@@ -95,7 +116,7 @@
 
     return metric_threshold_dict
 
 
-def scores_matrix(X, labels, metrics):
+def _scores_matrix(X, labels, metrics):
     """
     Calculate metrics and threshold (if applicable) for each column
     (set of predictions) in matrix X
@@ -104,7 +125,7 @@
     """
     scores_dict = {}
     for column in X.columns:
         column_temp = X[column]
-        metrics_per_column = scores(labels, column_temp, metrics)
+        metrics_per_column = _scores(labels, column_temp, metrics)
         # metric_names = list(metrics.keys())
         for metric_key in metrics_per_column.keys():
             if not (metric_key in scores_dict):
@@ -115,13 +136,13 @@
 
     return scores_dict
 
 
-def create_metric_threshold_dataframes(X, labels, metrics):
+def _create_metric_threshold_dataframes(X, labels, metrics):
     """
     Create a separate dataframe for metrics and thresholds.
     thresholds_df contains NaN if threshold not applicable.
""" - scores_dict = scores_matrix(X, labels, metrics) + scores_dict = _scores_matrix(X, labels, metrics) metrics_df = pd.DataFrame(columns=X.columns) thresholds_df = pd.DataFrame(columns=X.columns) @@ -130,15 +151,15 @@ def create_metric_threshold_dataframes(X, labels, metrics): return metrics_df, thresholds_df -def create_metric_threshold_dict(X, labels, metrics): +def _create_metric_threshold_dict(X, labels, metrics): df_dict = {} - df_dict["metrics"], df_dict["thresholds"] = create_metric_threshold_dataframes( + df_dict["metrics"], df_dict["thresholds"] = _create_metric_threshold_dataframes( X, labels, metrics ) return df_dict -def base_summary(ensemble_test_dataframes, metrics): +def _base_summary(ensemble_test_dataframes, metrics): """ Create a base predictor performance summary by concatenating data across test folds """ @@ -149,13 +170,13 @@ def base_summary(ensemble_test_dataframes, metrics): for df in ensemble_test_dataframes ] ) - return create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) + return _create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) -def ensemble_summary(ensemble_predictions, metrics): +def _ensemble_summary(ensemble_predictions, metrics): X = ensemble_predictions.drop(["labels"], axis=1) labels = ensemble_predictions["labels"] - return create_metric_threshold_dict(X, labels, metrics) + return _create_metric_threshold_dict(X, labels, metrics) # These two functions are an attempt at maximizing/minimizing any metric diff --git a/eipy/utils.py b/eipy/utils.py index f81bbff..31ba545 100755 --- a/eipy/utils.py +++ b/eipy/utils.py @@ -15,7 +15,7 @@ bar_format = "{desc}: |{bar}|{percentage:3.0f}%" -def minority_class(y_true): +def _minority_class(y_true): if np.bincount(y_true)[0] < np.bincount(y_true)[1]: minority_class = 0 else: @@ -23,7 +23,7 @@ def minority_class(y_true): return minority_class -def set_predictor_seeds(base_predictors, random_state): +def _set_predictor_seeds(base_predictors, random_state): for _, v in base_predictors.items(): if type(v) == Pipeline: est_ = list(v.named_steps)[-1] @@ -33,25 +33,25 @@ def set_predictor_seeds(base_predictors, random_state): v.set_params(**{"random_state": random_state}) -def X_is_dict(X): +def _X_is_dict(X): if isinstance(X, dict): return True else: return False -def X_dict_to_numpy(X_dict): +def _X_dict_to_numpy(X_dict): """ Retrieve feature names and convert arrays to numpy. """ X_dict_numpy = {} feature_names = {} for key, X in X_dict.items(): - X_dict_numpy[key], feature_names[key] = X_to_numpy(X) + X_dict_numpy[key], feature_names[key] = _X_to_numpy(X) return X_dict_numpy, feature_names -def X_to_numpy(X): +def _X_to_numpy(X): """ Return X as a numpy array, with feature names if applicable. """ @@ -66,7 +66,7 @@ def X_to_numpy(X): ) -def y_to_numpy(y): +def _y_to_numpy(y): """ Check y is numpy array and convert if not. 
""" @@ -85,13 +85,13 @@ def y_to_numpy(y): or pandas Series.""" ) - if not is_binary_array(_y): + if not _is_binary_array(_y): raise ValueError("y must contain binary values.") return _y -def is_binary_array(arr): +def _is_binary_array(arr): if all(x == 0 or x == 1 or x == 0.0 or x == 1.0 for x in arr): return True else: @@ -110,7 +110,7 @@ def get_n_splits(self, X, y, groups=None): return self.n_splits -def safe_predict_proba(model, X): # uses predict_proba method where possible +def _safe_predict_proba(model, X): # uses predict_proba method where possible if hasattr(model, "predict_proba"): y_pred = model.predict_proba(X)[:, 1] else: @@ -118,12 +118,12 @@ def safe_predict_proba(model, X): # uses predict_proba method where possible return y_pred -def random_integers(n_integers=1, seed=42): +def _random_integers(n_integers=1, seed=42): random.seed(seed) return random.sample(range(0, 10000), n_integers) -def sample(X, y, strategy, random_state): +def _sample(X, y, strategy, random_state): if strategy is None: X_resampled, y_resampled = X, y elif strategy == "undersampling": # define sampler @@ -161,13 +161,13 @@ def sample(X, y, strategy, random_state): return X_resampled, y_resampled -def retrieve_X_y(labelled_data): +def _retrieve_X_y(labelled_data): X = labelled_data.drop(columns=["labels"], level=0) y = np.ravel(labelled_data["labels"]) return X, y -def append_modality(current_data, modality_data, model_building=False): +def _append_modality(current_data, modality_data, model_building=False): if current_data is None: combined_dataframe = modality_data else: