diff --git a/docs/mva/index.rst b/docs/mva/index.rst index 9b76c6ac..332f4967 100644 --- a/docs/mva/index.rst +++ b/docs/mva/index.rst @@ -40,7 +40,7 @@ Reference/API .. automodapi:: protopipe.mva :no-inheritance-diagram: - :skip: auc, roc_curve, train_test_split + :skip: auc, roc_curve, train_test_split, shuffle, save_obj .. _scikit-learn: https://scikit-learn.org/ .. _GridSearchCV: https://scikit-learn.org/stable/modules/grid_search.html diff --git a/protopipe/aux/example_config_files/AdaBoostRegressor.yaml b/protopipe/aux/example_config_files/AdaBoostRegressor.yaml index 02049071..0cae5236 100644 --- a/protopipe/aux/example_config_files/AdaBoostRegressor.yaml +++ b/protopipe/aux/example_config_files/AdaBoostRegressor.yaml @@ -1,18 +1,17 @@ General: - # [...] = your analysis local full path OUTSIDE the Vagrant box - data_dir: '../../data/' - data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5' - outdir: './' - - # List of cameras to use (you can override this from the CLI) - cam_id_list: ['LSTCam', 'NectarCam'] + # [...] = analysis full path (on the host if you are using a container) + data_dir_signal: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_energy_estimation/gamma" + data_sig_file: "TRAINING_energy_tail_gamma_merged.h5" + outdir: "ANALYSES_DIRECTORY/ANALYSIS_NAME/estimators/energy_regressor" + # List of cameras to use (protopipe-MODEL help output for other options) + cam_id_list: [] # If train_fraction is 1, all the TRAINING dataset will be used to train the # model and benchmarking can only be done from the benchmarking notebook # TRAINING/benchmarks_DL2_to_classification.ipynb Split: train_fraction: 0.8 - use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split + use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split # Optimize the hyper-parameters of the estimator with a grid search # If True parameters should be provided as lists @@ -20,17 +19,20 @@ Split: GridSearchCV: use: False # True or False # if False the following two variables are irrelevant - scoring: 'explained_variance' - cv: 2 + scoring: "explained_variance" + cv: 2 # cross-validation splitting strategy + refit: True # Refit the estimator using the best found parameters + verbose: 1 # 1,2,3,4 + njobs: -1 # int or -1 (all processors) Method: - name: 'sklearn.ensemble.AdaBoostRegressor' - target_name: 'true_energy' + name: "sklearn.ensemble.AdaBoostRegressor" + target_name: "true_energy" log_10_target: True # this makes the model use log10(target_name) # Please, see scikit-learn's API for what each parameter means # NOTE: null == None base_estimator: - name: 'sklearn.tree.DecisionTreeRegressor' + name: "sklearn.tree.DecisionTreeRegressor" parameters: # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson" @@ -47,7 +49,7 @@ Method: tuned_parameters: n_estimators: 50 learning_rate: 1 - loss: 'linear' # 'linear', 'square' or 'exponential' + loss: "linear" # 'linear', 'square' or 'exponential' random_state: 0 # int, RandomState instance or None # List of the features to use to train the model @@ -57,12 +59,12 @@ Method: # - if not you can propose modifications to protopipe.mva.utils.prepare_data FeatureList: Basic: # single-named, they need to correspond to input data columns - - 'h_max' # Height of shower maximum from stereoscopic reconstruction - - 'impact_dist' # Impact parameter from stereoscopic reconstruction - - 'hillas_width' # Image Width - - 
'hillas_length' # Image Length - # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel - - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera + - "h_max" # Height of shower maximum from stereoscopic reconstruction + - "impact_dist" # Impact parameter from stereoscopic reconstruction + - "hillas_width" # Image Width + - "hillas_length" # Image Length + - "concentration_pixel" # Percentage of photo-electrons in the brightest pixel + - "leakage_intensity_width_1" # fraction of total Intensity which is contained in the outermost pixels of the camera Derived: # custom evaluations of basic features that will be added to the data # column name : expression to evaluate using basic column names log10_WLS: log10(hillas_width*hillas_length/hillas_intensity) @@ -72,13 +74,13 @@ FeatureList: # These cuts select the input data BEFORE training SigFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" Diagnostic: - # Energy binning (used for reco and true energy) - energy: - nbins: 15 - min: 0.0125 - max: 125 + # Energy binning (used for reco and true energy) + energy: + nbins: 15 + min: 0.0125 + max: 125 diff --git a/protopipe/aux/example_config_files/RandomForestClassifier.yaml b/protopipe/aux/example_config_files/RandomForestClassifier.yaml index 2a4a8a3d..d1da31bb 100644 --- a/protopipe/aux/example_config_files/RandomForestClassifier.yaml +++ b/protopipe/aux/example_config_files/RandomForestClassifier.yaml @@ -1,19 +1,19 @@ General: - # [...] = your analysis local full path OUTSIDE the Vagrant box - data_dir: '../../data/' # '[...]/data/TRAINING/for_particle_classification/' - data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5' - data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5' - outdir: './' # [...]/estimators/gamma_hadron_classifier - + # [...] 
= your analysis full path (on the host if you are using a container) + data_dir_signal: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_particle_classification/gamma" + data_dir_background: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_particle_classification/proton" + data_sig_file: "TRAINING_classification_tail_gamma_merged.h5" + data_bkg_file: "TRAINING_classification_tail_proton_merged.h5" + outdir: "ANALYSES_DIRECTORY/ANALYSIS_NAME/estimators/gamma_hadron_classifier" # List of cameras to use (protopipe-MODEL help output for other options) - cam_id_list: ['LSTCam', 'NectarCam'] + cam_id_list: [] -# If train_fraction is 1, all the TRAINING dataset will be used to train the +# If train_fraction is 1.0, all the TRAINING dataset will be used to train the # model and benchmarking can only be done from the benchmarking notebook # TRAINING/benchmarks_DL2_to_classification.ipynb Split: train_fraction: 0.8 - use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split + use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split # Optimize the hyper-parameters of the estimator with a grid search # If True parameters should be provided as lists (for None use [null]) @@ -21,19 +21,22 @@ Split: GridSearchCV: use: False # True or False # if False the following to variables are irrelevant - scoring: 'roc_auc' - cv: 2 + scoring: "roc_auc" + cv: 2 # cross-validation splitting strategy + refit: True # Refit the estimator using the best found parameters + verbose: 1 # 1,2,3,4 + njobs: -1 # int or -1 (all processors) # Definition of the algorithm/method used and its hyper-parameters Method: - name: 'sklearn.ensemble.RandomForestClassifier' # DO NOT CHANGE - target_name: 'label' # defined between 0 and 1 (DO NOT CHANGE) + name: "sklearn.ensemble.RandomForestClassifier" # DO NOT CHANGE + target_name: "label" # defined between 0 and 1 (DO NOT CHANGE) tuned_parameters: # Please, see scikit-learn's API for what each parameter means # WARNING: null (not a string) == 'None' n_estimators: 100 # integer - criterion: 'gini' # 'gini' or 'entropy' - max_depth: null # null or integer + criterion: "gini" # 'gini' or 'entropy' + max_depth: 20 # null or integer min_samples_split: 2 # integer or float min_samples_leaf: 1 # integer or float min_weight_fraction_leaf: 0.0 # float @@ -49,7 +52,8 @@ Method: class_weight: null # 'balanced', 'balanced_subsample', null, dict or list of dicts ccp_alpha: 0.0 # non-negative float max_samples: null # null, integer or float - calibrate_output: False # If True calibrate model on test data + use_proba: True # If True output is 'gammaness', else 'score' + calibrate_output: False # If True calibrate model on test data # List of the features to use to train the model # You can: @@ -58,11 +62,11 @@ Method: # - if not you can propose modifications to protopipe.mva.utils.prepare_data FeatureList: Basic: # single-named, they need to correspond to input data columns - - 'h_max' # Height of shower maximum from stereoscopic reconstruction - - 'impact_dist' # Impact parameter from stereoscopic reconstruction - - 'hillas_width' # Image Width - - 'hillas_length' # Image Length - # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel + - "h_max" # Height of shower maximum from stereoscopic reconstruction + - "impact_dist" # Impact parameter from stereoscopic reconstruction + - "hillas_width" # Image Width + - "hillas_length" # Image Length + - "concentration_pixel" # Percentage of photo-electrons in 
the brightest pixel Derived: # custom evaluations of basic features that will be added to the data # column name : expression to evaluate using basic column names log10_intensity: log10(hillas_intensity) @@ -71,18 +75,18 @@ FeatureList: # These cuts select the input data BEFORE training SigFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" BkgFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" Diagnostic: - # Energy binning (used for reco and true energy) - energy: - nbins: 4 - min: 0.0125 - max: 200 + # Energy binning (used for reco and true energy) + energy: + nbins: 4 + min: 0.0125 + max: 200 diff --git a/protopipe/aux/example_config_files/RandomForestRegressor.yaml b/protopipe/aux/example_config_files/RandomForestRegressor.yaml index 86a3103d..605e8d50 100644 --- a/protopipe/aux/example_config_files/RandomForestRegressor.yaml +++ b/protopipe/aux/example_config_files/RandomForestRegressor.yaml @@ -1,18 +1,17 @@ General: - # [...] = your analysis local full path OUTSIDE the Vagrant box - data_dir: '../../data/' # '[...]/data/TRAINING/for_energy_estimation/' - data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5' - outdir: './' # '[...]/estimators/energy_regressor' - + # [...] = analysis full path (on the host if you are using a container) + data_dir_signal: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_energy_estimation/gamma" + data_sig_file: "TRAINING_energy_tail_gamma_merged.h5" + outdir: "ANALYSES_DIRECTORY/ANALYSIS_NAME/estimators/energy_regressor" # List of cameras to use (protopipe-MODEL help output for other options) - cam_id_list: ['LSTCam', 'NectarCam'] + cam_id_list: [] # If train_fraction is 1, all the TRAINING dataset will be used to train the # model and benchmarking can only be done from the benchmarking notebook # TRAINING/benchmarks_DL2_to_classification.ipynb Split: train_fraction: 0.8 - use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split + use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split # Optimize the hyper-parameters of the estimator with a grid search # If True parameters should be provided as lists @@ -20,20 +19,23 @@ Split: GridSearchCV: use: False # True or False # if False the following two variables are irrelevant - scoring: 'explained_variance' - cv: 2 + scoring: "explained_variance" + cv: 2 # cross-validation splitting strategy + refit: True # Refit the estimator using the best found parameters + verbose: 1 # 1,2,3,4 + njobs: -1 # int or -1 (all processors) # Definition of the model algorithm/method used and its hyper-parameters Method: - name: 'sklearn.ensemble.RandomForestRegressor' # DO NOT CHANGE - target_name: 'true_energy' + name: "sklearn.ensemble.RandomForestRegressor" # DO NOT CHANGE + target_name: "true_energy" log_10_target: True # this makes the model use log10(target_name) tuned_parameters: # Please, see scikit-learn's API for what each parameter means # NOTE: null == None n_estimators: 50 # integer criterion: "mse" # "mse" or "mae" - max_depth: null # null or integer + max_depth: 20 # null or integer min_samples_split: 5 # integer min_samples_leaf: 5 # integer min_weight_fraction_leaf: 0.0 # float @@ -56,12 +58,12 @@ Method: # - if not you can propose modifications to protopipe.mva.utils.prepare_data FeatureList: Basic: # single-named, 
they need to correspond to input data columns - - 'h_max' # Height of shower maximum from stereoscopic reconstruction - - 'impact_dist' # Impact parameter from stereoscopic reconstruction - - 'hillas_width' # Image Width - - 'hillas_length' # Image Length - # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel - - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera + - "h_max" # Height of shower maximum from stereoscopic reconstruction + - "impact_dist" # Impact parameter from stereoscopic reconstruction + - "hillas_width" # Image Width + - "hillas_length" # Image Length + - "concentration_pixel" # Percentage of photo-electrons in the brightest pixel + - "leakage_intensity_width_1" # fraction of total Intensity which is contained in the outermost pixels of the camera Derived: # custom evaluations of basic features that will be added to the data # column name : expression to evaluate using basic column names log10_WLS: log10(hillas_width*hillas_length/hillas_intensity) @@ -71,9 +73,9 @@ FeatureList: # These cuts select the input data BEFORE training SigFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" # Information used by the benchmarking notebook related to this model Diagnostic: diff --git a/protopipe/mva/io.py b/protopipe/mva/io.py index 963d053a..92bef32f 100644 --- a/protopipe/mva/io.py +++ b/protopipe/mva/io.py @@ -4,7 +4,7 @@ import joblib from os import path -from protopipe.mva.utils import save_obj +from protopipe.pipeline.io import save_obj def initialize_script_arguments(): @@ -48,23 +48,34 @@ def initialize_script_arguments(): # These last CL arguments can overwrite the values from the config group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--cameras_from_config', - action='store_true', - help="Get cameras configuration file (Priority 1)",) - group.add_argument('--cameras_from_file', - action='store_true', - help="Get cameras from input file (Priority 2)",) - group.add_argument('--cam_id_list', - type=str, - default=None, - help="Select cameras like 'LSTCam CHEC' (Priority 3)",) + group.add_argument( + "--cameras_from_config", + action="store_true", + help="Get cameras configuration file (Priority 1)", + ) + group.add_argument( + "--cameras_from_file", + action="store_true", + help="Get cameras from input file (Priority 2)", + ) + group.add_argument( + "--cam_id_list", + type=str, + default=None, + help="Select cameras like 'LSTCam CHEC' (Priority 3)", + ) parser.add_argument( - "-i", - "--indir", + "--indir_signal", type=str, default=None, - help="Directory containing the required input file(s)" + help="Directory containing the required SIGNAL input file(s) (default: read from config file)" + ) + parser.add_argument( + "--indir_background", + type=str, + default=None, + help="Directory containing the required BACKGROUND input file(s) (default: read from config file)" ) parser.add_argument( "--infile_signal", @@ -85,13 +96,7 @@ def initialize_script_arguments(): return args -def save_output(models, - cam_id, - factory, - best_model, - model_types, - method_name, - outdir): +def save_output(models, cam_id, factory, best_model, model_types, method_name, outdir): """Save model and data used to produce it per camera-type. 
Parameters @@ -114,9 +119,7 @@ def save_output(models, models[cam_id] = best_model model_type = [k for k, v in model_types.items() if method_name in v][0] - outname = "{}_{}_{}.pkl.gz".format( - model_type, cam_id, method_name - ) + outname = "{}_{}_{}.pkl.gz".format(model_type, cam_id, method_name) joblib.dump(best_model, path.join(outdir, outname)) # SAVE DATA @@ -124,24 +127,18 @@ def save_output(models, factory.data_scikit, path.join( outdir, - "data_scikit_{}_{}_{}.pkl.gz".format( - model_type, method_name, cam_id - ), + "data_scikit_{}_{}_{}.pkl.gz".format(model_type, method_name, cam_id), ), ) factory.data_train.to_pickle( path.join( outdir, - "data_train_{}_{}_{}.pkl.gz".format( - model_type, method_name, cam_id - ), + "data_train_{}_{}_{}.pkl.gz".format(model_type, method_name, cam_id), ) ) factory.data_test.to_pickle( path.join( outdir, - "data_test_{}_{}_{}.pkl.gz".format( - model_type, method_name, cam_id - ), + "data_test_{}_{}_{}.pkl.gz".format(model_type, method_name, cam_id), ) ) diff --git a/protopipe/mva/train_model.py b/protopipe/mva/train_model.py index fd25c2b0..423ef355 100644 --- a/protopipe/mva/train_model.py +++ b/protopipe/mva/train_model.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from sklearn.model_selection import GridSearchCV from .utils import split_train_test @@ -112,26 +113,47 @@ def split_data( max_events = len(X_test_bkg) else: max_events = len(X_test_sig) - X_test = X_test_sig[0:max_events].append(X_test_bkg[0:max_events]) - y_test = y_test_sig[0:max_events].append(y_test_bkg[0:max_events]) - self.data_test = data_test_sig[0:max_events].append( - data_test_bkg[0:max_events] - ) + + try: + X_test = X_test_sig[0:max_events].append(X_test_bkg[0:max_events]) + y_test = y_test_sig[0:max_events].append(y_test_bkg[0:max_events]) + self.data_test = data_test_sig[0:max_events].append( + data_test_bkg[0:max_events] + ) + except TypeError as e: + if str(e) != "'NoneType' object is unsubscriptable": + raise + else: + X_test = None + y_test = None + self.data_test = None weight = np.ones(len(X_train)) weight_train = weight / sum(weight) - self.data_scikit = { - "X_train": X_train.values, - "X_test": X_test.values, - "y_train": y_train.values, - "y_test": y_test.values, - "w_train": weight_train, - } - - def get_optimal_model(self, init_model, tuned_parameters, scoring, cv): + if X_test is not None: + self.data_scikit = { + "X_train": X_train.values, + "X_test": X_test.values, + "y_train": y_train.values, + "y_test": y_test.values, + "w_train": weight_train, + } + else: + self.data_scikit = { + "X_train": X_train.values, + "X_test": None, + "y_train": y_train.values, + "y_test": None, + "w_train": weight_train, + } + + def get_optimal_model(self, init_model, tuned_parameters, scoring, cv, refit=True, verbose=2, njobs=1): """ - Get optimal hyperparameters and returns best model. + Get optimal hyperparameters for an estimator and return the best model. + + The best parameters are obtained by performing an exhaustive search + over specified parameter values. Parameters ---------- @@ -143,12 +165,27 @@ def get_optimal_model(self, init_model, tuned_parameters, scoring, cv): Estimator cv: int number of split for x-validation + refit: bool, str, or callable, default=False + Refit the estimator using the best found parameters on the whole dataset. + verbose: int + Controls the verbosity: the higher, the more messages. 
+ >1 : the computation time for each fold and parameter candidate is displayed + >2 : the score is also displayed + >3 : the fold and candidate parameter indexes are also displayed together with the starting time of the computation + njobs: int + Number of jobs to run in parallel. -1 means using all processors. + Returns ------- best_estimator: `~sklearn.base.BaseEstimator` Best model """ - model = GridSearchCV(init_model, tuned_parameters, scoring=scoring, cv=cv) + model = GridSearchCV(init_model, + tuned_parameters, + scoring=scoring, + cv=cv, + refit=refit, + verbose=verbose) model.fit( self.data_scikit["X_train"], self.data_scikit["y_train"], @@ -158,11 +195,18 @@ def get_optimal_model(self, init_model, tuned_parameters, scoring, cv): print("Best parameters set found on development set:") for key in model.best_params_.keys(): print(" - {}: {}".format(key, model.best_params_[key])) + print("Grid scores on development set:") means = model.cv_results_["mean_test_score"] stds = model.cv_results_["std_test_score"] - for mean, std, params in zip(means, stds, model.cv_results_["params"]): - print(" - {:.3f}+/-{:.3f} for {}".format(mean, std * 2, params)) + if verbose > 2: + for mean, std, params in zip(means, stds, model.cv_results_["params"]): + print(" - {:.3f}+/-{:.3f} for {}".format(mean, std * 2, params)) + + grid_search_cv_results = pd.DataFrame(model.cv_results_) + if verbose > 3: + print(grid_search_cv_results) best_estimator = model.best_estimator_ + return best_estimator diff --git a/protopipe/mva/utils.py b/protopipe/mva/utils.py index bd20ef93..10aa42d8 100644 --- a/protopipe/mva/utils.py +++ b/protopipe/mva/utils.py @@ -1,21 +1,8 @@ import numpy as np import pandas as pd -import pickle -import gzip from sklearn.metrics import auc, roc_curve from sklearn.model_selection import train_test_split - - -def save_obj(obj, name): - """Save object in binary""" - with gzip.open(name, "wb") as f: - pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) - - -def load_obj(name): - """Load object in binary""" - with gzip.open(name, "rb") as f: - return pickle.load(f) +from sklearn.utils import shuffle def prepare_data(ds, derived_features, cuts, select_data=True, label=None): @@ -50,13 +37,17 @@ def prepare_data(ds, derived_features, cuts, select_data=True, label=None): # This is needed because our reference analysis uses energy as # feature for classification # We should propably support a more elastic choice in the future. - if not all(i in derived_features for i in ["log10_reco_energy", "log10_reco_energy_tel"]): - raise ValueError('log10_reco_energy and log10_reco_energy_tel need to be model features.') + if not all( + i in derived_features + for i in ["log10_reco_energy", "log10_reco_energy_tel"] + ): + raise ValueError( + "log10_reco_energy and log10_reco_energy_tel need to be model features." + ) # Compute derived features and add them to the dataframe for feature_name, feature_expression in derived_features.items(): - ds.eval(f'{feature_name} = {feature_expression}', - inplace=True) + ds.eval(f"{feature_name} = {feature_expression}", inplace=True) if select_data: ds = ds.query(cuts) @@ -76,6 +67,9 @@ def make_cut_list(cuts): def split_train_test(survived_images, train_fraction, feature_name_list, target_name): """Split the data selected for cuts in train and test samples. + If the estimator is a classifier, data is split in a stratified fashion, + using this as the class labels. 
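As a minimal sketch (not part of the patch, and assuming a pandas DataFrame `df` with a binary `label` column as in the classifier case), the stratified split introduced below amounts to:

    from sklearn.model_selection import train_test_split

    train, test = train_test_split(
        df,
        train_size=0.8,          # cfg["Split"]["train_fraction"]
        random_state=0,
        shuffle=True,
        stratify=df["label"],    # keep the signal/background ratio in both samples
    )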
+ Parameters ---------- survived_images: `~pandas.DataFrame` @@ -103,19 +97,39 @@ def split_train_test(survived_images, train_fraction, feature_name_list, target_ Test data indexed by observation ID and event ID. """ - data_train, data_test = train_test_split(survived_images, - train_size=train_fraction, - random_state=0, - shuffle=True) - - y_train = data_train[target_name] - X_train = data_train[feature_name_list] - - y_test = data_test[target_name] - X_test = data_test[feature_name_list] - - data_train = data_train.set_index(["obs_id", "event_id"]) - data_test = data_test.set_index(["obs_id", "event_id"]) + # If the estimator is a classifier, data is split in a stratified fashion, + # using this as the class labels + labels = None + if target_name == "label": + labels = survived_images[target_name] + + if train_fraction != 1.0: + data_train, data_test = train_test_split( + survived_images, + train_size=train_fraction, + random_state=0, + shuffle=True, + stratify=labels, + ) + y_train = data_train[target_name] + X_train = data_train[feature_name_list] + + y_test = data_test[target_name] + X_test = data_test[feature_name_list] + + data_train = data_train.set_index(["obs_id", "event_id"]) + data_test = data_test.set_index(["obs_id", "event_id"]) + else: + # if the user wants to use the whole input dataset + # there is not 'test' data, though we shuffle anyway + data_train = survived_images + shuffle(data_train, random_state=0, n_samples=None) + y_train = data_train[target_name] + X_train = data_train[feature_name_list] + + data_test = None + y_test = None + X_test = None return X_train, X_test, y_train, y_test, data_train, data_test diff --git a/protopipe/pipeline/io.py b/protopipe/pipeline/io.py index 15d9a6a2..ffdc75fc 100644 --- a/protopipe/pipeline/io.py +++ b/protopipe/pipeline/io.py @@ -1,10 +1,35 @@ """Utility functions mainly used in benchmarking notebooks.""" from pathlib import Path +import yaml +import pickle +import gzip +from astropy.table import Table import tables import pandas -from astropy.table import Table +import joblib + + +def load_config(name): + """Load a YAML configuration file. + + Parameters + ---------- + name: str or pathlib.Path + + Returns + ------- + cfg: object + Python object (usually a dictionary). + """ + try: + with open(name, "r") as stream: + cfg = yaml.load(stream, Loader=yaml.FullLoader) + except FileNotFoundError as e: + raise e + + return cfg def get_camera_names(input_directory=None, @@ -93,3 +118,47 @@ def read_TRAINING_per_tel_type_with_images(input_directory=None, table[camera][key] = h5file.get_node(f"/{camera}").col(key) return table + + +def load_models(path, cam_id_list): + """Load the pickled dictionary of model from disk + and fill the model dictionary. + + Parameters + ---------- + path : string + The path where the pre-trained, pickled models are + stored. `path` is assumed to contain a `{cam_id}` keyword + to be replaced by each camera identifier in `cam_id_list` + (or at least a naked `{}`). + cam_id_list : list + List of camera identifiers like telescope ID or camera ID + and the assumed distinguishing feature in the filenames of + the various pickled regressors. + + Returns + ------- + model_dict: dict + Dictionary with `cam_id` as keys and pickled models as values. 
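A hypothetical usage of `load_models` (the file name pattern mirrors the one written by `save_output` in `protopipe/mva/io.py` and is shown for illustration only):

    from protopipe.pipeline.io import load_models

    models = load_models(
        path="estimators/energy_regressor/regressor_{cam_id}_RandomForestRegressor.pkl.gz",
        cam_id_list=["LSTCam", "NectarCam"],
    )
    # models["LSTCam"] is then the unpickled scikit-learn estimator for that camera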
+ """ + + model_dict = {} + for key in cam_id_list: + try: + model_dict[key] = joblib.load(path.format(cam_id=key)) + except IndexError: + model_dict[key] = joblib.load(path.format(key)) + + return model_dict + + +def load_obj(name): + """Load object in binary""" + with gzip.open(name, "rb") as f: + return pickle.load(f) + + +def save_obj(obj, name): + """Save object in binary""" + with gzip.open(name, "wb") as f: + pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) diff --git a/protopipe/scripts/build_model.py b/protopipe/scripts/build_model.py index 7547149f..bd6c9ee1 100755 --- a/protopipe/scripts/build_model.py +++ b/protopipe/scripts/build_model.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - import os from os import path import importlib @@ -9,7 +7,7 @@ from sklearn.metrics import classification_report from sklearn.calibration import CalibratedClassifierCV -from protopipe.pipeline.utils import load_config, get_camera_names +from protopipe.pipeline.io import load_config, get_camera_names from protopipe.mva import TrainModel from protopipe.mva.io import initialize_script_arguments, save_output from protopipe.mva.utils import ( @@ -29,10 +27,10 @@ def main(): # INPUT CONFIGURATION # Import parameters - if args.indir is None: - data_dir = cfg["General"]["data_dir"] + if args.indir_signal is None: + data_dir_signal = cfg["General"]["data_dir_signal"] else: - data_dir = args.indir + data_dir_signal = args.indir_signal if args.outdir is None: outdir = cfg["General"]["outdir"] @@ -47,7 +45,7 @@ def main(): else: data_sig_file = args.infile_signal - filename_sig = path.join(data_dir, data_sig_file) + filename_sig = path.join(data_dir_signal, data_sig_file) print(f"INPUT SIGNAL FILE PATH= {filename_sig}") @@ -58,11 +56,11 @@ def main(): elif args.cameras_from_file: print("GETTING CAMERAS FROM SIGNAL TRAINING FILE") # in the same analysis all particle types are analyzed in the - # same way so we can just use gammas - cam_ids = get_camera_names(filename_sig) + # same way so we can just use signal + cam_ids = get_camera_names(data_dir_signal, data_sig_file) else: print("GETTING CAMERAS FROM CLI") - cam_ids = args.cam_id_lists.split() + cam_ids = args.cam_id_list.split() # The names of the tables inside the HDF5 file are the camera's names table_name = [cam_id for cam_id in cam_ids] @@ -86,6 +84,9 @@ def main(): use_GridSearchCV = cfg["GridSearchCV"]["use"] scoring = cfg["GridSearchCV"]["scoring"] cv = cfg["GridSearchCV"]["cv"] + refit = cfg["GridSearchCV"]["refit"] + grid_search_verbose = cfg["GridSearchCV"]["verbose"] + grid_search_njobs = cfg["GridSearchCV"]["njobs"] # Hyper-parameters of the main model tuned_parameters = cfg["Method"]["tuned_parameters"] @@ -98,7 +99,6 @@ def main(): class_name = model_to_use.split('.')[-1] module = importlib.import_module(module_name) # sklearn.XXX model = getattr(module, class_name) - print(f"Going to use {module_name}.{class_name}...") # Check for any base estimator if main model is a meta-estimator if "base_estimator" in cfg['Method']: @@ -107,7 +107,8 @@ def main(): base_estimator_pars = base_estimator_cfg['parameters'] base_estimator_module_name = '.'.join(base_estimator_name.split('.', 2)[:-1]) base_estimator_class_name = base_estimator_name.split('.')[-1] - base_estimator_module = importlib.import_module(base_estimator_module_name) # sklearn.XXX + base_estimator_module = importlib.import_module( + base_estimator_module_name) # sklearn.XXX base_estimator_model = getattr(base_estimator_module, base_estimator_class_name) initialized_base_estimator = 
base_estimator_model(**base_estimator_pars) print(f"...based on {base_estimator_module_name}.{base_estimator_class_name}") @@ -136,16 +137,18 @@ def main(): elif class_name in model_types["classifier"]: + if args.indir_background is None: + data_dir_background = cfg["General"]["data_dir_background"] + else: + data_dir_background = args.indir_background + # read background file from either config file or CLI if args.infile_background is None: data_bkg_file = cfg["General"]["data_bkg_file"].format(args.mode) else: data_bkg_file = args.infile_background - # filename_sig = path.join(data_dir, data_sig_file) - filename_bkg = path.join(data_dir, data_bkg_file) - - # table_name = [table_name_template + cam_id for cam_id in cam_ids] + filename_bkg = path.join(data_dir_background, data_bkg_file) # Get the selection cuts sig_cuts = make_cut_list(cfg["SigFiducialCuts"]) @@ -158,7 +161,7 @@ def main(): else: raise ValueError("ERROR: not a supported model") - print("### Using {} for model construction".format(model_to_use)) + print(f"Using {module_name}.{class_name} for model construction") print(f"LIST OF CAMERAS TO USE = {cam_ids}") @@ -193,7 +196,8 @@ def main(): # Useful to test the models before using them for DL2 production factory.split_data(data_sig=data_sig, train_fraction=train_fraction) print("Training sample: sig {}".format(len(factory.data_train))) - print("Test sample: sig {}".format(len(factory.data_test))) + if factory.data_test is not None: + print("Test sample: sig {}".format(len(factory.data_test))) else: # if it's not a regressor it's a classifier @@ -217,7 +221,8 @@ def main(): data_sig = data_sig[0:args.max_events] data_bkg = data_bkg[0:args.max_events] - print(f"Going to split {len(data_sig)} SIGNAL images and {len(data_bkg)} BACKGROUND images") + print( + f"Going to split {len(data_sig)} SIGNAL images and {len(data_bkg)} BACKGROUND images") # Initialize the model factory = TrainModel( @@ -247,11 +252,13 @@ def main(): ) if use_GridSearchCV: + print("Going to perform exhaustive cross-validated grid-search over" + " specified parameter values...") # Apply optimization of the hyper-parameters via grid search # and return best model best_model = factory.get_optimal_model( - initialized_model, tuned_parameters, scoring=scoring, cv=cv - ) + initialized_model, tuned_parameters, scoring=scoring, cv=cv, + refit=refit, verbose=grid_search_verbose, njobs=grid_search_njobs) else: # otherwise use directly the initial model best_model = initialized_model diff --git a/protopipe/scripts/model_diagnostic.py b/protopipe/scripts/model_diagnostic.py index 102263f2..5ec9ff40 100755 --- a/protopipe/scripts/model_diagnostic.py +++ b/protopipe/scripts/model_diagnostic.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - import os import pandas as pd import numpy as np @@ -11,7 +9,8 @@ import joblib -from protopipe.pipeline.utils import load_config, save_fig +from protopipe.pipeline.io import load_obj, load_config +from protopipe.pipeline.utils import save_fig from protopipe.mva import ( RegressorDiagnostic, @@ -19,7 +18,6 @@ BoostedDecisionTreeDiagnostic, ) from protopipe.mva.utils import ( - load_obj, get_evt_subarray_model_output, plot_roc_curve, plot_hist, diff --git a/protopipe/scripts/tests/test_AdaBoostRegressor.yaml b/protopipe/scripts/tests/test_AdaBoostRegressor.yaml index 9660df09..eecd62dd 100644 --- a/protopipe/scripts/tests/test_AdaBoostRegressor.yaml +++ b/protopipe/scripts/tests/test_AdaBoostRegressor.yaml @@ -1,20 +1,17 @@ General: - # [...] 
= your analysis local full path OUTSIDE the Vagrant box - # NOTE: not used here since the testing suite needs to work from the CLI - data_dir: '../../data/' - data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5' - outdir: './' - - # List of cameras to use (you can override this from the CLI) - # NOTE: not used here since the testing suite needs to work from the CLI - cam_id_list: ['LSTCam', 'NectarCam'] + # [...] = analysis full path (on the host if you are using a container) + data_dir_signal: "[...]/data/TRAINING/for_energy_estimation/" + data_sig_file: "TRAINING_energy_tail_gamma_merged.h5" + outdir: "[...]/estimators/energy_regressor" + # List of cameras to use (protopipe-MODEL help output for other options) + cam_id_list: ["LSTCam", "NectarCam"] # If train_fraction is 1, all the TRAINING dataset will be used to train the # model and benchmarking can only be done from the benchmarking notebook # TRAINING/benchmarks_DL2_to_classification.ipynb Split: train_fraction: 0.8 - use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split + use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split # Optimize the hyper-parameters of the estimator with a grid search # If True parameters should be provided as lists @@ -22,17 +19,20 @@ Split: GridSearchCV: use: False # True or False # if False the following two variables are irrelevant - scoring: 'explained_variance' - cv: 2 + scoring: "explained_variance" + cv: 2 # cross-validation splitting strategy + refit: True # Refit the estimator using the best found parameters + verbose: 1 # 1,2,3,4 + njobs: -1 # int or -1 (all processors) Method: - name: 'sklearn.ensemble.AdaBoostRegressor' - target_name: 'true_energy' + name: "sklearn.ensemble.AdaBoostRegressor" + target_name: "true_energy" log_10_target: True # this makes the model use log10(target_name) # Please, see scikit-learn's API for what each parameter means # NOTE: null == None base_estimator: - name: 'sklearn.tree.DecisionTreeRegressor' + name: "sklearn.tree.DecisionTreeRegressor" parameters: # NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson" @@ -49,7 +49,7 @@ Method: tuned_parameters: n_estimators: 50 learning_rate: 1 - loss: 'linear' # 'linear', 'square' or 'exponential' + loss: "linear" # 'linear', 'square' or 'exponential' random_state: 0 # int, RandomState instance or None # List of the features to use to train the model @@ -59,12 +59,12 @@ Method: # - if not you can propose modifications to protopipe.mva.utils.prepare_data FeatureList: Basic: # single-named, they need to correspond to input data columns - - 'h_max' # Height of shower maximum from stereoscopic reconstruction - - 'impact_dist' # Impact parameter from stereoscopic reconstruction - - 'hillas_width' # Image Width - - 'hillas_length' # Image Length - # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel - - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera + - "h_max" # Height of shower maximum from stereoscopic reconstruction + - "impact_dist" # Impact parameter from stereoscopic reconstruction + - "hillas_width" # Image Width + - "hillas_length" # Image Length + - "concentration_pixel" # Percentage of photo-electrons in the brightest pixel + - "leakage_intensity_width_1" # fraction of total Intensity which is contained in the outermost pixels of the camera Derived: # custom 
evaluations of basic features that will be added to the data # column name : expression to evaluate using basic column names log10_WLS: log10(hillas_width*hillas_length/hillas_intensity) @@ -74,13 +74,13 @@ FeatureList: # These cuts select the input data BEFORE training SigFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" Diagnostic: - # Energy binning (used for reco and true energy) - energy: - nbins: 15 - min: 0.0125 - max: 125 + # Energy binning (used for reco and true energy) + energy: + nbins: 15 + min: 0.0125 + max: 125 diff --git a/protopipe/scripts/tests/test_RandomForestClassifier.yaml b/protopipe/scripts/tests/test_RandomForestClassifier.yaml index 0f406e38..82a5b895 100644 --- a/protopipe/scripts/tests/test_RandomForestClassifier.yaml +++ b/protopipe/scripts/tests/test_RandomForestClassifier.yaml @@ -1,20 +1,19 @@ General: - # NOTE: not used here since the testing suite needs to work from the CLI - data_dir: '../../data/' - data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5' - data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5' - outdir: './' - + # [...] = your analysis full path (on the host if you are using a container) + data_dir_signal: "[...]/data/TRAINING/for_particle_classification/gamma" + data_dir_background: "[...]/data/TRAINING/for_particle_classification/proton" + data_sig_file: "TRAINING_classification_tail_gamma_merged.h5" + data_bkg_file: "TRAINING_classification_tail_proton_merged.h5" + outdir: "[...]/data/TRAINING/estimators/gamma_hadron_classifier" # List of cameras to use (protopipe-MODEL help output for other options) - # NOTE: not used here since the testing suite needs to work from the CLI - cam_id_list: ['LSTCam', 'NectarCam'] + cam_id_list: ["LSTCam", "NectarCam"] -# If train_fraction is 1, all the TRAINING dataset will be used to train the +# If train_fraction is 1.0, all the TRAINING dataset will be used to train the # model and benchmarking can only be done from the benchmarking notebook # TRAINING/benchmarks_DL2_to_classification.ipynb Split: train_fraction: 0.8 - use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split + use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split # Optimize the hyper-parameters of the estimator with a grid search # If True parameters should be provided as lists (for None use [null]) @@ -22,19 +21,22 @@ Split: GridSearchCV: use: False # True or False # if False the following to variables are irrelevant - scoring: 'roc_auc' - cv: 2 + scoring: "roc_auc" + cv: 2 # cross-validation splitting strategy + refit: True # Refit the estimator using the best found parameters + verbose: 1 # 1,2,3,4 + njobs: -1 # int or -1 (all processors) # Definition of the algorithm/method used and its hyper-parameters Method: - name: 'sklearn.ensemble.RandomForestClassifier' # DO NOT CHANGE - target_name: 'label' # defined between 0 and 1 (DO NOT CHANGE) + name: "sklearn.ensemble.RandomForestClassifier" # DO NOT CHANGE + target_name: "label" # defined between 0 and 1 (DO NOT CHANGE) tuned_parameters: # Please, see scikit-learn's API for what each parameter means # WARNING: null (not a string) == 'None' n_estimators: 100 # integer - criterion: 'gini' # 'gini' or 'entropy' - max_depth: null # null or integer + criterion: "gini" # 'gini' or 'entropy' + max_depth: 20 # null or integer min_samples_split: 2 # integer or float min_samples_leaf: 1 # 
integer or float min_weight_fraction_leaf: 0.0 # float @@ -50,7 +52,8 @@ Method: class_weight: null # 'balanced', 'balanced_subsample', null, dict or list of dicts ccp_alpha: 0.0 # non-negative float max_samples: null # null, integer or float - calibrate_output: False # If True calibrate model on test data + use_proba: True # If True output is 'gammaness', else 'score' + calibrate_output: False # If True calibrate model on test data # List of the features to use to train the model # You can: @@ -59,11 +62,11 @@ Method: # - if not you can propose modifications to protopipe.mva.utils.prepare_data FeatureList: Basic: # single-named, they need to correspond to input data columns - - 'h_max' # Height of shower maximum from stereoscopic reconstruction - - 'impact_dist' # Impact parameter from stereoscopic reconstruction - - 'hillas_width' # Image Width - - 'hillas_length' # Image Length - # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel + - "h_max" # Height of shower maximum from stereoscopic reconstruction + - "impact_dist" # Impact parameter from stereoscopic reconstruction + - "hillas_width" # Image Width + - "hillas_length" # Image Length + - "concentration_pixel" # Percentage of photo-electrons in the brightest pixel Derived: # custom evaluations of basic features that will be added to the data # column name : expression to evaluate using basic column names log10_intensity: log10(hillas_intensity) @@ -72,18 +75,18 @@ FeatureList: # These cuts select the input data BEFORE training SigFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" BkgFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" Diagnostic: - # Energy binning (used for reco and true energy) - energy: - nbins: 4 - min: 0.0125 - max: 200 + # Energy binning (used for reco and true energy) + energy: + nbins: 4 + min: 0.0125 + max: 200 diff --git a/protopipe/scripts/tests/test_RandomForestRegressor.yaml b/protopipe/scripts/tests/test_RandomForestRegressor.yaml index d567e483..9bed874b 100644 --- a/protopipe/scripts/tests/test_RandomForestRegressor.yaml +++ b/protopipe/scripts/tests/test_RandomForestRegressor.yaml @@ -1,19 +1,17 @@ General: - # NOTE: not used here since the testing suite needs to work from the CLI - data_dir: './' - data_sig_file: 'test_TRAINING_energy_{}_gamma_merged.h5' - outdir: './' - + # [...] 
= analysis full path (on the host if you are using a container) + data_dir_signal: "[...]/data/TRAINING/for_energy_estimation/" + data_sig_file: "TRAINING_energy_tail_gamma_merged.h5" + outdir: "[...]/estimators/energy_regressor" # List of cameras to use (protopipe-MODEL help output for other options) - # NOTE: not used here since the testing suite needs to work from the CLI - cam_id_list: ['LSTCam', 'NectarCam'] + cam_id_list: ["LSTCam", "NectarCam"] # If train_fraction is 1, all the TRAINING dataset will be used to train the # model and benchmarking can only be done from the benchmarking notebook # TRAINING/benchmarks_DL2_to_classification.ipynb Split: train_fraction: 0.8 - use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split + use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split # Optimize the hyper-parameters of the estimator with a grid search # If True parameters should be provided as lists @@ -21,20 +19,23 @@ Split: GridSearchCV: use: False # True or False # if False the following two variables are irrelevant - scoring: 'explained_variance' - cv: 2 + scoring: "explained_variance" + cv: 2 # cross-validation splitting strategy + refit: True # Refit the estimator using the best found parameters + verbose: 1 # 1,2,3,4 + njobs: -1 # int or -1 (all processors) # Definition of the model algorithm/method used and its hyper-parameters Method: - name: 'sklearn.ensemble.RandomForestRegressor' # DO NOT CHANGE - target_name: 'true_energy' + name: "sklearn.ensemble.RandomForestRegressor" # DO NOT CHANGE + target_name: "true_energy" log_10_target: True # this makes the model use log10(target_name) tuned_parameters: # Please, see scikit-learn's API for what each parameter means # NOTE: null == None n_estimators: 50 # integer criterion: "mse" # "mse" or "mae" - max_depth: null # null or integer + max_depth: 20 # null or integer min_samples_split: 5 # integer min_samples_leaf: 5 # integer min_weight_fraction_leaf: 0.0 # float @@ -57,12 +58,12 @@ Method: # - if not you can propose modifications to protopipe.mva.utils.prepare_data FeatureList: Basic: # single-named, they need to correspond to input data columns - - 'h_max' # Height of shower maximum from stereoscopic reconstruction - - 'impact_dist' # Impact parameter from stereoscopic reconstruction - - 'hillas_width' # Image Width - - 'hillas_length' # Image Length - # - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel - - 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera + - "h_max" # Height of shower maximum from stereoscopic reconstruction + - "impact_dist" # Impact parameter from stereoscopic reconstruction + - "hillas_width" # Image Width + - "hillas_length" # Image Length + - "concentration_pixel" # Percentage of photo-electrons in the brightest pixel + - "leakage_intensity_width_1" # fraction of total Intensity which is contained in the outermost pixels of the camera Derived: # custom evaluations of basic features that will be added to the data # column name : expression to evaluate using basic column names log10_WLS: log10(hillas_width*hillas_length/hillas_intensity) @@ -72,9 +73,9 @@ FeatureList: # These cuts select the input data BEFORE training SigFiducialCuts: - - 'good_image == 1' - - 'is_valid == True' - - 'hillas_intensity_reco > 0' + - "good_image == 1" + - "is_valid == True" + - "hillas_intensity > 0" # Information used by the benchmarking notebook related to 
this model Diagnostic: diff --git a/protopipe/scripts/write_dl2.py b/protopipe/scripts/write_dl2.py index 8e43075d..370637f9 100755 --- a/protopipe/scripts/write_dl2.py +++ b/protopipe/scripts/write_dl2.py @@ -159,7 +159,11 @@ def main(): estimation_weight_energy = "CTAMARS" classifier_method = cfg["GammaHadronClassifier"]["method_name"] estimation_weight_classification = cfg["GammaHadronClassifier"]["estimation_weight"] - use_proba_for_classifier = cfg["GammaHadronClassifier"]["use_proba"] + try: + use_proba_for_classifier = cfg["GammaHadronClassifier"]["use_proba"] + except KeyError: + # if not specified use "gammaness" as particle type score + use_proba_for_classifier = True if regressor_method in ["None", "none", None]: print(
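Since `cfg` is the plain dictionary returned by `load_config`, the same fallback could also be written more compactly (a sketch, not the code used in the patch):

    # default to the probability output ("gammaness") when the key is absent
    use_proba_for_classifier = cfg["GammaHadronClassifier"].get("use_proba", True)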