Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve protopipe.mva and protopipe-MODELS #164

Merged
merged 22 commits into from
Feb 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/mva/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ Reference/API

.. automodapi:: protopipe.mva
:no-inheritance-diagram:
:skip: auc, roc_curve, train_test_split
:skip: auc, roc_curve, train_test_split, shuffle, save_obj

.. _scikit-learn: https://scikit-learn.org/
.. _GridSearchCV: https://scikit-learn.org/stable/modules/grid_search.html
Expand Down
58 changes: 30 additions & 28 deletions protopipe/aux/example_config_files/AdaBoostRegressor.yaml
Original file line number Diff line number Diff line change
@@ -1,36 +1,38 @@
General:
# [...] = your analysis local full path OUTSIDE the Vagrant box
data_dir: '../../data/'
data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
outdir: './'

# List of cameras to use (you can override this from the CLI)
cam_id_list: ['LSTCam', 'NectarCam']
# [...] = analysis full path (on the host if you are using a container)
data_dir_signal: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_energy_estimation/gamma"
data_sig_file: "TRAINING_energy_tail_gamma_merged.h5"
outdir: "ANALYSES_DIRECTORY/ANALYSIS_NAME/estimators/energy_regressor"
# List of cameras to use (protopipe-MODEL help output for other options)
cam_id_list: []

# If train_fraction is 1, all the TRAINING dataset will be used to train the
# model and benchmarking can only be done from the benchmarking notebook
# TRAINING/benchmarks_DL2_to_classification.ipynb
Split:
train_fraction: 0.8
use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split

# Optimize the hyper-parameters of the estimator with a grid search
# If True parameters should be provided as lists
# If False the model used will be the one based on the chosen single-valued hyper-parameters
GridSearchCV:
use: False # True or False
# if False the following two variables are irrelevant
scoring: 'explained_variance'
cv: 2
scoring: "explained_variance"
cv: 2 # cross-validation splitting strategy
refit: True # Refit the estimator using the best found parameters
verbose: 1 # 1,2,3,4
njobs: -1 # int or -1 (all processors)

Method:
name: 'sklearn.ensemble.AdaBoostRegressor'
target_name: 'true_energy'
name: "sklearn.ensemble.AdaBoostRegressor"
target_name: "true_energy"
log_10_target: True # this makes the model use log10(target_name)
# Please, see scikit-learn's API for what each parameter means
# NOTE: null == None
base_estimator:
name: 'sklearn.tree.DecisionTreeRegressor'
name: "sklearn.tree.DecisionTreeRegressor"
parameters:
# NOTE: here we set the parameters relevant for sklearn.tree.DecisionTreeRegressor
criterion: "mse" # "mse", "friedman_mse", "mae" or "poisson"
Expand All @@ -47,7 +49,7 @@ Method:
tuned_parameters:
n_estimators: 50
learning_rate: 1
loss: 'linear' # 'linear', 'square' or 'exponential'
loss: "linear" # 'linear', 'square' or 'exponential'
random_state: 0 # int, RandomState instance or None

# List of the features to use to train the model
Expand All @@ -57,12 +59,12 @@ Method:
# - if not you can propose modifications to protopipe.mva.utils.prepare_data
FeatureList:
Basic: # single-named, they need to correspond to input data columns
- 'h_max' # Height of shower maximum from stereoscopic reconstruction
- 'impact_dist' # Impact parameter from stereoscopic reconstruction
- 'hillas_width' # Image Width
- 'hillas_length' # Image Length
# - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
- 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
- "h_max" # Height of shower maximum from stereoscopic reconstruction
- "impact_dist" # Impact parameter from stereoscopic reconstruction
- "hillas_width" # Image Width
- "hillas_length" # Image Length
- "concentration_pixel" # Percentage of photo-electrons in the brightest pixel
- "leakage_intensity_width_1" # fraction of total Intensity which is contained in the outermost pixels of the camera
Derived: # custom evaluations of basic features that will be added to the data
# column name : expression to evaluate using basic column names
log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
Expand All @@ -72,13 +74,13 @@ FeatureList:

# These cuts select the input data BEFORE training
SigFiducialCuts:
- 'good_image == 1'
- 'is_valid == True'
- 'hillas_intensity_reco > 0'
- "good_image == 1"
- "is_valid == True"
- "hillas_intensity > 0"

Diagnostic:
# Energy binning (used for reco and true energy)
energy:
nbins: 15
min: 0.0125
max: 125
# Energy binning (used for reco and true energy)
energy:
nbins: 15
min: 0.0125
max: 125
68 changes: 36 additions & 32 deletions protopipe/aux/example_config_files/RandomForestClassifier.yaml
Original file line number Diff line number Diff line change
@@ -1,39 +1,42 @@
General:
# [...] = your analysis local full path OUTSIDE the Vagrant box
data_dir: '../../data/' # '[...]/data/TRAINING/for_particle_classification/'
data_sig_file: 'TRAINING_classification_tail_gamma_merged.h5'
data_bkg_file: 'TRAINING_classification_tail_proton_merged.h5'
outdir: './' # [...]/estimators/gamma_hadron_classifier

# [...] = your analysis full path (on the host if you are using a container)
data_dir_signal: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_particle_classification/gamma"
data_dir_background: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_particle_classification/proton"
data_sig_file: "TRAINING_classification_tail_gamma_merged.h5"
data_bkg_file: "TRAINING_classification_tail_proton_merged.h5"
outdir: "ANALYSES_DIRECTORY/ANALYSIS_NAME/estimators/gamma_hadron_classifier"
# List of cameras to use (protopipe-MODEL help output for other options)
cam_id_list: ['LSTCam', 'NectarCam']
cam_id_list: []

# If train_fraction is 1, all the TRAINING dataset will be used to train the
# If train_fraction is 1.0, all the TRAINING dataset will be used to train the
# model and benchmarking can only be done from the benchmarking notebook
# TRAINING/benchmarks_DL2_to_classification.ipynb
Split:
train_fraction: 0.8
use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split

# Optimize the hyper-parameters of the estimator with a grid search
# If True parameters should be provided as lists (for None use [null])
# If False the model used will be the one based on the chosen single-valued hyper-parameters
GridSearchCV:
use: False # True or False
  # if False the following two variables are irrelevant
scoring: 'roc_auc'
cv: 2
scoring: "roc_auc"
cv: 2 # cross-validation splitting strategy
refit: True # Refit the estimator using the best found parameters
verbose: 1 # 1,2,3,4
njobs: -1 # int or -1 (all processors)

# Definition of the algorithm/method used and its hyper-parameters
Method:
name: 'sklearn.ensemble.RandomForestClassifier' # DO NOT CHANGE
target_name: 'label' # defined between 0 and 1 (DO NOT CHANGE)
name: "sklearn.ensemble.RandomForestClassifier" # DO NOT CHANGE
target_name: "label" # defined between 0 and 1 (DO NOT CHANGE)
tuned_parameters:
# Please, see scikit-learn's API for what each parameter means
# WARNING: null (not a string) == 'None'
n_estimators: 100 # integer
criterion: 'gini' # 'gini' or 'entropy'
max_depth: null # null or integer
criterion: "gini" # 'gini' or 'entropy'
max_depth: 20 # null or integer
min_samples_split: 2 # integer or float
min_samples_leaf: 1 # integer or float
min_weight_fraction_leaf: 0.0 # float
Expand All @@ -49,7 +52,8 @@ Method:
class_weight: null # 'balanced', 'balanced_subsample', null, dict or list of dicts
ccp_alpha: 0.0 # non-negative float
max_samples: null # null, integer or float
calibrate_output: False # If True calibrate model on test data
use_proba: True # If True output is 'gammaness', else 'score'
calibrate_output: False # If True calibrate model on test data

# List of the features to use to train the model
# You can:
Expand All @@ -58,11 +62,11 @@ Method:
# - if not you can propose modifications to protopipe.mva.utils.prepare_data
FeatureList:
Basic: # single-named, they need to correspond to input data columns
- 'h_max' # Height of shower maximum from stereoscopic reconstruction
- 'impact_dist' # Impact parameter from stereoscopic reconstruction
- 'hillas_width' # Image Width
- 'hillas_length' # Image Length
# - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
- "h_max" # Height of shower maximum from stereoscopic reconstruction
- "impact_dist" # Impact parameter from stereoscopic reconstruction
- "hillas_width" # Image Width
- "hillas_length" # Image Length
- "concentration_pixel" # Percentage of photo-electrons in the brightest pixel
Derived: # custom evaluations of basic features that will be added to the data
# column name : expression to evaluate using basic column names
log10_intensity: log10(hillas_intensity)
Expand All @@ -71,18 +75,18 @@ FeatureList:

# These cuts select the input data BEFORE training
SigFiducialCuts:
- 'good_image == 1'
- 'is_valid == True'
- 'hillas_intensity_reco > 0'
- "good_image == 1"
- "is_valid == True"
- "hillas_intensity > 0"

BkgFiducialCuts:
- 'good_image == 1'
- 'is_valid == True'
- 'hillas_intensity_reco > 0'
- "good_image == 1"
- "is_valid == True"
- "hillas_intensity > 0"

Diagnostic:
# Energy binning (used for reco and true energy)
energy:
nbins: 4
min: 0.0125
max: 200
# Energy binning (used for reco and true energy)
energy:
nbins: 4
min: 0.0125
max: 200
44 changes: 23 additions & 21 deletions protopipe/aux/example_config_files/RandomForestRegressor.yaml
Original file line number Diff line number Diff line change
@@ -1,39 +1,41 @@
General:
# [...] = your analysis local full path OUTSIDE the Vagrant box
data_dir: '../../data/' # '[...]/data/TRAINING/for_energy_estimation/'
data_sig_file: 'TRAINING_energy_tail_gamma_merged.h5'
outdir: './' # '[...]/estimators/energy_regressor'

# [...] = analysis full path (on the host if you are using a container)
data_dir_signal: "ANALYSES_DIRECTORY/ANALYSIS_NAME/data/TRAINING/for_energy_estimation/gamma"
data_sig_file: "TRAINING_energy_tail_gamma_merged.h5"
outdir: "ANALYSES_DIRECTORY/ANALYSIS_NAME/estimators/energy_regressor"
# List of cameras to use (protopipe-MODEL help output for other options)
cam_id_list: ['LSTCam', 'NectarCam']
cam_id_list: []

# If train_fraction is 1, all the TRAINING dataset will be used to train the
# model and benchmarking can only be done from the benchmarking notebook
# TRAINING/benchmarks_DL2_to_classification.ipynb
Split:
train_fraction: 0.8
use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split
use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split

# Optimize the hyper-parameters of the estimator with a grid search
# If True parameters should be provided as lists
# If False the model used will be the one based on the chosen single-valued hyper-parameters
GridSearchCV:
use: False # True or False
# if False the following two variables are irrelevant
scoring: 'explained_variance'
cv: 2
scoring: "explained_variance"
cv: 2 # cross-validation splitting strategy
refit: True # Refit the estimator using the best found parameters
verbose: 1 # 1,2,3,4
njobs: -1 # int or -1 (all processors)

# Definition of the model algorithm/method used and its hyper-parameters
Method:
name: 'sklearn.ensemble.RandomForestRegressor' # DO NOT CHANGE
target_name: 'true_energy'
name: "sklearn.ensemble.RandomForestRegressor" # DO NOT CHANGE
target_name: "true_energy"
log_10_target: True # this makes the model use log10(target_name)
tuned_parameters:
# Please, see scikit-learn's API for what each parameter means
# NOTE: null == None
n_estimators: 50 # integer
criterion: "mse" # "mse" or "mae"
max_depth: null # null or integer
max_depth: 20 # null or integer
min_samples_split: 5 # integer
min_samples_leaf: 5 # integer
min_weight_fraction_leaf: 0.0 # float
Expand All @@ -56,12 +58,12 @@ Method:
# - if not you can propose modifications to protopipe.mva.utils.prepare_data
FeatureList:
Basic: # single-named, they need to correspond to input data columns
- 'h_max' # Height of shower maximum from stereoscopic reconstruction
- 'impact_dist' # Impact parameter from stereoscopic reconstruction
- 'hillas_width' # Image Width
- 'hillas_length' # Image Length
# - 'concentration_pixel' # Percentage of photo-electrons in the brightest pixel
- 'leakage_intensity_width_1_reco' # fraction of total Intensity which is contained in the outermost pixels of the camera
- "h_max" # Height of shower maximum from stereoscopic reconstruction
- "impact_dist" # Impact parameter from stereoscopic reconstruction
- "hillas_width" # Image Width
- "hillas_length" # Image Length
- "concentration_pixel" # Percentage of photo-electrons in the brightest pixel
- "leakage_intensity_width_1" # fraction of total Intensity which is contained in the outermost pixels of the camera
Derived: # custom evaluations of basic features that will be added to the data
# column name : expression to evaluate using basic column names
log10_WLS: log10(hillas_width*hillas_length/hillas_intensity)
Expand All @@ -71,9 +73,9 @@ FeatureList:

# These cuts select the input data BEFORE training
SigFiducialCuts:
- 'good_image == 1'
- 'is_valid == True'
- 'hillas_intensity_reco > 0'
- "good_image == 1"
- "is_valid == True"
- "hillas_intensity > 0"

# Information used by the benchmarking notebook related to this model
Diagnostic:
Expand Down
Loading