Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve models generation #96

Merged
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0efd817
Refactor model script I/O
HealthyPear Jan 25, 2021
71f59ea
Improve input features and add missing CTAMARS ones
HealthyPear Jan 25, 2021
e5b64bb
Started improving model script (see complete commit message for details)
HealthyPear Jan 25, 2021
2f045ab
Update from master and solve conflicts
HealthyPear Apr 12, 2021
8819bde
clarify CLI help
HealthyPear Apr 12, 2021
e070b06
small format changes to protopipe.mva.utils.prepare_data
HealthyPear Apr 12, 2021
b554334
simplify a condition in TrainModel
HealthyPear Apr 12, 2021
07fcb9e
Test improvement of models initialization
HealthyPear Apr 12, 2021
03f8c97
allow fit of single model (no GridSearchCV)
HealthyPear Apr 12, 2021
22b264c
small formatting change
HealthyPear Apr 12, 2021
de38037
Add example configuration file for RandomForestRegressor
HealthyPear Apr 12, 2021
fb26a1c
Add example configuration file for RandomForestClassifier
HealthyPear Apr 12, 2021
a55c861
fix input signal file name key
HealthyPear Apr 12, 2021
1ab50d7
Add testing files for RandomForestClassifier and RandomForestRegressor
HealthyPear Apr 14, 2021
9fb476f
Add test configuration file for AdaBoostRegressor (replaces regressor)
HealthyPear Apr 14, 2021
861e61b
Add AdaBoostRegressor configuration file
HealthyPear Apr 14, 2021
52104a5
Update model output
HealthyPear Apr 14, 2021
beb5562
Update example config files for RandomForest-based algorithms
HealthyPear Apr 14, 2021
e3fecb5
Improve protopipe.mva.utils.prepare_data
HealthyPear Apr 14, 2021
608d0f5
Improve and simplify protopipe-MODEL
HealthyPear Apr 14, 2021
7fad104
Modify protopipe-TRAINING according to new version of protopipe-MODEL
HealthyPear Apr 14, 2021
dc92517
Modify protopipe-DL2 according to modification to protopipe-MODEL
HealthyPear Apr 14, 2021
b5539e4
Update test configuration files
HealthyPear Apr 14, 2021
44a9317
Update test pipeline
HealthyPear Apr 14, 2021
2e01972
Remove obsolete MVA example/test configuration files
HealthyPear Apr 14, 2021
c76eee6
Update documentation
HealthyPear Apr 14, 2021
a68cab6
Rename some regressor features
HealthyPear Apr 15, 2021
45bc7d9
Remove code leftovers from DL2 script
HealthyPear Apr 15, 2021
47c522b
Fix check for classification features
HealthyPear Apr 15, 2021
f589fc8
Improve check for model type
HealthyPear Apr 15, 2021
8411fe7
Remove old test configuration files for regressor and classifier
HealthyPear Apr 15, 2021
44368ec
Fix comment/description in configuration files
HealthyPear Apr 15, 2021
95cff80
Fix names of energy-releated features
HealthyPear Apr 15, 2021
d10dac3
Check if label is explicitly None because it can be also 0
HealthyPear Apr 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion protopipe/mva/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Classes to buil models based on machine learning methods.
Classes to build models based on machine learning methods.
"""
from .train_model import *
from .diagnostic import *
from .utils import *
from .io import *
127 changes: 127 additions & 0 deletions protopipe/mva/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Input functions for a model initialization."""

import argparse
import joblib
from os import path

from protopipe.mva.utils import save_obj


def initialize_script_arguments(argv=None):
    """Initialize the parser of protopipe.scripts.build_model.

    Parameters
    ----------
    argv : list of str, optional
        Command-line tokens to parse. When None (default) argparse falls
        back to ``sys.argv[1:]``, i.e. the behavior of a call without
        arguments. Passing an explicit list makes it possible to drive the
        parser from tests or from other Python code.

    Returns
    -------
    args : argparse.Namespace
        Populated argparse namespace.

    Raises
    ------
    SystemExit
        Raised by argparse when the command line is invalid (e.g.
        ``--config_file`` is missing or none of the camera-selection
        options is given).
    """

    parser = argparse.ArgumentParser(
        description="Build model for regression/classification"
    )
    parser.add_argument("--config_file", type=str, required=True)

    parser.add_argument(
        "--max_events",
        type=int,
        default=None,
        help="maximum number of events to use",
    )

    # Both flags write to the same destination ("mode"); the default is
    # declared on the first one and applies when neither flag is given.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--wave",
        dest="mode",
        action="store_const",
        const="wave",
        default="tail",
        help="if set, use wavelet cleaning",
    )
    mode_group.add_argument(
        "--tail",
        dest="mode",
        action="store_const",
        const="tail",
        help="if set, use tail cleaning (default), otherwise wavelets",
    )

    # These last CL arguments can overwrite the values from the config

    # Exactly one way of selecting the cameras has to be chosen.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--cameras_from_config",
        action="store_true",
        help="Get cameras configuration file (Priority 1)",
    )
    group.add_argument(
        "--cameras_from_file",
        action="store_true",
        help="Get cameras from input file (Priority 2)",
    )
    group.add_argument(
        "--cam_id_list",
        type=str,
        default=None,
        help="Select cameras like 'LSTCam CHEC' (Priority 3)",
    )

    parser.add_argument(
        "-i",
        "--indir",
        type=str,
        default=None,
        help="Directory containing the required input file(s)",
    )
    parser.add_argument(
        "--infile_signal",
        type=str,
        default=None,
        help="SIGNAL file (default: read from config file)",
    )
    parser.add_argument(
        "--infile_background",
        type=str,
        default=None,
        help="BACKGROUND file (default: read from config file)",
    )
    parser.add_argument("-o", "--outdir", type=str, default=None)

    # argv=None keeps the historical behavior of reading sys.argv[1:].
    args = parser.parse_args(argv)

    return args

def save_output(models,
                cam_id,
                factory,
                best_model,
                model_type,
                method_name,
                outdir):
    """Store the selected model and the data it was built from.

    The model is registered in ``models`` under ``cam_id`` and dumped with
    joblib. The ``data_scikit``, ``data_train`` and ``data_test`` attributes
    of ``factory`` are then written next to it, one file each.

    Parameters
    ----------
    models : dict-like
        Container updated in place with ``models[cam_id] = best_model``.
    cam_id : str
        Camera identifier, used as key and in the output file names.
    factory : object
        Provides ``data_scikit`` (written through ``save_obj``) and
        ``data_train`` / ``data_test`` (written through their own
        ``to_pickle`` method; presumably pandas objects - to be confirmed
        against the caller).
    best_model : object
        Model to be stored.
    model_type : str
        Label used as first component of every output file name.
    method_name : str
        Label of the method, used in every output file name.
    outdir : str
        Directory receiving all output files.
    """
    # Keep track of the model for this camera before writing anything.
    models[cam_id] = best_model

    # NOTE: the model file orders its name components differently
    # (type, camera, method) from the data files (type, method, camera).
    model_file = "{}_{}_{}.pkl.gz".format(model_type, cam_id, method_name)
    joblib.dump(best_model, path.join(outdir, model_file))

    # Name components shared by the three data files.
    tags = (model_type, method_name, cam_id)

    scikit_path = path.join(
        outdir, "data_scikit_{}_{}_{}.pkl.gz".format(*tags)
    )
    save_obj(factory.data_scikit, scikit_path)

    train_path = path.join(
        outdir, "data_train_{}_{}_{}.pkl.gz".format(*tags)
    )
    factory.data_train.to_pickle(train_path)

    test_path = path.join(
        outdir, "data_test_{}_{}_{}.pkl.gz".format(*tags)
    )
    factory.data_test.to_pickle(test_path)
4 changes: 2 additions & 2 deletions protopipe/mva/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def split_data(
to build a classifier
"""

if self.case in "regressor":
if self.case == "regressor":
(
X_train,
X_test,
Expand All @@ -65,7 +65,7 @@ def split_data(
weight = np.ones(len(self.data_train))
weight_train = weight / sum(weight)

elif self.case in "classifier":
else:
kosack marked this conversation as resolved.
Show resolved Hide resolved
(
X_train_sig,
X_test_sig,
Expand Down
48 changes: 42 additions & 6 deletions protopipe/mva/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,53 @@ def load_obj(name):


def prepare_data(ds, cuts, label=None):
"""Add variables in data frame"""
ds["log10_hillas_intensity"] = np.log10(
ds["hillas_intensity_reco"]
) # THIS SHOULDN'T BE HARDCODED!!!
"""Add variables to the input data.

This is done in order to allow the use of modified or more complex features.

Parameters
----------
ds : pandas.DataFrame
Input data not yet selected.
cuts: str
Fiducial cuts from protopipe.mva.utils.make_cut_list
label: str
Name of the classifier target label if any.

Returns
-------
ds : pandas.DataFrame
Input data selected for the fiducial cuts and integrated with more
possible features.
"""
width = ds["hillas_width_reco"]
length = ds["hillas_length_reco"]
intensity = ds["hillas_intensity_reco"]
cog_x = ds["hillas_x_reco"]
cog_y = ds["hillas_y_reco"]

# reconstructed event direction on the camera
# this works because we do image parametrization in the TelescopeFrame
# WARNING: check case of e.g. divergent pointing
dir_x = ds["az"]
dir_y = ds["alt"]

ds["log10_hillas_intensity"] = np.log10(intensity)
ds["log10_impact_dist"] = np.log10(ds["impact_dist"])
ds["log10_true_energy"] = np.log10(ds["true_energy"])
try: # for classification
ds["log10(width*length/intensity)"] = np.log10(width * length / intensity)

# square of distance from Image c.o.g. to reconstructed event direction
# on the camera (dir_x, dir_y)
ds["CTAMARS_1"] = np.power(np.sqrt(np.power((cog_x - dir_x), 2) + np.power((cog_y - dir_y), 2)), 2)

ds["CTAMARS_2"] = np.arctan2(cog_y - dir_y, cog_x - dir_x)

try: # additional parameters for classification
ds["log10_reco_energy"] = np.log10(ds["reco_energy"])
ds["log10_reco_energy_tel"] = np.log10(ds["reco_energy_tel"])
ds["label"] = np.full(len(ds), label)
except:
except KeyError:
pass

ds = ds.query(cuts)
Expand Down
Loading