cta-observatory · HealthyPear · Apr 15, 2021 · Jan 25, 2021 · Jan 25, 2021 · Jan 25, 2021
@@ -1,6 +1,7 @@
 """
-Classes to buil models based on machine learning methods.
+Classes to build models based on machine learning methods.
 """
 from .train_model import *
 from .diagnostic import *
 from .utils import *
+from .io import *
@@ -0,0 +1,127 @@
+"""Input functions for a model initialization."""
+
+import argparse
+import joblib
+from os import path
+
+from protopipe.mva.utils import save_obj
+
+
+def initialize_script_arguments():
+    """Initialize the parser of protopipe.scripts.build_model.
+
+    Returns
+    -------
+    args : argparse.Namespace
+        Populated argparse namespace.
+    """
+
+    parser = argparse.ArgumentParser(
+        description="Build model for regression/classification"
+    )
+    parser.add_argument("--config_file", type=str, required=True)
+
+    parser.add_argument(
+        "--max_events",
+        type=int,
+        default=None,
+        help="maximum number of events to use",
+    )
+
+    mode_group = parser.add_mutually_exclusive_group()
+    mode_group.add_argument(
+        "--wave",
+        dest="mode",
+        action="store_const",
+        const="wave",
+        default="tail",
+        help="if set, use wavelet cleaning",
+    )
+    mode_group.add_argument(
+        "--tail",
+        dest="mode",
+        action="store_const",
+        const="tail",
+        help="if set, use tail cleaning (default), otherwise wavelets",
+    )
+
+    # These last CL arguments can overwrite the values from the config
+
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument('--cameras_from_config',
+                       action='store_true',
+                       help="Get cameras configuration file (Priority 1)",)
+    group.add_argument('--cameras_from_file',
+                       action='store_true',
+                       help="Get cameras from input file (Priority 2)",)
+    group.add_argument('--cam_id_list',
+                       type=str,
+                       default=None,
+                       help="Select cameras like 'LSTCam CHEC' (Priority 3)",)
+
+    parser.add_argument(
+        "-i",
+        "--indir",
+        type=str,
+        default=None,
+        help="Directory containing the required input file(s)"
+    )
+    parser.add_argument(
+        "--infile_signal",
+        type=str,
+        default=None,
+        help="SIGNAL file (default: read from config file)",
+    )
+    parser.add_argument(
+        "--infile_background",
+        type=str,
+        default=None,
+        help="BACKGROUND file (default: read from config file)",
+    )
+    parser.add_argument("-o", "--outdir", type=str, default=None)
+
+    args = parser.parse_args()
+
+    return args
+
+def save_output(models,
+                cam_id,
+                factory,
+                best_model,
+                model_type,
+                method_name,
+                outdir):
+    """Save model and data used to produce it per camera-type."""
+
+    models[cam_id] = best_model
+    outname = "{}_{}_{}.pkl.gz".format(
+        model_type, cam_id, method_name
+    )
+    joblib.dump(best_model, path.join(outdir, outname))
+
+    # SAVE DATA
+    save_obj(
+        factory.data_scikit,
+        path.join(
+            outdir,
+            "data_scikit_{}_{}_{}.pkl.gz".format(
+                model_type, method_name, cam_id
+            ),
+        ),
+    )
+    factory.data_train.to_pickle(
+        path.join(
+            outdir,
+            "data_train_{}_{}_{}.pkl.gz".format(
+                model_type, method_name, cam_id
+            ),
+        )
+    )
+    factory.data_test.to_pickle(
+        path.join(
+            outdir,
+            "data_test_{}_{}_{}.pkl.gz".format(
+                model_type, method_name, cam_id
+            ),
+        )
+    )
@@ -48,7 +48,7 @@ def split_data(
             to build a classifier
         """
 
-        if self.case in "regressor":
+        if self.case == "regressor":
             (
                 X_train,
                 X_test,
@@ -65,7 +65,7 @@ def split_data(
             weight = np.ones(len(self.data_train))
             weight_train = weight / sum(weight)
 
-        elif self.case in "classifier":
+        else:
             (
                 X_train_sig,
                 X_test_sig,

@@ -19,17 +19,53 @@ def load_obj(name):
 
 
 def prepare_data(ds, cuts, label=None):
-    """Add variables in data frame"""
-    ds["log10_hillas_intensity"] = np.log10(
-        ds["hillas_intensity_reco"]
-    )  # THIS SHOULDN'T BE HARDCODED!!!
+    """Add variables to the input data.
+
+    This is done in order to allow the use of modified or more complex features.
+
+    Parameters
+    ----------
+    ds : pandas.DataFrame
+        Input data not yet selected.
+    cuts: str
+        Fiducial cuts from protopipe.mva.utils.make_cut_list
+    label: str
+        Name of the classifier target label if any.
+
+    Returns
+    -------
+    ds : pandas.DataFrame
+        Input data selected for the fiducial cuts and integrated with more
+        possible features.
+    """
+    width = ds["hillas_width_reco"]
+    length = ds["hillas_length_reco"]
+    intensity = ds["hillas_intensity_reco"]
+    cog_x = ds["hillas_x_reco"]
+    cog_y = ds["hillas_y_reco"]
+
+    # reconstructed event direction on the camera
+    # this works because we do image parametrization in the TelescopeFrame
+    # WARNING: check case of e.g. divergent pointing
+    dir_x = ds["az"]
+    dir_y = ds["alt"]
+
+    ds["log10_hillas_intensity"] = np.log10(intensity)
     ds["log10_impact_dist"] = np.log10(ds["impact_dist"])
     ds["log10_true_energy"] = np.log10(ds["true_energy"])
-    try:  # for classification
+    ds["log10(width*length/intensity)"] = np.log10(width * length / intensity)
+
+    # square of distance from Image c.o.g. to reconstructed event direction
+    # on the camera (dir_x, dir_y)
+    ds["CTAMARS_1"] = np.power(np.sqrt(np.power((cog_x - dir_x), 2) + np.power((cog_y - dir_y), 2)), 2)
+
+    ds["CTAMARS_2"] = np.arctan2(cog_y - dir_y, cog_x - dir_x)
+
+    try:  # additional parameters for classification
         ds["log10_reco_energy"] = np.log10(ds["reco_energy"])
         ds["log10_reco_energy_tel"] = np.log10(ds["reco_energy_tel"])
         ds["label"] = np.full(len(ds), label)
-    except:
+    except KeyError:
         pass
 
     ds = ds.query(cuts)