diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000..1287a021 --- /dev/null +++ b/conftest.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.fixture(scope="session") +def pipeline_testdir(tmp_path_factory): + return tmp_path_factory.mktemp("test_pipeline") diff --git a/docs/contribute/beforepushing.rst b/docs/contribute/beforepushing.rst index 0248b0fe..3063087a 100644 --- a/docs/contribute/beforepushing.rst +++ b/docs/contribute/beforepushing.rst @@ -35,18 +35,16 @@ You will have to fix any warning that appears during documentation building, because the documentation also runs on `readthedocs `__ with an option to treat warnings as errors. -Unit and integration tests --------------------------- +Unit tests +---------- .. note:: This is a maintenance activity which has being long overdue and we need manpower for it, so if you have experience on this or you want to contribute please feel free to do so. - For more information on how to contribute to this effort check these issues, - - - `unit-tests `__, - - `integration tests `__. + For more information on how to contribute to this effort check + `this issue `__. Being *protopipe* based on *ctapipe*, all the tools imported from the latter have been already tested and approved (*protopipe* uses always a version of @@ -64,18 +62,28 @@ Same for *pyirf*. order to code a new feature, this has to be pull-requested to *ctapipe* and at the same time hardcoded in *protopipe*, until the new version of *ctapipe* is released. -For the moment there is only **one integration test**. +Integration tests +^^^^^^^^^^^^^^^^^ + +.. note:: + For more information on how to contribute to this effort check + `this issue `__. + +The integration tests are defined in the dedicated module ``pipeline/scripts/tests/test_pipeline.py`` +and start from test simtel files stored on a CC-IN2P3 dataserver. 
+ +The test data is diffuse data from the Prod3b baseline simulations of both +CTAN and CTAS produced with the following Corsika settings, -This test is in charge to detect if changes in the script -``protopipe.scripts.data_training`` and any code used by it, produce -any fatal behaviour or crash. +- gammas, ``NSHOW=10 ESLOPE=-2.0 EMIN=10 EMAX=20 NSCAT=1 CSCAT=200 VIEWCONE=3`` +- protons, ``NSHOW=10 ESLOPE=-2.0 EMIN=100 EMAX=200 NSCAT=1 CSCAT=200 VIEWCONE=3`` +- electrons, ``NSHOW=10 ESLOPE=-2.0 EMIN=10 EMAX=20 NSCAT=1 CSCAT=200 VIEWCONE=3`` -For the moment it uses an old test file from *ctapipe* (a Prod2 CTA South array composed of -LSTCam, FlashCam and ASTRICam with about ~100 simulated showers). -Each test is expected to produce a non-empty HDF5 file. +in the same proportions as a standard full-scale analysis. -The test can be executed directly from the main folder of *protopipe* by launching -``pytest``. It is also automatically triggered by the CI every time a new +The pipeline integration testing can be executed directly from the main folder +of *protopipe* by launching ``pytest``. +It is also automatically triggered by the CI every time a new pull-request is pushed to the repository, and its correct execution is a mandatory condition for merging. diff --git a/protopipe/scripts/tests/test_classifier.yaml b/protopipe/scripts/tests/test_classifier.yaml new file mode 100644 index 00000000..ad2deacf --- /dev/null +++ b/protopipe/scripts/tests/test_classifier.yaml @@ -0,0 +1,51 @@ +General: + model_type: 'classifier' + # [...] 
= your analysis local full path OUTSIDE the Vagrant box + data_dir: '[...]/shared_folder/analyses/v0.4.0_dev1/data/TRAINING/for_particle_classification/' + data_sig_file: 'TRAINING_classification_{}_gamma_merged.h5' + data_bkg_file: 'TRAINING_classification_{}_proton_merged.h5' + cam_id_list: ['LSTCam', 'NectarCam'] + table_name_template: '' # leave empty (TO BE REMOVED) + outdir: '[...]/shared_folder/analyses/v0.4.0_dev1/estimators/gamma_hadron_classifier' + +Split: + train_fraction: 0.8 + use_same_number_of_sig_and_bkg_for_training: False # Lowest statistics will drive the split + +Method: + name: 'RandomForestClassifier' # AdaBoostClassifier or RandomForestClassifier + target_name: 'label' + tuned_parameters: # these are lists of values used by the GridSearchCV algorithm + n_estimators: [200] + max_depth: [10] # null for None + max_features: [3] # possible choices are “auto”, “sqrt”, “log2”, int or float + min_samples_split: [10] + min_samples_leaf: [10] + scoring: 'roc_auc' # possible choices are 'roc_auc', 'explained_variance' + cv: 2 + use_proba: True # If not output is score + calibrate_output: False # If true calibrate probability + +FeatureList: + # - 'log10_reco_energy' + # - 'log10_reco_energy_tel' + - 'log10_hillas_intensity' + - 'hillas_width' + - 'hillas_length' + - 'h_max' + - 'impact_dist' + +SigFiducialCuts: + - 'good_image == 1' + - 'is_valid == True' + +BkgFiducialCuts: + - 'good_image == 1' + - 'is_valid == True' + +Diagnostic: + # Energy binning (used for reco and true energy) + energy: + nbins: 4 + min: 0.02 + max: 200 diff --git a/protopipe/scripts/tests/test_config_analysis_south.yaml b/protopipe/scripts/tests/test_config_analysis_south.yaml index 6a9de344..448d6899 100644 --- a/protopipe/scripts/tests/test_config_analysis_south.yaml +++ b/protopipe/scripts/tests/test_config_analysis_south.yaml @@ -9,7 +9,7 @@ General: # WARNING: for simulations containing multiple copies of the telescopes, # only 'full_array' or custom list are supported 
options! array: full_array - cam_id_list : ['LSTCam', 'NectarCam'] # Selected cameras (disabled option) + cam_id_list : ['LSTCam', 'FlashCam', 'CHEC'] # Selected cameras (disabled option) # Cleaning for reconstruction ImageCleaning: diff --git a/protopipe/scripts/tests/test_dataTraining.py b/protopipe/scripts/tests/test_dataTraining.py deleted file mode 100644 index 952c568e..00000000 --- a/protopipe/scripts/tests/test_dataTraining.py +++ /dev/null @@ -1,94 +0,0 @@ -"""Test the data training script. - -TODO ----- - -- test only diffuse data (more general case) -- add Paranal diffuse test file -- add Prod5 test files - -""" -from os import path, system -from pkg_resources import resource_filename - -import tables -import pytest - -from protopipe.scripts import data_training -from protopipe.pipeline.temp import get_dataset_path - -# TEST FILES - -URL = "http://cccta-dataserver.in2p3.fr/data/protopipe/testData/" - -# PROD 3b - -PROD3B_CTA_NORTH = get_dataset_path("gamma1.simtel.gz", - url=f"{URL}/prod3_laPalma_baseline_Az180_Az20") -PROD3B_CTA_SOUTH = get_dataset_path("gamma1.simtel.gz", - url=f"{URL}/prod3_Paranal_baseline_Az180_Az20") - - -@pytest.mark.parametrize("input_file", [PROD3B_CTA_NORTH, PROD3B_CTA_SOUTH]) -def test_dataTraining_noImages(input_file): - """Very bare test to see if the script reaches the end correctly. - - WARNING: some of the cuts in the example config file are not optimized for - cameras other than LSTCam and NectarCam. - In any case, it is expected that in absence of fatal bugs, the script - ends successfully. 
- """ - - # the difference is only the 'site' key as a check for the user - if input_file in [PROD3B_CTA_SOUTH]: - ana_config = resource_filename("protopipe", "scripts/tests/test_config_analysis_south.yaml") - else: - ana_config = resource_filename("protopipe", "scripts/tests/test_config_analysis_north.yaml") - - exit_status = system( - f"python {data_training.__file__}\ - --config_file {ana_config}\ - -o test_training_noImages.h5\ - -i {path.dirname(input_file)}\ - -f {path.basename(input_file)}" - ) - - # check that the script ends without crashing - assert exit_status == 0 - - # check that the produced HDF5 file is non-empty - with tables.open_file("test_training_noImages.h5") as file: - assert file.get_filesize() > 0 - - -@pytest.mark.parametrize("input_file", [PROD3B_CTA_NORTH, PROD3B_CTA_SOUTH]) -def test_dataTraining_withImages(input_file): - """Very bare test to see if the script reaches the end correctly. - - WARNING: some of the cuts in the example config file are not optimized for - cameras other than LSTCam and NectarCam. - In any case, it is expected that in absence of fatal bugs, the script - ends successfully. 
- """ - - # the difference is only the 'site' key as a check for the user - if input_file in [PROD3B_CTA_SOUTH]: - ana_config = resource_filename("protopipe", "scripts/tests/test_config_analysis_south.yaml") - else: - ana_config = resource_filename("protopipe", "scripts/tests/test_config_analysis_north.yaml") - - exit_status = system( - f"python {data_training.__file__}\ - --config_file {ana_config}\ - -o test_training_withImages.h5\ - --save_images\ - -i {path.dirname(input_file)}\ - -f {path.basename(input_file)}" - ) - - # check that the script ends without crashing - assert exit_status == 0 - - # check that the produced HDF5 file is non-empty - with tables.open_file("test_training_noImages.h5") as file: - assert file.get_filesize() > 0 diff --git a/protopipe/scripts/tests/test_pipeline.py b/protopipe/scripts/tests/test_pipeline.py new file mode 100644 index 00000000..562cc73c --- /dev/null +++ b/protopipe/scripts/tests/test_pipeline.py @@ -0,0 +1,188 @@ +from os import system +from pkg_resources import resource_filename + +import tables +import pytest + +from protopipe.pipeline.temp import get_dataset_path +from protopipe.scripts import data_training, build_model + + +# PROD 3B + +# CONFIG FILES +config_prod3b_CTAN = resource_filename("protopipe", "scripts/tests/test_config_analysis_north.yaml") +config_prod3b_CTAS = resource_filename("protopipe", "scripts/tests/test_config_analysis_south.yaml") + +# TEST FILES + +URL_TEST_DATA = "http://cccta-dataserver.in2p3.fr/data/protopipe/testData/" +URL_PROD3B_CTAN = f"{URL_TEST_DATA}/prod3_laPalma_baseline_Az180_Zd20" +URL_PROD3B_CTAS = f"{URL_TEST_DATA}/prod3_Paranal_baseline_Az180_Zd20" + +input_data = { + + "PROD3B_CTA_NORTH": {"config": config_prod3b_CTAN, + "gamma1": get_dataset_path("gamma1.simtel.gz", + url=f"{URL_PROD3B_CTAN}"), + "gamma2": get_dataset_path("gamma2.simtel.gz", + url=f"{URL_PROD3B_CTAN}"), + "proton1": get_dataset_path("proton1.simtel.gz", + url=f"{URL_PROD3B_CTAN}"), + }, + + "PROD3B_CTA_SOUTH": 
{"config": config_prod3b_CTAS, + "gamma1": get_dataset_path("gamma1.simtel.gz", + url=f"{URL_PROD3B_CTAS}"), + "gamma2": get_dataset_path("gamma2.simtel.gz", + url=f"{URL_PROD3B_CTAS}"), + "proton1": get_dataset_path("proton1.simtel.gz", + url=f"{URL_PROD3B_CTAS}"), + } + +} + + +@pytest.mark.parametrize("test_case", ["PROD3B_CTA_NORTH", "PROD3B_CTA_SOUTH"]) +def test_GET_GAMMAS_FOR_ENERGY_MODEL_WITH_IMAGES(test_case, pipeline_testdir): + + outpath = pipeline_testdir / f"test_training_withImages_{test_case}.h5" + + exit_status = system( + f"python {data_training.__file__}\ + --config_file {input_data[test_case]['config']}\ + -o {outpath}\ + --save_images\ + -i {input_data[test_case]['gamma1'].parent}\ + -f {input_data[test_case]['gamma1'].name}" + ) + + # check that the script ends without crashing + assert exit_status == 0 + + # check that the produced HDF5 file is non-empty + with tables.open_file(outpath) as file: + assert file.get_filesize() > 0 + + +@pytest.mark.parametrize("test_case", [ + pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="g1N")), + pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="g1S")), +]) +def test_GET_GAMMAS_FOR_ENERGY_MODEL(test_case, pipeline_testdir): + + outpath = pipeline_testdir / f"test_gamma1_noImages_{test_case}.h5" + + exit_status = system( + f"python {data_training.__file__}\ + --config_file {input_data[test_case]['config']}\ + -o {outpath}\ + -i {input_data[test_case]['gamma1'].parent}\ + -f {input_data[test_case]['gamma1'].name}" + ) + + # check that the script ends without crashing + assert exit_status == 0 + + # check that the produced HDF5 file is non-empty + with tables.open_file(outpath) as file: + assert file.get_filesize() > 0 + + +@pytest.mark.parametrize("test_case", [ + pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="g2N")), + pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="g2S")), +]) +def 
test_GET_GAMMAS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir): + + outpath = pipeline_testdir / f"test_gamma2_noImages_{test_case}.h5" + + exit_status = system( + f"python {data_training.__file__}\ + --config_file {input_data[test_case]['config']}\ + -o {outpath}\ + -i {input_data[test_case]['gamma2'].parent}\ + -f {input_data[test_case]['gamma2'].name}" + ) + + # check that the script ends without crashing + assert exit_status == 0 + + # check that the produced HDF5 file is non-empty + with tables.open_file(outpath) as file: + assert file.get_filesize() > 0 + + +@pytest.mark.parametrize("test_case", [ + pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="p1N")), + pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="p1S")), +]) +def test_GET_PROTONS_FOR_CLASSIFICATION_MODEL(test_case, pipeline_testdir): + + outpath = pipeline_testdir / f"test_proton1_noImages_{test_case}.h5" + + exit_status = system( + f"python {data_training.__file__}\ + --config_file {input_data[test_case]['config']}\ + -o {outpath}\ + -m 10\ + -i {input_data[test_case]['proton1'].parent}\ + -f {input_data[test_case]['proton1'].name}" + ) + + # check that the script ends without crashing + assert exit_status == 0 + + # check that the produced HDF5 file is non-empty + with tables.open_file(outpath) as file: + assert file.get_filesize() > 0 + + +@pytest.mark.parametrize("test_case", [ + pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="EN", + depends=["g1N"])), + pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="ES", + depends=["g1S"])), +]) +def test_BUILD_ENERGY_MODEL_AdaBoost_DecisionTreeRegressor(test_case, pipeline_testdir): + """Launch protopipe.scripts.build_model for a AdaBoost DecisionTreeRegressor.""" + + infile = pipeline_testdir / f"test_gamma1_noImages_{test_case}.h5" + outdir = pipeline_testdir / f"energy_model_{test_case}" + + config = resource_filename("protopipe", "scripts/tests/test_regressor.yaml") + + 
exit_status = system( +        f"python {build_model.__file__}\ +            --config_file {config}\ +            --infile_signal {infile}\ +            --outdir {outdir}\ +            --cameras_from_file" +    ) +    assert exit_status == 0 + + +@pytest.mark.parametrize("test_case", [ +    pytest.param("PROD3B_CTA_NORTH", marks=pytest.mark.dependency(name="C1", +                                                                  depends=["g2N", "p1N"])), +    pytest.param("PROD3B_CTA_SOUTH", marks=pytest.mark.dependency(name="C2", +                                                                  depends=["g2S", "p1S"])), +]) +def test_BUILD_CLASSIFICATION_MODEL_RandomForest(test_case, pipeline_testdir): +    """Launch protopipe.scripts.build_model for a Random Forest classifier.""" + +    infile_signal = pipeline_testdir / f"test_gamma2_noImages_{test_case}.h5" +    infile_background = pipeline_testdir / f"test_proton1_noImages_{test_case}.h5" +    outdir = pipeline_testdir / f"classification_model_{test_case}" + +    config = resource_filename("protopipe", "scripts/tests/test_classifier.yaml") + +    exit_status = system( +        f"python {build_model.__file__}\ +            --config_file {config}\ +            --infile_signal {infile_signal}\ +            --infile_background {infile_background}\ +            --outdir {outdir}\ +            --cameras_from_file" +    ) +    assert exit_status == 0 diff --git a/protopipe/scripts/tests/test_regressor.yaml b/protopipe/scripts/tests/test_regressor.yaml new file mode 100644 index 00000000..9041fe5e --- /dev/null +++ b/protopipe/scripts/tests/test_regressor.yaml @@ -0,0 +1,41 @@ +General: + model_type: 'regressor' + # [...]
= your analysis local full path OUTSIDE the Vagrant box + data_dir: './' + data_file: 'test_TRAINING_energy_{}_gamma_merged.h5' + outdir: './' + cam_id_list: ['LSTCam', 'NectarCam'] + table_name_template: '' # leave empty (TO BE REMOVED) + +Split: + train_fraction: 0.5 + +Method: + name: 'AdaBoostRegressor' + target_name: 'true_energy' + tuned_parameters: + learning_rate: [0.3] + n_estimators: [100] + base_estimator__max_depth: [null] # null is equivalent to None + base_estimator__min_samples_split: [2] + base_estimator__min_samples_leaf: [10] + scoring: 'explained_variance' + cv: 2 + +FeatureList: + - 'log10_hillas_intensity' + - 'log10_impact_dist' + - 'hillas_width_reco' + - 'hillas_length_reco' + - 'h_max' + +SigFiducialCuts: + - 'good_image == 1' + - 'is_valid == True' + +Diagnostic: + # Energy binning (used for reco and true energy) + energy: + nbins: 15 + min: 0.0125 + max: 125 diff --git a/setup.cfg b/setup.cfg index a97dd4b4..8a051d17 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,6 +10,7 @@ show-response = 1 minversion=3.0 norecursedirs=build docs/_build addopts = -v +markers = integration: integration test [aliases] test=pytest