Improvement of the data management and naming of container files (#181)
* cleaner way to get the extractor string associated with the extractor kwargs provided (see the sketch after the change summary below)
-> no longer sensitive to the order of the kwargs given by the user
-> handles gracefully the case where the user sets kwargs to their default values

* The search for saved containers (SPE, charges, waveforms) is more robust
-> exceptions are raised and caught properly when no file is found

* propagation of changes in makers

* script updates + fix of broken photostat code

* update of shell scripts

---------

Co-authored-by: guillaume.grolleron <[email protected]>
guillaumegrolleron and guillaume.grolleron authored Feb 10, 2025
1 parent 6d1ebad commit f2df538
Showing 14 changed files with 245 additions and 164 deletions.
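
The first point of the commit message describes the idea behind the new extractor string: it no longer depends on the order of the kwargs and treats explicitly passed default values the same as omitted ones. The snippet below is only a sketch of that idea; the helper name, the default table and the exact string format are assumptions for illustration, not the actual CtapipeExtractor.get_extractor_kwargs_str implementation.

# Illustrative sketch only: names, defaults and formatting are assumptions,
# not the real CtapipeExtractor.get_extractor_kwargs_str implementation.
ASSUMED_DEFAULTS = {
    "LocalPeakWindowSum": {"window_width": 7, "window_shift": 3},
}


def extractor_kwargs_to_str(method: str, extractor_kwargs: dict) -> str:
    # Merge on top of the method defaults, so passing a default value
    # explicitly yields the same string as not passing it at all.
    merged = dict(ASSUMED_DEFAULTS.get(method, {}))
    merged.update(extractor_kwargs or {})
    # Sort the keys, so the string no longer depends on the order in
    # which the user supplied the kwargs.
    return "_".join(f"{key}_{merged[key]}" for key in sorted(merged))


# Both calls produce the same string, whatever the kwargs order:
print(extractor_kwargs_to_str("LocalPeakWindowSum", {"window_shift": 4, "window_width": 12}))
print(extractor_kwargs_to_str("LocalPeakWindowSum", {"window_width": 12, "window_shift": 4}))
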
27 changes: 18 additions & 9 deletions notebooks/tool_implementation/tuto_photostat.py
@@ -20,17 +20,18 @@
import os
import pathlib

import matplotlib.pyplot as plt

from nectarchain.data.management import DataManagement
from nectarchain.makers.calibration import PhotoStatisticNectarCAMCalibrationTool
from nectarchain.makers.extractor.utils import CtapipeExtractor

logging.basicConfig(
format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.INFO
)
log = logging.getLogger(__name__)
log.handlers = logging.getLogger("__main__").handlers

import matplotlib.pyplot as plt

from nectarchain.data.management import DataManagement
from nectarchain.makers.calibration import PhotoStatisticNectarCAMCalibrationTool
from nectarchain.makers.extractor.utils import CtapipeExtractor

# %%
extractor_kwargs = {"window_width": 12, "window_shift": 4}
@@ -42,23 +43,31 @@
FF_run_number = 3937

# %%
str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(extractor_kwargs)
str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(
method=method, extractor_kwargs=extractor_kwargs
)
path = DataManagement.find_SPE_HHV(
run_number=HHV_run_number,
method=method,
str_extractor_kwargs=str_extractor_kwargs,
)
if len(path) == 1:
log.info(
f"{path[0]} found associated to HHV run {HHV_run_number}, method {method} and extractor kwargs {str_extractor_kwargs}"
f"{path[0]} found associated to HHV run {HHV_run_number},"
f"method {method} and extractor kwargs {str_extractor_kwargs}"
)
else:
_text = f"no file found in $NECTARCAM_DATA/../SPEfit associated to HHV run {HHV_run_number}, method {method} and extractor kwargs {str_extractor_kwargs}"
_text = (
f"no file found in $NECTARCAM_DATA/../SPEfit associated to HHV run"
f"{HHV_run_number}, method {method} and extractor kwargs {str_extractor_kwargs}"
)
log.error(_text)
raise FileNotFoundError(_text)

# %% [markdown]
# WARNING : for now you can't split the event loop in slice for the Photo-statistic method, however in case of the charges havn't been computed on disk, the loop over events will only store the charge, therefore memory errors should happen rarely
# WARNING : for now you can't split the event loop in slices for the Photo-statistic
# method. However, if the charges haven't been computed on disk, the loop over
# events will only store the charges, so memory errors should rarely happen.

# %%
tool = PhotoStatisticNectarCAMCalibrationTool(
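
For context, the lookup pattern that the tutorial now relies on can be condensed as follows. This is a sketch under assumptions: the HHV run number and the method name are placeholders (only the FF run number and the extractor kwargs are visible in the diff), and it assumes the updated DataManagement.find_SPE_HHV raises FileNotFoundError when nothing matches, as the management.py changes further down show.

import logging

from nectarchain.data.management import DataManagement
from nectarchain.makers.extractor.utils import CtapipeExtractor

log = logging.getLogger(__name__)

# Placeholder values for illustration; the tutorial defines its own run numbers.
HHV_run_number = 3942
method = "LocalPeakWindowSum"
extractor_kwargs = {"window_width": 12, "window_shift": 4}

str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(
    method=method, extractor_kwargs=extractor_kwargs
)

try:
    path = DataManagement.find_SPE_HHV(
        run_number=HHV_run_number,
        method=method,
        str_extractor_kwargs=str_extractor_kwargs,
    )
    log.info("SPE fit result found: %s", path[0])
except FileNotFoundError:
    log.error("No SPE fit result on disk for HHV run %s", HHV_run_number)
    raise
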
142 changes: 87 additions & 55 deletions src/nectarchain/data/management.py
Expand Up @@ -230,7 +230,7 @@ def __get_GRID_location_ELog(
break

if i == len(lines) - 1:
e = Exception("lfns not found on GRID")
e = FileNotFoundError("lfns not found on GRID")
log.error(e, exc_info=True)
log.debug(lines)
raise e
@@ -283,16 +283,17 @@ def find_photostat(
ped_method="FullWaveformSum",
str_extractor_kwargs="",
):
full_file = glob.glob(
pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/PhotoStat/"
f"PhotoStatisticNectarCAM_FFrun{FF_run_number}_{FF_method}"
f"_{str_extractor_kwargs}_Pedrun{ped_run_number}_{ped_method}.h5"
).__str__()
path = pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/PhotoStat/"
f"PhotoStatisticNectarCAM_FFrun{FF_run_number}_{FF_method}"
f"_{str_extractor_kwargs}_Pedrun{ped_run_number}_{ped_method}.h5"
)
full_file = glob.glob(str(path))
log.debug("for now it does not check if there are files with max events")
if len(full_file) != 1:
raise Exception(f"the files is {full_file}")
raise FileNotFoundError(
f"When looking for {str(path)} : the found files are {full_file}"
)
return full_file

@staticmethod
@@ -328,67 +329,98 @@ def find_SPE_HHV(
):
keyword = kwargs.get("keyword", "FlatFieldSPEHHV")
std_key = "" if free_pp_n else "Std"
full_file = glob.glob(
pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/SPEfit/"
f"{keyword}{std_key}NectarCAM_run{run_number}*_{method}"
f"_{str_extractor_kwargs}.h5"
).__str__()
path = pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/SPEfit/"
f"{keyword}{std_key}NectarCAM_run{run_number}*_{method}"
f"_{str_extractor_kwargs}.h5"
)
# need to improve the files search !!
# -> unstable behavior with SPE results computed
# with maxevents not to None
if len(full_file) != 1:
all_files = glob.glob(
pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/SPEfit/"
f"FlatFieldSPEHHVStdNectarCAM_run{run_number}_maxevents*_"
f"{method}_{str_extractor_kwargs}.h5"
).__str__()
full_file = glob.glob(str(path))
if len(full_file) == 0:
raise FileNotFoundError(f"No file found looking for {str(path)}")
elif len(full_file) > 1:
log.debug(f"Several files found for {str(path)} : {full_file}")
for file in full_file:
if "maxevents" not in file:
log.debug(
f"File found with the largest "
f"number of events for {str(path)} : {file}"
)
return [file]
path = pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/SPEfit/"
f"{keyword}{std_key}NectarCAM_run{run_number}_maxevents*_"
f"{method}_{str_extractor_kwargs}.h5"
)
max_events = 0
for i, file in enumerate(all_files):
data = file.split("/")[-1].split(".h5")[0].split("_")
for _data in data:
if "maxevents" in _data:
_max_events = int(_data.split("maxevents")[-1])
break
if _max_events >= max_events:
max_events = _max_events
index = i
return [all_files[index]]
all_files = glob.glob(str(path))
if len(all_files) == 0:
raise FileNotFoundError(f"No file found looking for {str(path)}")
else:
log.debug(f"Files found for {str(path)} : {all_files}")
max_events = 0
for i, file in enumerate(all_files):
data = file.split("/")[-1].split(".h5")[0].split("_")
for _data in data:
if "maxevents" in _data:
_max_events = int(_data.split("maxevents")[-1])
break
if _max_events >= max_events:
max_events = _max_events
index = i
log.debug(f"Best file found : {all_files[index]}")
return [all_files[index]]
else:
log.debug(f"File found for {str(path)} : {full_file}")
return full_file

@staticmethod
def __find_computed_data(
run_number, max_events=None, ext=".h5", data_type="waveforms"
):
out = glob.glob(
pathlib.Path(
if max_events is not None:
path = pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/runs/"
f"{data_type}/*_run{run_number}_maxevents*{ext}"
)
else:
path = pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/runs/"
f"{data_type}/*_run{run_number}{ext}"
).__str__()
)
if not (max_events is None):
all_files = glob.glob(
pathlib.Path(
f"{os.environ.get('NECTARCAMDATA','/tmp')}/runs/"
f"{data_type}/*_run{run_number}_maxevents*{ext}"
).__str__()
)
best_max_events = np.inf
best_index = None
for i, file in enumerate(all_files):
data = file.split("/")[-1].split(".h5")[0].split("_")
out = glob.glob(str(path))
if len(out) == 0:
raise FileNotFoundError(f"No file found looking for {str(path)}")
elif len(out) > 1:
if max_events is None:
raise FileExistsError(f"Several files found for {str(path)} : {out}")
else:
log.debug(
f"Several files found for {str(path)} : {out},"
f"will look for the most complete one"
)
best_max_events = np.inf
best_index = None
for i, file in enumerate(out):
data = file.split("/")[-1].split(".h5")[0].split("_")
for _data in data:
if "maxevents" in _data:
_max_events = int(_data.split("maxevents")[-1])
break
if _max_events >= max_events:
if _max_events < best_max_events:
best_max_events = _max_events
best_index = i
if best_index is not None:
out = [out[best_index]]
else:
if max_events is not None:
data = out[0].split("/")[-1].split(".h5")[0].split("_")
for _data in data:
if "maxevents" in _data:
_max_events = int(_data.split("maxevents")[-1])
break
if _max_events >= max_events:
if _max_events < best_max_events:
best_max_events = _max_events
best_index = i
if not (best_index is None):
out = [all_files[best_index]]
if _max_events < max_events:
raise FileNotFoundError(
f"File found for {str(path)} : {out[0]} has less events "
f"than max_events asked {max_events}"
)
return out
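
Both find_SPE_HHV and __find_computed_data above choose between candidate files by reading the maxevents value encoded in the file name. The standalone sketch below shows that parsing step, using a regular expression instead of the string splitting in the diff; the helper name and the file names are made up for illustration.

import re


def parse_max_events(filename: str):
    """Return the integer following 'maxevents' in a file name, or None."""
    match = re.search(r"maxevents(\d+)", filename)
    return int(match.group(1)) if match else None


# Made-up candidate files following the naming scheme used above.
candidates = [
    "FlatFieldSPEHHVStdNectarCAM_run3942_maxevents1000_LocalPeakWindowSum_window_shift_4_window_width_12.h5",
    "FlatFieldSPEHHVStdNectarCAM_run3942_maxevents50000_LocalPeakWindowSum_window_shift_4_window_width_12.h5",
]

# Keep the candidate processed with the largest number of events,
# mirroring the selection loop in DataManagement.find_SPE_HHV.
best = max(candidates, key=lambda name: parse_max_events(name) or 0)
print(best)
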
62 changes: 37 additions & 25 deletions src/nectarchain/makers/calibration/gain/flatfield_spe_makers.py
@@ -9,6 +9,7 @@
from ....data.container import ChargesContainer, ChargesContainers
from ....data.container.core import merge_map_ArrayDataContainer
from ....data.management import DataManagement
from ....utils.error import TooMuchFileException
from ...component import ArrayDataComponent, NectarCAMComponent
from ...extractor.utils import CtapipeExtractor
from .core import GainNectarCAMCalibrationTool
@@ -46,25 +47,37 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(
self.extractor_kwargs
method=self.method,
extractor_kwargs=self.extractor_kwargs,
)
if not (self.reload_events):
files = DataManagement.find_charges(
run_number=self.run_number,
method=self.method,
str_extractor_kwargs=str_extractor_kwargs,
max_events=self.max_events,
)
if len(files) == 1:
try:
files = DataManagement.find_charges(
run_number=self.run_number,
method=self.method,
str_extractor_kwargs=str_extractor_kwargs,
max_events=self.max_events,
)
if len(files) == 1:
log.warning(
"You asked events_per_slice but you don't want to\
reload events and a charges file is on disk, \
then events_per_slice is set to None"
)
self.events_per_slice = None
else:
raise TooMuchFileException("No single charges file found")
except (FileNotFoundError, TooMuchFileException) as e:
log.warning(e)
log.warning(
"You asked events_per_slice but you don't want to reload events and\
a charges file is on disk, then events_per_slice is set to None"
"You will not be able to reload charges from\
disk when start() call"
)
self.events_per_slice = None

def _init_output_path(self):
str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(
self.extractor_kwargs
method=self.method,
extractor_kwargs=self.extractor_kwargs,
)
if self.events_per_slice is None:
ext = ".h5"
@@ -94,14 +107,19 @@ def start(
**kwargs,
):
str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(
self.extractor_kwargs
)
files = DataManagement.find_charges(
run_number=self.run_number,
method=self.method,
str_extractor_kwargs=str_extractor_kwargs,
max_events=self.max_events,
extractor_kwargs=self.extractor_kwargs,
)
try:
files = DataManagement.find_charges(
run_number=self.run_number,
method=self.method,
str_extractor_kwargs=str_extractor_kwargs,
max_events=self.max_events,
)
except Exception as e:
log.warning(e)
files = []
if self.reload_events or len(files) != 1:
if len(files) != 1:
self.log.info(
@@ -135,7 +153,7 @@
self.components[
0
]._chargesContainers = merge_map_ArrayDataContainer(
chargesContainers
next(chargesContainers)
)
else:
self.log.info("merging along slices")
@@ -152,12 +170,6 @@
)

def _write_container(self, container: Container, index_component: int = 0) -> None:
# if isinstance(container,SPEfitContainer) :
# self.writer.write(table_name = f"{self.method}_
# {CtapipeExtractor.get_extractor_kwargs_str(self.extractor_kwargs)}",
# containers = container,
# )
# else :
super()._write_container(container=container, index_component=index_component)


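
The maker above now degrades gracefully when the charges file cannot be uniquely identified: FileNotFoundError and the new TooMuchFileException are caught, a warning is logged, and the tool falls back to recomputing charges from the events. The snippet below is a self-contained sketch of that pattern; the stand-in exception class and the helper are hypothetical, written only so it runs outside nectarchain.

import logging

log = logging.getLogger(__name__)


class TooMuchFileException(Exception):
    """Stand-in for nectarchain.utils.error.TooMuchFileException."""


def find_single_charges_file(find_charges, **query):
    """Return a single charges file, or None when the lookup fails.

    ``find_charges`` is any callable with the DataManagement.find_charges
    signature; ``query`` is forwarded to it unchanged.
    """
    try:
        files = find_charges(**query)
        if len(files) != 1:
            raise TooMuchFileException(f"expected exactly one charges file, got {files}")
        return files[0]
    except (FileNotFoundError, TooMuchFileException) as err:
        log.warning("%s -- charges will be recomputed from the events", err)
        return None
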
32 changes: 21 additions & 11 deletions src/nectarchain/makers/calibration/gain/photostat_makers.py
Expand Up @@ -59,7 +59,8 @@ def __init__(self, *args, **kwargs):

def _init_output_path(self):
str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(
self.extractor_kwargs
method=self.method,
extractor_kwargs=self.extractor_kwargs,
)
if self.max_events is None:
filename = (
@@ -96,18 +97,27 @@ def start(
**kwargs,
):
str_extractor_kwargs = CtapipeExtractor.get_extractor_kwargs_str(
self.extractor_kwargs
)
FF_files = DataManagement.find_charges(
run_number=self.run_number,
method=self.method,
str_extractor_kwargs=str_extractor_kwargs,
max_events=self.max_events,
)
Ped_files = DataManagement.find_charges(
run_number=self.Ped_run_number,
max_events=self.max_events,
extractor_kwargs=self.extractor_kwargs,
)
try:
FF_files = DataManagement.find_charges(
run_number=self.run_number,
method=self.method,
str_extractor_kwargs=str_extractor_kwargs,
max_events=self.max_events,
)
except Exception as e:
self.log.warning(e)
FF_files = []
try:
Ped_files = DataManagement.find_charges(
run_number=self.Ped_run_number,
max_events=self.max_events,
)
except Exception as e:
self.log.warning(e)
Ped_files = []
if self.reload_events or len(FF_files) != 1 or len(Ped_files) != 1:
if len(FF_files) != 1 or len(Ped_files) != 1:
self.log.info(
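
The same lookup-with-fallback appears twice here, once for the flat-field run and once for the pedestal run. A hypothetical helper such as the one below could factor out that repetition; it is only a sketch, not part of the nectarchain API, and the reload condition on FF_files and Ped_files would stay unchanged.

def charges_files_or_empty(find_charges, log, **query):
    """Return the files found by ``find_charges``, or [] if the lookup fails."""
    try:
        return find_charges(**query)
    except Exception as err:
        # Mirrors the fallback used for FF_files and Ped_files in start().
        log.warning(err)
        return []
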
