Merge branch 'main' of github.com:tdixon97/legend-pydataobj into main

legend-exp · Jan 28, 2025 · fc0bc10
2 parents 563152d + b678606
commit fc0bc10
Show file tree

Hide file tree

Showing 21 changed files with 243 additions and 41 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ ci:
 
 repos:
   - repo: https://github.com/adamchainz/blacken-docs
-    rev: "1.18.0"
+    rev: "1.19.1"
     hooks:
       - id: blacken-docs
         additional_dependencies: [black==23.*]
@@ -29,7 +29,7 @@ repos:
       - id: trailing-whitespace
 
   - repo: https://github.com/kynan/nbstripout
-    rev: "0.7.1"
+    rev: "0.8.1"
     hooks:
       - id: nbstripout
         args:
@@ -40,7 +40,7 @@ repos:
           ]
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.6.9"
+    rev: "v0.8.6"
     hooks:
       - id: ruff
         args: ["--fix", "--show-fixes"]
@@ -72,12 +72,12 @@ repos:
         args: [--prose-wrap=always]
 
   - repo: https://github.com/abravalheri/validate-pyproject
-    rev: v0.20.2
+    rev: v0.23
     hooks:
       - id: validate-pyproject
 
   - repo: https://github.com/python-jsonschema/check-jsonschema
-    rev: 0.29.3
+    rev: 0.30.0
     hooks:
       - id: check-dependabot
       - id: check-github-workflows

diff --git a/src/lgdo/__init__.py b/src/lgdo/__init__.py
@@ -62,23 +62,23 @@
 )
 
 __all__ = [
+    "LGDO",
     "Array",
-    "ArrayOfEqualSizedArrays",
     "ArrayOfEncodedEqualSizedArrays",
+    "ArrayOfEqualSizedArrays",
     "FixedSizeArray",
     "Histogram",
-    "LGDO",
+    "LH5Iterator",
+    "LH5Store",
     "Scalar",
     "Struct",
     "Table",
-    "VectorOfVectors",
     "VectorOfEncodedVectors",
+    "VectorOfVectors",
     "WaveformTable",
+    "__version__",
     "load_dfs",
     "load_nda",
     "ls",
     "show",
-    "LH5Iterator",
-    "LH5Store",
-    "__version__",
 ]
diff --git a/src/lgdo/compression/__init__.py b/src/lgdo/compression/__init__.py
@@ -30,9 +30,9 @@
 from .varlen import ULEB128ZigZagDiff
 
 __all__ = [
-    "WaveformCodec",
-    "encode",
-    "decode",
     "RadwareSigcompress",
     "ULEB128ZigZagDiff",
+    "WaveformCodec",
+    "decode",
+    "encode",
 ]
diff --git a/src/lgdo/lh5/__init__.py b/src/lgdo/lh5/__init__.py
@@ -20,10 +20,10 @@
     "LH5Store",
     "load_dfs",
     "load_nda",
+    "ls",
     "read",
-    "write",
     "read_as",
-    "ls",
     "read_n_rows",
     "show",
+    "write",
 ]
diff --git a/src/lgdo/lh5/_serializers/__init__.py b/src/lgdo/lh5/_serializers/__init__.py
@@ -24,22 +24,22 @@
 from .write.vector_of_vectors import _h5_write_vector_of_vectors
 
 __all__ = [
-    "_h5_read_lgdo",
-    "_h5_read_vector_of_vectors",
-    "_h5_read_ndarray",
     "_h5_read_array",
+    "_h5_read_array_of_encoded_equalsized_arrays",
+    "_h5_read_array_of_equalsized_arrays",
     "_h5_read_encoded_array",
     "_h5_read_fixedsize_array",
-    "_h5_read_array_of_equalsized_arrays",
-    "_h5_read_struct",
-    "_h5_read_table",
     "_h5_read_histogram",
+    "_h5_read_lgdo",
+    "_h5_read_ndarray",
     "_h5_read_scalar",
-    "_h5_read_array_of_encoded_equalsized_arrays",
+    "_h5_read_struct",
+    "_h5_read_table",
     "_h5_read_vector_of_encoded_vectors",
-    "_h5_write_scalar",
+    "_h5_read_vector_of_vectors",
     "_h5_write_array",
-    "_h5_write_vector_of_vectors",
-    "_h5_write_struct",
     "_h5_write_lgdo",
+    "_h5_write_scalar",
+    "_h5_write_struct",
+    "_h5_write_vector_of_vectors",
 ]
diff --git a/src/lgdo/lh5/_serializers/write/array.py b/src/lgdo/lh5/_serializers/write/array.py
@@ -64,6 +64,11 @@ def _h5_write_array(
         if "hdf5_settings" in obj.attrs:
             h5py_kwargs |= obj.attrs["hdf5_settings"]
 
+        # HACK: a tuple is strictly required for the "chunks" setting, but
+        # we'd like to accept a list too in some situations
+        if "chunks" in h5py_kwargs and isinstance(h5py_kwargs["chunks"], list):
+            h5py_kwargs["chunks"] = tuple(h5py_kwargs["chunks"])
+
         # create HDF5 dataset
         ds = group.create_dataset(name, data=nda, **h5py_kwargs)
 

diff --git a/src/lgdo/lh5/exceptions.py b/src/lgdo/lh5/exceptions.py
@@ -16,6 +16,9 @@ def __str__(self) -> str:
             + super().__str__()
         )
 
+    def __reduce__(self) -> tuple:  # for pickling.
+        return self.__class__, (*self.args, self.file, self.obj)
+
 
 class LH5EncodeError(Exception):
     def __init__(
@@ -32,3 +35,6 @@ def __str__(self) -> str:
             f"while writing object {self.group}/{self.name} to file {self.file}: "
             + super().__str__()
         )
+
+    def __reduce__(self) -> tuple:  # for pickling.
+        return self.__class__, (*self.args, self.file, self.group, self.name)
diff --git a/src/lgdo/types/__init__.py b/src/lgdo/types/__init__.py
@@ -15,17 +15,17 @@
 from .waveformtable import WaveformTable
 
 __all__ = [
+    "LGDO",
     "Array",
-    "ArrayOfEqualSizedArrays",
     "ArrayOfEncodedEqualSizedArrays",
+    "ArrayOfEqualSizedArrays",
     "FixedSizeArray",
     "Histogram",
-    "LGDO",
     "Scalar",
     "Struct",
     "Table",
-    "VectorOfVectors",
     "VectorOfEncodedVectors",
+    "VectorOfVectors",
     "WaveformTable",
 ]
 

diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py
@@ -418,13 +418,18 @@ def fill(self, data, w: NDArray = None, keys: Sequence[str] = None) -> None:
 
     def __setitem__(self, name: str, obj: LGDO) -> None:
         # do not allow for new attributes on this
-        msg = "histogram fields cannot be mutated"
-        raise TypeError(msg)
+        known_keys = ("binning", "weights", "isdensity")
+        if name in known_keys and not dict.__contains__(self, name):
+            # but allow initialization while unpickling (after __init__() this is unreachable)
+            dict.__setitem__(self, name, obj)
+        else:
+            msg = "histogram fields cannot be mutated "
+            raise TypeError(msg)
 
     def __getattr__(self, name: str) -> None:
         # do not allow for new attributes on this
         msg = "histogram fields cannot be mutated"
-        raise TypeError(msg)
+        raise AttributeError(msg)
 
     def add_field(self, name: str | int, obj: LGDO) -> None:  # noqa: ARG002
         """

diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py
@@ -11,6 +11,12 @@
 class LGDO(ABC):
     """Abstract base class representing a LEGEND Data Object (LGDO)."""
 
+    def __new__(cls, *_args, **_kwargs):
+        # allow for (un-)pickling LGDO objects.
+        obj = super().__new__(cls)
+        obj.attrs = {}
+        return obj
+
     @abstractmethod
     def __init__(self, attrs: dict[str, Any] | None = None) -> None:
         self.attrs = {} if attrs is None else dict(attrs)

diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py
@@ -41,6 +41,12 @@ class Table(Struct):
     :meth:`__len__` to access valid data, which returns the ``size`` attribute.
     """
 
+    def __new__(cls, *args, **kwargs):
+        # allow for (un-)pickling LGDO objects.
+        obj = super().__new__(cls, *args, **kwargs)
+        obj.size = None
+        return obj
+
     def __init__(
         self,
         col_dict: Mapping[str, LGDO] | pd.DataFrame | ak.Array | None = None,

diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py
@@ -13,6 +13,7 @@
 import awkward_pandas as akpd
 import numpy as np
 import pandas as pd
+from numba import jit
 from numpy.typing import ArrayLike, DTypeLike, NDArray
 
 from .. import utils
@@ -560,18 +561,16 @@ def to_aoesa(
             compatible one.
         """
         if self.ndim == 2:
-            ak_arr = self.view_as("ak")
-
             if max_len is None:
-                max_len = int(ak.max(ak.count(ak_arr, axis=-1)))
-
-            nda = ak.fill_none(
-                ak.pad_none(ak_arr, max_len, clip=True), fill_val
-            ).to_numpy(allow_missing=False)
-
+                lens = np.copy(self.cumulative_length)
+                lens[1:] = lens[1:] - lens[:-1]
+                max_len = int(np.max(lens))
+            nda = np.full((len(self), max_len), fill_val)
             if preserve_dtype:
                 nda = nda.astype(self.flattened_data.dtype, copy=False)
 
+            _to_aoesa(self.flattened_data.nda, self.cumulative_length.nda, nda)
+
             return aoesa.ArrayOfEqualSizedArrays(nda=nda, attrs=self.getattrs())
 
         raise NotImplementedError
@@ -664,3 +663,11 @@ def view_as(
 
         msg = f"{library} is not a supported third-party format."
         raise ValueError(msg)
+
+
+@jit
+def _to_aoesa(flattened_array, cumulative_length, nda):
+    prev_cl = 0
+    for i, cl in enumerate(cumulative_length):
+        nda[i, : (cl - prev_cl)] = flattened_array[prev_cl:cl]
+        prev_cl = cl
diff --git a/tests/lh5/test_exceptions.py b/tests/lh5/test_exceptions.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+import pickle
+
+from lgdo.lh5.exceptions import LH5DecodeError, LH5EncodeError
+
+
+def test_pickle():
+    # test (un-)pickling of LH5 exceptions; e.g. for multiprocessing use.
+
+    ex = LH5EncodeError("message", "file", "group", "name")
+    ex = pickle.loads(pickle.dumps(ex))
+    assert isinstance(ex, LH5EncodeError)
+
+    ex = LH5DecodeError("message", "file", "obj")
+    ex = pickle.loads(pickle.dumps(ex))
+    assert isinstance(ex, LH5DecodeError)
diff --git a/tests/lh5/test_lh5_write.py b/tests/lh5/test_lh5_write.py
@@ -44,6 +44,19 @@ def test_write_with_hdf5_compression(lgnd_file, tmptestdir):
         assert h5f["/geds/raw/waveform/values"].compression is None
         assert h5f["/geds/raw/waveform/values"].shuffle is False
 
+    store.write(
+        wft.values,
+        "/geds/raw/waveform/values",
+        f"{tmptestdir}/tmp-pygama-hdf5-compressed-wfs.lh5",
+        wo_mode="overwrite_file",
+        chunks=[1, 10],
+        compression=None,
+        shuffle=False,
+    )
+    with h5py.File(f"{tmptestdir}/tmp-pygama-hdf5-compressed-wfs.lh5") as h5f:
+        assert h5f["/geds/raw/waveform/values"].compression is None
+        assert h5f["/geds/raw/waveform/values"].shuffle is False
+
 
 def test_write_empty_vov(tmptestdir):
     vov = types.VectorOfVectors(flattened_data=[], cumulative_length=[])

diff --git a/tests/types/test_array.py b/tests/types/test_array.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import pickle
+
 import awkward as ak
 import numpy as np
 import pandas as pd
@@ -61,3 +63,14 @@ def test_view():
 
     with pytest.raises(ValueError):
         a.view_as("ak", with_units=True)
+
+
+def test_pickle():
+    obj = Array(nda=np.array([1, 2, 3, 4]))
+    obj.attrs["attr1"] = 1
+
+    ex = pickle.loads(pickle.dumps(obj))
+    assert isinstance(ex, Array)
+    assert ex.attrs["attr1"] == 1
+    assert ex.attrs["datatype"] == obj.attrs["datatype"]
+    assert np.all(ex.nda == np.array([1, 2, 3, 4]))
diff --git a/tests/types/test_encoded.py b/tests/types/test_encoded.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import pickle
+
 import awkward as ak
 import awkward_pandas as akpd
 import numpy as np
@@ -285,3 +287,51 @@ def test_aoeesa_view_as():
 
     with pytest.raises(TypeError):
         df = voev.view_as("np")
+
+
+def test_aoeesa_pickle():
+    obj = ArrayOfEncodedEqualSizedArrays(
+        encoded_data=VectorOfVectors(
+            flattened_data=Array(nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1])),
+            cumulative_length=Array(nda=np.array([2, 5, 6, 10, 13])),
+        ),
+        decoded_size=99,
+    )
+
+    ex = pickle.loads(pickle.dumps(obj))
+
+    desired = [
+        [1, 2],
+        [3, 4, 5],
+        [2],
+        [4, 8, 9, 7],
+        [5, 3, 1],
+    ]
+
+    for i, v in enumerate(ex):
+        assert np.array_equal(v, desired[i])
+
+
+def test_voev_pickle():
+    obj = VectorOfEncodedVectors(
+        encoded_data=VectorOfVectors(
+            flattened_data=Array(nda=np.array([1, 2, 3, 4, 5, 2, 4, 8, 9, 7, 5, 3, 1])),
+            cumulative_length=Array(nda=np.array([2, 5, 6, 10, 13])),
+        ),
+        decoded_size=Array(shape=5, fill_val=6),
+        attrs={"units": "s"},
+    )
+
+    ex = pickle.loads(pickle.dumps(obj))
+
+    desired = [
+        [1, 2],
+        [3, 4, 5],
+        [2],
+        [4, 8, 9, 7],
+        [5, 3, 1],
+    ]
+
+    for i, (v, s) in enumerate(ex):
+        assert np.array_equal(v, desired[i])
+        assert s == 6