From 545d6dc39d68de96dd5ddfa95e1eb19a08961420 Mon Sep 17 00:00:00 2001
From: iguinn
Date: Fri, 11 Oct 2024 13:30:47 -0700
Subject: [PATCH 01/27] Array manages capacity separately from size

---
 src/lgdo/types/array.py | 72 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 61 insertions(+), 11 deletions(-)

diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py
index ec555bcf..ae9409e7 100644
--- a/src/lgdo/types/array.py
+++ b/src/lgdo/types/array.py
@@ -78,11 +78,7 @@ def __init__(
         elif isinstance(nda, Array):
             nda = nda.nda

-        elif not isinstance(nda, np.ndarray):
-            nda = np.array(nda)
-
         self.nda = nda
-        self.dtype = self.nda.dtype

         super().__init__(attrs)

@@ -96,18 +92,72 @@ def form_datatype(self) -> str:
         return dt + "<" + nd + ">{" + et + "}"

     def __len__(self) -> int:
-        return len(self.nda)
+        return self._size
+
+    @property
+    def nda(self):
+        return self._nda[: self._size, ...] if self._nda.shape != () else self._nda
+
+    @nda.setter
+    def nda(self, value):
+        self._nda = value if isinstance(value, np.ndarray) else np.array(value)
+        self._size = len(self._nda) if self._nda.shape != () else 0
+
+    @property
+    def dtype(self):
+        return self._nda.dtype
+
+    @property
+    def shape(self):
+        return (len(self),) + self._nda.shape[1:]
+
+    def set_capacity(self, capacity: int) -> None:
+        "Set capacity (number of rows) of internal memory buffer"
+        if capacity < len(self):
+            msg = "Cannot reduce capacity below Array length"
+            raise ValueError(msg)
+        self._nda.resize((capacity,) + self._nda.shape[1:], refcheck=True)
+
+    def get_capacity(self) -> int:
+        "Get capacity (i.e. max size before memory must be re-allocated)"
+        return len(self._nda)
+
+    def trim_capacity(self) -> None:
+        "Set capacity to the minimum needed to support the Array size"
+        self.set_capacity(len(self))
+
+    def resize(self, new_size: int, trim=False) -> None:
+        """Set size of Array in rows. Only change capacity if it must be
+        increased to accommodate new rows; in this case, grow to the next
+        power of two that fits. If trim is True, set capacity to match size."""
+
+        self._size = new_size
+
+        if trim and new_size != self.get_capacity():
+            self.set_capacity(new_size)
+
+        # If capacity is not big enough, set to next power of 2 big enough
+        if new_size > self.get_capacity():
+            self.set_capacity(int(2 ** (np.ceil(np.log2(new_size)))))

-    def resize(self, new_size: int) -> None:
-        new_shape = (new_size,) + self.nda.shape[1:]
-        return self.nda.resize(new_shape, refcheck=True)

     def append(self, value: np.ndarray) -> None:
-        self.resize(len(self) + 1)
-        self.nda[-1] = value
+        "Append value to end of array (with copy)"
+        self.insert(len(self), value)

     def insert(self, i: int, value: int | float) -> None:
-        self.nda = np.insert(self.nda, i, value)
+        "Insert value into row i (with copy)"
+        value = np.array(value)
+        if value.shape == self.shape[1:]:
+            self.resize(len(self) + 1)
+            self[i + 1 :] = self[i:-1]
+            self[i] = value
+        elif value.shape[1:] == self.shape[1:]:
+            self.resize(len(self) + len(value))
+            self[i + len(value) :] = self[i : -len(value)]
+            self[i : i + len(value)] = value
+        else:
+            msg = f"Could not insert value with shape {value.shape} into Array with shape {self.shape}"
+            raise ValueError(msg)

     def __getitem__(self, key):
         return self.nda[key]

From 980ad6a863dfba0515d358d29b4236192fe9534b Mon Sep 17 00:00:00 2001
From: iguinn
Date: Fri, 11 Oct 2024 13:31:30 -0700
Subject: [PATCH 02/27] VectorOfVectors dtype is a property

---
 src/lgdo/types/vectorofvectors.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py
index c4f543d6..f6d5575f 100644
--- a/src/lgdo/types/vectorofvectors.py
+++ b/src/lgdo/types/vectorofvectors.py
@@ -209,9 +209,6 @@ def __init__(
         elif self.flattened_data is None:
             self.flattened_data = flattened_data

-        # finally set dtype
-        self.dtype = self.flattened_data.dtype
-
         # set ndim
         self.ndim = 2
         pointer = self.flattened_data
@@ -224,6 +221,10 @@ def __init__(

         super().__init__(attrs)

+    @property
+    def dtype(self):
+        return self.flattened_data.dtype
+
     def datatype_name(self) -> str:
         return "array"

From d0d8ce203f0dfcb34f05c967e705793a6dbba612 Mon Sep 17 00:00:00 2001
From: iguinn
Date: Sun, 13 Oct 2024 10:13:24 -0700
Subject: [PATCH 03/27] Raise error on insert if i>len

---
 src/lgdo/types/array.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py
index ae9409e7..32343074 100644
--- a/src/lgdo/types/array.py
+++ b/src/lgdo/types/array.py
@@ -146,6 +146,10 @@ def append(self, value: np.ndarray) -> None:

     def insert(self, i: int, value: int | float) -> None:
         "Insert value into row i (with copy)"
+        if i > len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+
         value = np.array(value)
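
The three patches above decouple an Array's logical size from its buffer
capacity. A rough sketch of the intended usage pattern (illustrative only,
not part of the series; it assumes just the methods introduced above):

    import numpy as np
    from lgdo import Array

    a = Array(np.array([1, 2, 3]))  # size 3, capacity 3
    for i in range(4, 10):
        a.append(i)  # buffer reallocates only at powers of two: amortized O(1)
    assert len(a) == 9
    assert a.get_capacity() == 16  # next power of two that fits 9 rows
    a.trim_capacity()              # shrink the buffer to exactly len(a)
    assert a.get_capacity() == 9
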
From 075a4f1b7847629921e39eec1d1be5fa9090115b Mon Sep 17 00:00:00 2001
From: iguinn
Date: Sun, 13 Oct 2024 10:14:29 -0700
Subject: [PATCH 04/27] Add get/set_capacity to VoV and change modifiers to
 take advantage of changes to len/capacity management

---
 src/lgdo/types/vectorofvectors.py | 119 ++++++++++++------------------
 1 file changed, 47 insertions(+), 72 deletions(-)

diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py
index f6d5575f..985e9427 100644
--- a/src/lgdo/types/vectorofvectors.py
+++ b/src/lgdo/types/vectorofvectors.py
@@ -209,20 +209,14 @@ def __init__(
         elif self.flattened_data is None:
             self.flattened_data = flattened_data

-        # set ndim
-        self.ndim = 2
-        pointer = self.flattened_data
-        while True:
-            if isinstance(pointer, Array):
-                break
-
-            self.ndim += 1
-            pointer = pointer.flattened_data
-
         super().__init__(attrs)

     @property
-    def dtype(self):
+    def ndim(self):
+        return 1 + (1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim)
+
+    @property
+    def dtype(self) -> np.dtype:
         return self.flattened_data.dtype

     def datatype_name(self) -> str:
         return "array"

@@ -276,7 +270,27 @@ def __setitem__(self, i: int, new: NDArray) -> None:
         else:
             raise NotImplementedError

-    def resize(self, new_size: int) -> None:
+    def set_capacity(self, cap_cl, *cap_args) -> None:
+        """Set capacity of internal data arrays. Expect number of args to
+        equal `self.ndim`. First arg is capacity of cumulative length array.
+        If `self.ndim` is 2, second argument is capacity of flattened data,
+        otherwise arguments are fed recursively to remaining dimensions.
+        """
+        self.cumulative_length.set_capacity(cap_cl)
+        self.flattened_data.set_capacity(*cap_args)
+
+    def get_capacity(self) -> tuple[int, ...]:
+        """Get tuple containing capacity of each dimension. First dimension
+        is cumulative length array. Last dimension is flattened data.
+        """
+        fd_cap = self.flattened_data.get_capacity()
+        if isinstance(fd_cap, int):
+            return (self.cumulative_length.get_capacity(), fd_cap)
+        return (self.cumulative_length.get_capacity(), *fd_cap)
+
+    def trim_capacity(self) -> None:
+        "Set capacity for all dimensions to minimum needed to hold data"
+        self.cumulative_length.trim_capacity()
+        self.flattened_data.trim_capacity()
+
+    def resize(self, new_size: int, trim: bool = False) -> None:
         """Resize vector along the first axis.

         `self.flattened_data` is resized only if `new_size` is smaller than the
         current vector length.

         If `new_size` is larger than the current vector length,
         `self.cumulative_length` is padded with its last element. This
         corresponds to appending empty vectors.
+
+        If `trim` is ``True``, resize capacity to match the new size.

         Examples
         --------
         >>> vov = VectorOfVectors([[1, 2, 3], [4, 5], [3], [1], [9, 8]])
         >>> vov.resize(3)
         >>> print(vov)
         [[1 2 3],
          [4 5],
          [3],
         ]
         """
-        vidx = self.cumulative_length
         old_s = len(self)
-        dlen = new_size - old_s
-        csum = vidx[-1] if len(self) > 0 else 0

         # first resize the cumulative length
-        self.cumulative_length.resize(new_size)
+        self.cumulative_length.resize(new_size, trim)

         # if new_size > size, new elements are filled with zeros, let's fix
         # that
-        if dlen > 0:
-            self.cumulative_length[old_s:] = csum
+        if new_size > old_s:
+            self.cumulative_length[old_s:] = (
+                self.cumulative_length[old_s - 1] if old_s > 0 else 0
+            )

-        # then resize the data array
-        # if dlen > 0 this has no effect
+        # then resize the data array to match the new cumulative length
         if len(self.cumulative_length) > 0:
-            self.flattened_data.resize(self.cumulative_length[-1])
+            self.flattened_data.resize(self.cumulative_length[-1], trim)

     def append(self, new: NDArray) -> None:
         """Append a 1D vector `new` at the end.

         Examples
         --------
         >>> from lgdo import VectorOfVectors
         >>> v = VectorOfVectors([[1, 2, 3], [4, 5]])
         >>> v.append([8, 9])
         >>> print(v)
         [[1 2 3],
          [4 5],
          [8 9],
         ]
         """
-        if self.ndim == 2:
-            # first extend cumulative_length by +1
-            self.cumulative_length.resize(len(self) + 1)
-            # set it at the right value
-            newlen = (
-                self.cumulative_length[-2] + len(new) if len(self) > 1 else len(new)
-            )
-            self.cumulative_length[-1] = newlen
-            # then resize flattened_data to accommodate the new vector
-            self.flattened_data.resize(len(self.flattened_data) + len(new))
-            # finally set it
-            self[-1] = new
-        else:
-            raise NotImplementedError
+        self.insert(len(self), new)

     def insert(self, i: int, new: NDArray) -> None:
         """Insert a vector at index `i`. 
@@ -364,23 +364,15 @@ def insert(self, i: int, new: NDArray) -> None: [8 9], [4 5], ] - - Warning - ------- - This method involves a significant amount of memory re-allocation and - is expected to perform poorly on large vectors. """ if self.ndim == 2: - if i >= len(self): - msg = f"index {i} is out of bounds for vector owith size {len(self)}" + if i > len(self): + msg = f"index {i} is out of bounds for vector with size {len(self)}" raise IndexError(msg) - self.flattened_data = Array( - np.insert(self.flattened_data, self.cumulative_length[i - 1], new) - ) - self.cumulative_length = Array( - np.insert(self.cumulative_length, i, self.cumulative_length[i - 1]) - ) + i_start = 0 if i==0 else self.cumulative_length[i-1] + self.flattened_data.insert(i_start, new) + self.cumulative_length.insert(i, i_start) self.cumulative_length[i:] += np.uint32(len(new)) else: raise NotImplementedError @@ -400,11 +392,6 @@ def replace(self, i: int, new: NDArray) -> None: [[8 9], [4 5], ] - - Warning - ------- - This method involves a significant amount of memory re-allocation and - is expected to perform poorly on large vectors. """ if self.ndim == 2: if i >= len(self): @@ -414,27 +401,15 @@ def replace(self, i: int, new: NDArray) -> None: vidx = self.cumulative_length dlen = len(new) - len(self[i]) - if dlen == 0: - # don't waste resources - self[i] = new - elif dlen < 0: - start = vidx[i - 1] - stop = start + len(new) - # set the already allocated indices - self.flattened_data[start:stop] = new - # then delete the extra indices - self.flattened_data = Array( - np.delete(self.flattened_data, np.s_[stop : vidx[i]]) - ) - else: - # set the already allocated indices - self.flattened_data[vidx[i - 1] : vidx[i]] = new[: len(self[i])] - # then insert the remaining - self.flattened_data = Array( - np.insert(self.flattened_data, vidx[i], new[len(self[i]) :]) - ) - - vidx[i:] = vidx[i:] + dlen + if dlen != 0: + # move the subsequent entries + vidx[i:] += dlen + self.flattened_data.resize(vidx[-1]) + self.flattened_data._nda[vidx[i]:vidx[-1]] = self.flattened_data._nda[vidx[i]-dlen:vidx[-1]-dlen] + + # set the already allocated indices + start = vidx[i - 1] if i>0 else 0 + self.flattened_data[start:vidx[i]] = new else: raise NotImplementedError From 1e1fddaba1f29177eeb1f304c9998ea846261e6a Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 10:19:15 -0700 Subject: [PATCH 05/27] Modify core.read and store.read to resize array when filling in place instead of returning n_read --- src/lgdo/cli.py | 6 +- src/lgdo/lh5/core.py | 42 +++--- src/lgdo/lh5/iterator.py | 33 ++--- src/lgdo/lh5/store.py | 78 ++--------- src/lgdo/types/histogram.py | 4 +- tests/compression/conftest.py | 2 +- tests/compression/test_radware_sigcompress.py | 2 +- tests/lh5/conftest.py | 2 +- tests/lh5/test_core.py | 2 +- tests/lh5/test_lh5_iterator.py | 17 ++- tests/lh5/test_lh5_store.py | 128 +++++++++--------- tests/lh5/test_lh5_write.py | 24 ++-- tests/test_cli.py | 18 +-- tests/types/test_histogram.py | 4 +- tests/types/test_vectorofvectors.py | 2 +- 15 files changed, 152 insertions(+), 212 deletions(-) diff --git a/src/lgdo/cli.py b/src/lgdo/cli.py index 6563fd66..73105738 100644 --- a/src/lgdo/cli.py +++ b/src/lgdo/cli.py @@ -227,10 +227,10 @@ def lh5concat(args=None): continue # read as little as possible - obj, _ = store.read(current, h5f0, n_rows=1) + obj = store.read(current, h5f0, n_rows=1) if isinstance(obj, (Table, Array, VectorOfVectors)): # read all! 
- obj, _ = store.read(current, h5f0) + obj = store.read(current, h5f0) lgdos[current] = obj break @@ -292,7 +292,7 @@ def _inplace_table_filter(name, table, obj_list): log.info(msg) for name in lgdos: - obj, _ = store.read(name, file) + obj = store.read(name, file) # need to remove nested LGDOs from obj too before appending if isinstance(obj, Table): _inplace_table_filter(name, obj, obj_list) diff --git a/src/lgdo/lh5/core.py b/src/lgdo/lh5/core.py index 80132376..fc97338d 100644 --- a/src/lgdo/lh5/core.py +++ b/src/lgdo/lh5/core.py @@ -119,11 +119,12 @@ def read( lh5_file = h5py.File(lh5_file, mode="r", locking=locking) lh5_obj = lh5_file[name] else: - lh5_files = list(lh5_file) - n_rows_read = 0 - obj_buf_is_new = False - - for i, h5f in enumerate(lh5_files): + if obj_buf is not None: + obj_buf.resize(obj_buf_start) + else: + obj_buf_start = 0 + + for i, h5f in enumerate(lh5_file): if ( isinstance(idx, (list, tuple)) and len(idx) > 0 @@ -145,33 +146,26 @@ def read( idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i else: idx_i = None - n_rows_i = n_rows - n_rows_read - obj_ret = read( + obj_buf_start_i = len(obj_buf) if obj_buf else 0 + n_rows_i = n_rows - (obj_buf_start_i - obj_buf_start) + + obj_buf = read( name, h5f, - start_row, + start_row if i==0 else 0, n_rows_i, idx_i, use_h5idx, field_mask, obj_buf, - obj_buf_start, + obj_buf_start_i, decompress, ) - if isinstance(obj_ret, tuple): - obj_buf, n_rows_read_i = obj_ret - obj_buf_is_new = True - else: - obj_buf = obj_ret - n_rows_read_i = len(obj_buf) - n_rows_read += n_rows_read_i - if n_rows_read >= n_rows or obj_buf is None: - return obj_buf, n_rows_read - start_row = 0 - obj_buf_start += n_rows_read_i - return obj_buf if obj_buf_is_new else (obj_buf, n_rows_read) + if obj_buf is None or (len(obj_buf) - obj_buf_start) >= n_rows: + return obj_buf + return obj_buf if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]): idx = idx[0] @@ -188,8 +182,12 @@ def read( obj_buf_start=obj_buf_start, decompress=decompress, ) + try: + obj.resize(obj_buf_start + n_rows_read) + except AttributeError: + pass - return obj if obj_buf is None else (obj, n_rows_read) + return obj def write( diff --git a/src/lgdo/lh5/iterator.py b/src/lgdo/lh5/iterator.py index efced94c..88c2573c 100644 --- a/src/lgdo/lh5/iterator.py +++ b/src/lgdo/lh5/iterator.py @@ -21,7 +21,7 @@ class LH5Iterator(typing.Iterator): This class can be used either for random access: - >>> lh5_obj, n_rows = lh5_it.read(entry) + >>> lh5_obj = lh5_it.read(entry) to read the block of entries starting at entry. In case of multiple files or the use of an event selection, entry refers to a global event index @@ -29,7 +29,7 @@ class LH5Iterator(typing.Iterator): This can also be used as an iterator: - >>> for lh5_obj, entry, n_rows in LH5Iterator(...): + >>> for lh5_obj, entry in LH5Iterator(...): >>> # do the thing! This is intended for if you are reading a large quantity of data but @@ -129,7 +129,6 @@ def __init__( msg = f"can't open any files from {lh5_files}" raise RuntimeError(msg) - self.n_rows = 0 self.current_entry = 0 self.next_entry = 0 @@ -235,11 +234,10 @@ def get_global_entrylist(self) -> np.ndarray: ) return self.global_entry_list - def read(self, entry: int) -> tuple[LGDO, int]: - """Read the nextlocal chunk of events, starting at entry. Return the - LH5 buffer and number of rows read.""" - self.n_rows = 0 + def read(self, entry: int) -> LGDO: + "Read the nextlocal chunk of events, starting at entry." 
i_file = np.searchsorted(self.entry_map, entry, "right") + self.lh5_buffer.resize(0) # if file hasn't been opened yet, search through files # sequentially until we find the right one @@ -250,10 +248,10 @@ def read(self, entry: int) -> tuple[LGDO, int]: i_file += 1 if i_file == len(self.lh5_files): - return (self.lh5_buffer, self.n_rows) + return self.lh5_buffer local_entry = entry - self._get_file_cumentries(i_file - 1) - while self.n_rows < self.buffer_len and i_file < len(self.file_map): + while len(self.lh5_buffer) < self.buffer_len and i_file < len(self.file_map): # Loop through files local_idx = self.get_file_entrylist(i_file) if local_idx is not None and len(local_idx) == 0: @@ -262,18 +260,17 @@ def read(self, entry: int) -> tuple[LGDO, int]: continue i_local = local_idx[local_entry] if local_idx is not None else local_entry - self.lh5_buffer, n_rows = self.lh5_st.read( + self.lh5_buffer = self.lh5_st.read( self.groups[i_file], self.lh5_files[i_file], start_row=i_local, - n_rows=self.buffer_len - self.n_rows, + n_rows=self.buffer_len - len(self.lh5_buffer), idx=local_idx, field_mask=self.field_mask, obj_buf=self.lh5_buffer, - obj_buf_start=self.n_rows, + obj_buf_start=len(self.lh5_buffer), ) - self.n_rows += n_rows i_file += 1 local_entry = 0 @@ -282,7 +279,7 @@ def read(self, entry: int) -> tuple[LGDO, int]: if self.friend is not None: self.friend.read(entry) - return (self.lh5_buffer, self.n_rows) + return self.lh5_buffer def reset_field_mask(self, mask): """Replaces the field mask of this iterator and any friends with mask""" @@ -307,8 +304,8 @@ def __iter__(self) -> typing.Iterator: def __next__(self) -> tuple[LGDO, int, int]: """Read next buffer_len entries and return lh5_table, iterator entry and n_rows read.""" - buf, n_rows = self.read(self.next_entry) - self.next_entry = self.current_entry + n_rows - if n_rows == 0: + buf = self.read(self.next_entry) + if len(buf) == 0: raise StopIteration - return (buf, self.current_entry, n_rows) + self.next_entry = self.current_entry + len(buf) + return (buf, self.current_entry) diff --git a/src/lgdo/lh5/store.py b/src/lgdo/lh5/store.py index eab09ed6..a1149e6b 100644 --- a/src/lgdo/lh5/store.py +++ b/src/lgdo/lh5/store.py @@ -19,6 +19,7 @@ from .. import types from . import _serializers, utils +from .core import read log = logging.getLogger(__name__) @@ -150,7 +151,7 @@ def get_buffer( """Returns an LH5 object appropriate for use as a pre-allocated buffer in a read loop. Sets size to `size` if object has a size. 
""" - obj, n_rows = self.read(name, lh5_file, n_rows=0, field_mask=field_mask) + obj = self.read(name, lh5_file, n_rows=0, field_mask=field_mask) if hasattr(obj, "resize") and size is not None: obj.resize(new_size=size) return obj @@ -177,69 +178,20 @@ def read( """ # grab files from store if isinstance(lh5_file, (str, h5py.File)): - lh5_obj = self.gimme_file(lh5_file, "r", **file_kwargs)[name] + h5f = self.gimme_file(lh5_file, "r", **file_kwargs) else: - lh5_files = list(lh5_file) - n_rows_read = 0 - - for i, h5f in enumerate(lh5_files): - if ( - isinstance(idx, (list, tuple)) - and len(idx) > 0 - and not np.isscalar(idx[0]) - ): - # a list of lists: must be one per file - idx_i = idx[i] - elif idx is not None: - # make idx a proper tuple if it's not one already - if not (isinstance(idx, tuple) and len(idx) == 1): - idx = (idx,) - # idx is a long continuous array - n_rows_i = utils.read_n_rows(name, h5f) - # find the length of the subset of idx that contains indices - # that are less than n_rows_i - n_rows_to_read_i = bisect.bisect_left(idx[0], n_rows_i) - # now split idx into idx_i and the remainder - idx_i = np.array(idx[0])[:n_rows_to_read_i] - idx = np.array(idx[0])[n_rows_to_read_i:] - n_rows_i - else: - idx_i = None - n_rows_i = n_rows - n_rows_read - - obj_buf, n_rows_read_i = self.read( - name, - h5f, - start_row, - n_rows_i, - idx_i, - use_h5idx, - field_mask, - obj_buf, - obj_buf_start, - decompress, - ) - - n_rows_read += n_rows_read_i - if n_rows_read >= n_rows or obj_buf is None: - return obj_buf, n_rows_read - start_row = 0 - obj_buf_start += n_rows_read_i - return obj_buf, n_rows_read - - if isinstance(idx, (list, tuple)) and len(idx) > 0 and not np.isscalar(idx[0]): - idx = idx[0] - return _serializers._h5_read_lgdo( - lh5_obj.id, - lh5_obj.file.filename, - lh5_obj.name, - start_row=start_row, - n_rows=n_rows, - idx=idx, - use_h5idx=use_h5idx, - field_mask=field_mask, - obj_buf=obj_buf, - obj_buf_start=obj_buf_start, - decompress=decompress, + h5f = [self.gimme_file(f, "r", **file_kwargs) for f in lh5_file] + return read( + name, + h5f, + start_row, + n_rows, + idx, + use_h5idx, + field_mask, + obj_buf, + obj_buf_start, + decompress, ) def write( diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py index 7efde7d6..b2311306 100644 --- a/src/lgdo/types/histogram.py +++ b/src/lgdo/types/histogram.py @@ -318,12 +318,12 @@ def binning(self) -> tuple[Histogram.Axis, ...]: def __setitem__(self, name: str, obj: LGDO) -> None: # do not allow for new attributes on this msg = "histogram fields cannot be mutated" - raise TypeError(msg) + raise AttributeError(msg) def __getattr__(self, name: str) -> None: # do not allow for new attributes on this msg = "histogram fields cannot be mutated" - raise TypeError(msg) + raise AttributeError(msg) def add_field(self, name: str | int, obj: LGDO) -> None: # noqa: ARG002 """ diff --git a/tests/compression/conftest.py b/tests/compression/conftest.py index a2451579..cb96d622 100644 --- a/tests/compression/conftest.py +++ b/tests/compression/conftest.py @@ -8,7 +8,7 @@ @pytest.fixture() def wftable(lgnd_test_data): store = lh5.LH5Store() - wft, _ = store.read( + wft = store.read( "/geds/raw/waveform", lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5"), ) diff --git a/tests/compression/test_radware_sigcompress.py b/tests/compression/test_radware_sigcompress.py index 8387bea6..f54455dc 100644 --- a/tests/compression/test_radware_sigcompress.py +++ b/tests/compression/test_radware_sigcompress.py @@ -182,7 
+182,7 @@ def test_aoesa(wftable): def test_performance(lgnd_test_data): store = lh5.LH5Store() - obj, _ = store.read( + obj = store.read( "/geds/raw/waveform", lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5"), ) diff --git a/tests/lh5/conftest.py b/tests/lh5/conftest.py index 772c0a8a..fa3bf1c4 100644 --- a/tests/lh5/conftest.py +++ b/tests/lh5/conftest.py @@ -120,7 +120,7 @@ def lh5_file(tmptestdir): @pytest.fixture(scope="module") def enc_lgnd_file(lgnd_file, tmptestdir): store = lh5.LH5Store() - wft, n_rows = store.read("/geds/raw/waveform", lgnd_file) + wft = store.read("/geds/raw/waveform", lgnd_file) wft.values.attrs["compression"] = compression.RadwareSigcompress(codec_shift=-32768) store.write( wft, diff --git a/tests/lh5/test_core.py b/tests/lh5/test_core.py index 2db57634..7481feb7 100644 --- a/tests/lh5/test_core.py +++ b/tests/lh5/test_core.py @@ -30,7 +30,7 @@ def test_write(tmptestdir): def test_read_as(lh5_file): store = lh5.LH5Store() - obj1, _ = store.read("/data/struct/table", lh5_file, start_row=1) + obj1 = store.read("/data/struct/table", lh5_file, start_row=1) obj1 = obj1.view_as("pd", with_units=True) obj2 = lh5.read_as( diff --git a/tests/lh5/test_lh5_iterator.py b/tests/lh5/test_lh5_iterator.py index 0f934b3a..59b32cf3 100644 --- a/tests/lh5/test_lh5_iterator.py +++ b/tests/lh5/test_lh5_iterator.py @@ -23,17 +23,16 @@ def test_basics(lgnd_file): buffer_len=5, ) - lh5_obj, n_rows = lh5_it.read(4) - assert n_rows == 5 + lh5_obj = lh5_it.read(4) + assert len(lh5_obj) == 5 assert isinstance(lh5_obj, lgdo.Table) assert list(lh5_obj.keys()) == ["baseline"] assert ( lh5_obj["baseline"].nda == np.array([14353, 14254, 14525, 11656, 13576]) ).all() - for lh5_obj, entry, n_rows in lh5_it: + for lh5_obj, entry in lh5_it: assert len(lh5_obj) == 5 - assert n_rows == 5 assert entry % 5 == 0 @@ -73,7 +72,7 @@ def test_lgnd_waveform_table_fancy_idx(lgnd_file): buffer_len=5, ) - lh5_obj, n_rows = lh5_it.read(0) + lh5_obj = lh5_it.read(0) assert isinstance(lh5_obj, lgdo.WaveformTable) assert len(lh5_obj) == 5 @@ -115,9 +114,9 @@ def test_friend(more_lgnd_files): friend=lh5_raw_it, ) - lh5_obj, n_rows = lh5_it.read(0) + lh5_obj = lh5_it.read(0) - assert n_rows == 5 + assert len(lh5_obj) == 5 assert isinstance(lh5_obj, lgdo.Table) assert set(lh5_obj.keys()) == {"waveform", "baseline", "is_valid_0vbb"} @@ -133,7 +132,7 @@ def test_iterate(more_lgnd_files): buffer_len=5, ) - for lh5_out, entry, n_rows in lh5_it: + for lh5_out, entry in lh5_it: assert set(lh5_out.keys()) == {"is_valid_0vbb", "timestamp", "zacEmax_ctc_cal"} assert entry % 5 == 0 - assert n_rows == 5 + assert len(lh5_out) == 5 diff --git a/tests/lh5/test_lh5_store.py b/tests/lh5/test_lh5_store.py index 2e33ec9d..63931d44 100644 --- a/tests/lh5/test_lh5_store.py +++ b/tests/lh5/test_lh5_store.py @@ -63,10 +63,9 @@ def test_get_buffer(lh5_file): def test_read_scalar(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/scalar", lh5_file) + lh5_obj = store.read("/data/struct/scalar", lh5_file) assert isinstance(lh5_obj, lgdo.Scalar) assert lh5_obj.value == 10 - assert n_rows == 1 assert lh5_obj.attrs["sth"] == 1 with h5py.File(lh5_file) as h5f: assert h5f["/data/struct/scalar"].compression is None @@ -74,63 +73,63 @@ def test_read_scalar(lh5_file): def test_read_array(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/array", lh5_file) + lh5_obj = store.read("/data/struct/array", lh5_file) assert isinstance(lh5_obj, types.Array) assert (lh5_obj.nda 
== np.array([2, 3, 4])).all() - assert n_rows == 3 + assert len(lh5_obj) == 3 with h5py.File(lh5_file) as h5f: assert ( h5f["/data/struct/array"].compression is DEFAULT_HDF5_SETTINGS["compression"] ) - lh5_obj, n_rows = store.read("/data/struct_full/array2d", lh5_file) + lh5_obj = store.read("/data/struct_full/array2d", lh5_file) assert isinstance(lh5_obj, types.Array) assert lh5_obj == types.Array(shape=(23, 56), fill_val=69, dtype=int) def test_read_array_slice(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct_full/array", lh5_file, start_row=1, n_rows=3 ) assert isinstance(lh5_obj, types.Array) - assert n_rows == 3 + assert len(lh5_obj) == 3 assert lh5_obj == lgdo.Array([2, 3, 4]) - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct_full/array", [lh5_file, lh5_file], start_row=1, n_rows=6 ) assert isinstance(lh5_obj, types.Array) - assert n_rows == 6 + assert len(lh5_obj) == 6 assert lh5_obj == lgdo.Array([2, 3, 4, 5, 1, 2]) def test_read_array_fancy_idx(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct_full/array", lh5_file, idx=[0, 3, 4]) + lh5_obj = store.read("/data/struct_full/array", lh5_file, idx=[0, 3, 4]) assert isinstance(lh5_obj, types.Array) - assert n_rows == 3 + assert len(lh5_obj) == 3 assert lh5_obj == lgdo.Array([1, 4, 5]) - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct_full/array", [lh5_file, lh5_file], idx=[[0, 3, 4], [0, 3, 4]] ) assert isinstance(lh5_obj, types.Array) - assert n_rows == 6 + assert len(lh5_obj) == 6 assert lh5_obj == lgdo.Array([1, 4, 5, 1, 4, 5]) def test_read_vov(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/vov", lh5_file) + lh5_obj = store.read("/data/struct/vov", lh5_file) assert isinstance(lh5_obj, types.VectorOfVectors) assert lh5_obj == lgdo.VectorOfVectors( [[3, 4, 5], [2], [4, 8, 9, 7]], attrs={"myattr": 2} ) - assert n_rows == 3 + assert len(lh5_obj) == 3 assert lh5_obj.attrs["myattr"] == 2 with h5py.File(lh5_file) as h5f: @@ -143,7 +142,7 @@ def test_read_vov(lh5_file): is DEFAULT_HDF5_SETTINGS["compression"] ) - lh5_obj, n_rows = store.read("/data/struct/vov3d", lh5_file) + lh5_obj = store.read("/data/struct/vov3d", lh5_file) assert isinstance(lh5_obj, types.VectorOfVectors) assert ak.all( @@ -154,26 +153,26 @@ def test_read_vov(lh5_file): def test_read_vov_fancy_idx(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct_full/vov", lh5_file, idx=[0], n_rows=1) + lh5_obj = store.read("/data/struct_full/vov", lh5_file, idx=[0], n_rows=1) assert isinstance(lh5_obj, types.VectorOfVectors) - lh5_obj, n_rows = store.read("/data/struct_full/vov", lh5_file, idx=[0, 2]) + lh5_obj = store.read("/data/struct_full/vov", lh5_file, idx=[0, 2]) assert isinstance(lh5_obj, types.VectorOfVectors) assert lh5_obj == types.VectorOfVectors([[1, 2], [2]], attrs={"myattr": 2}) - assert n_rows == 2 + assert len(lh5_obj) == 2 - lh5_obj, n_rows = store.read("/data/struct_full/vov3d", lh5_file, idx=[0, 2]) + lh5_obj = store.read("/data/struct_full/vov3d", lh5_file, idx=[0, 2]) assert isinstance(lh5_obj, types.VectorOfVectors) print(lh5_obj) assert lh5_obj == types.VectorOfVectors([[[1, 2], [3, 4, 5]], [[5, 3, 1]]]) - assert n_rows == 2 + assert len(lh5_obj) == 2 def test_read_voev(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/voev", lh5_file, decompress=False) + lh5_obj = store.read("/data/struct/voev", lh5_file, decompress=False) assert isinstance(lh5_obj, 
types.VectorOfEncodedVectors) desired = [np.array([3, 4, 5]), np.array([2]), np.array([4, 8, 9, 7])] @@ -181,13 +180,13 @@ def test_read_voev(lh5_file): for i in range(len(desired)): assert (desired[i] == lh5_obj[i][0]).all() - assert n_rows == 3 + assert len(lh5_obj) == 3 - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct/voev", [lh5_file, lh5_file], decompress=False ) assert isinstance(lh5_obj, types.VectorOfEncodedVectors) - assert n_rows == 6 + assert len(lh5_obj) == 6 with h5py.File(lh5_file) as h5f: assert h5f["/data/struct/voev/encoded_data/flattened_data"].compression is None @@ -203,7 +202,7 @@ def test_read_voev(lh5_file): def test_read_voev_fancy_idx(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct_full/voev", lh5_file, idx=[0, 2], decompress=False ) assert isinstance(lh5_obj, types.VectorOfEncodedVectors) @@ -213,38 +212,38 @@ def test_read_voev_fancy_idx(lh5_file): for i in range(len(desired)): assert (desired[i] == lh5_obj[i][0]).all() - assert n_rows == 2 + assert len(lh5_obj) == 2 def test_read_aoesa(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/aoesa", lh5_file) + lh5_obj = store.read("/data/struct/aoesa", lh5_file) assert isinstance(lh5_obj, types.ArrayOfEqualSizedArrays) assert (lh5_obj.nda == np.full((3, 5), fill_value=42)).all() def test_read_table(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/table", lh5_file) + lh5_obj = store.read("/data/struct/table", lh5_file) assert isinstance(lh5_obj, types.Table) - assert n_rows == 3 + assert len(lh5_obj) == 3 - lh5_obj, n_rows = store.read("/data/struct/table", [lh5_file, lh5_file]) - assert n_rows == 6 + lh5_obj = store.read("/data/struct/table", [lh5_file, lh5_file]) + assert len(lh5_obj) == 6 assert lh5_obj.attrs["stuff"] == 5 assert lh5_obj["a"].attrs["attr"] == 9 def test_read_empty_struct(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/empty_struct", lh5_file) + lh5_obj = store.read("/data/struct/empty_struct", lh5_file) assert isinstance(lh5_obj, types.Struct) assert list(lh5_obj.keys()) == [] def test_read_hdf5_compressed_data(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/table", lh5_file) + lh5_obj = store.read("/data/struct/table", lh5_file) assert "compression" not in lh5_obj["b"].attrs with h5py.File(lh5_file) as h5f: @@ -260,12 +259,12 @@ def test_read_hdf5_compressed_data(lh5_file): def test_read_wftable(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/wftable", lh5_file) + lh5_obj = store.read("/data/struct/wftable", lh5_file) assert isinstance(lh5_obj, types.WaveformTable) - assert n_rows == 3 + assert len(lh5_obj) == 3 - lh5_obj, n_rows = store.read("/data/struct/wftable", [lh5_file, lh5_file]) - assert n_rows == 6 + lh5_obj = store.read("/data/struct/wftable", [lh5_file, lh5_file]) + assert len(lh5_obj) == 6 assert lh5_obj.values.attrs["custom"] == 8 with h5py.File(lh5_file) as h5f: @@ -285,35 +284,35 @@ def test_read_wftable(lh5_file): def test_read_wftable_encoded(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct/wftable_enc", lh5_file, decompress=False) + lh5_obj = store.read("/data/struct/wftable_enc", lh5_file, decompress=False) assert isinstance(lh5_obj, types.WaveformTable) assert isinstance(lh5_obj.values, types.ArrayOfEncodedEqualSizedArrays) - assert n_rows == 3 + assert len(lh5_obj) == 3 assert lh5_obj.values.attrs["codec"] == 
"radware_sigcompress" assert "codec_shift" in lh5_obj.values.attrs - lh5_obj, n_rows = store.read("/data/struct/wftable_enc/values", lh5_file) + lh5_obj = store.read("/data/struct/wftable_enc/values", lh5_file) assert isinstance(lh5_obj, lgdo.ArrayOfEqualSizedArrays) - assert n_rows == 3 + assert len(lh5_obj) == 3 - lh5_obj, n_rows = store.read("/data/struct/wftable_enc", lh5_file) + lh5_obj = store.read("/data/struct/wftable_enc", lh5_file) assert isinstance(lh5_obj, lgdo.WaveformTable) assert isinstance(lh5_obj.values, lgdo.ArrayOfEqualSizedArrays) - assert n_rows == 3 + assert len(lh5_obj) == 3 - lh5_obj_chain, n_rows = store.read( + lh5_obj_chain = store.read( "/data/struct/wftable_enc", [lh5_file, lh5_file], decompress=False ) - assert n_rows == 6 + assert len(lh5_obj) == 6 assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEncodedEqualSizedArrays) - lh5_obj_chain, n_rows = store.read( + lh5_obj_chain = store.read( "/data/struct/wftable_enc", [lh5_file, lh5_file], decompress=True ) assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEqualSizedArrays) assert np.array_equal(lh5_obj_chain.values[:3], lh5_obj.values) assert np.array_equal(lh5_obj_chain.values[3:], lh5_obj.values) - assert n_rows == 6 + assert len(lh5_obj) == 6 with h5py.File(lh5_file) as h5f: assert ( @@ -336,20 +335,20 @@ def test_read_wftable_encoded(lh5_file): def test_read_with_field_mask(lh5_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/data/struct_full", lh5_file, field_mask=["array"]) + lh5_obj = store.read("/data/struct_full", lh5_file, field_mask=["array"]) assert list(lh5_obj.keys()) == ["array"] - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct_full", lh5_file, field_mask=("array", "table") ) assert sorted(lh5_obj.keys()) == ["array", "table"] - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct_full", lh5_file, field_mask={"array": True} ) assert list(lh5_obj.keys()) == ["array"] - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/data/struct_full", lh5_file, field_mask={"vov": False, "voev": False} ) assert sorted(lh5_obj.keys()) == [ @@ -368,23 +367,21 @@ def test_read_with_field_mask(lh5_file): def test_read_lgnd_array(lgnd_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/geds/raw/baseline", lgnd_file) + lh5_obj = store.read("/geds/raw/baseline", lgnd_file) assert isinstance(lh5_obj, types.Array) - assert n_rows == 100 assert len(lh5_obj) == 100 - lh5_obj, n_rows = store.read("/geds/raw/waveform/values", lgnd_file) + lh5_obj = store.read("/geds/raw/waveform/values", lgnd_file) assert isinstance(lh5_obj, types.ArrayOfEqualSizedArrays) def test_read_lgnd_array_fancy_idx(lgnd_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/geds/raw/baseline", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68] ) assert isinstance(lh5_obj, types.Array) - assert n_rows == 7 assert len(lh5_obj) == 7 assert (lh5_obj.nda == [13508, 14353, 14525, 14341, 15079, 11675, 13995]).all() @@ -392,20 +389,18 @@ def test_read_lgnd_array_fancy_idx(lgnd_file): def test_read_lgnd_vov(lgnd_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/geds/raw/tracelist", lgnd_file) + lh5_obj = store.read("/geds/raw/tracelist", lgnd_file) assert isinstance(lh5_obj, types.VectorOfVectors) - assert n_rows == 100 assert len(lh5_obj) == 100 def test_read_lgnd_vov_fancy_idx(lgnd_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/geds/raw/tracelist", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68] ) assert 
isinstance(lh5_obj, types.VectorOfVectors) - assert n_rows == 7 assert len(lh5_obj) == 7 assert (lh5_obj.cumulative_length.nda == [1, 2, 3, 4, 5, 6, 7]).all() assert (lh5_obj.flattened_data.nda == [40, 60, 64, 60, 64, 28, 60]).all() @@ -413,19 +408,18 @@ def test_read_lgnd_vov_fancy_idx(lgnd_file): def test_read_array_concatenation(lgnd_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/geds/raw/baseline", [lgnd_file, lgnd_file]) + lh5_obj = store.read("/geds/raw/baseline", [lgnd_file, lgnd_file]) assert isinstance(lh5_obj, types.Array) - assert n_rows == 200 assert len(lh5_obj) == 200 def test_read_lgnd_waveform_table(lgnd_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read("/geds/raw/waveform", lgnd_file) + lh5_obj = store.read("/geds/raw/waveform", lgnd_file) assert isinstance(lh5_obj, types.WaveformTable) - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/geds/raw/waveform", lgnd_file, start_row=10, @@ -441,7 +435,7 @@ def test_read_lgnd_waveform_table(lgnd_file): def test_read_lgnd_waveform_table_fancy_idx(lgnd_file): store = lh5.LH5Store() - lh5_obj, n_rows = store.read( + lh5_obj = store.read( "/geds/raw/waveform", lgnd_file, idx=[7, 9, 25, 27, 33, 38, 46, 52, 57, 59, 67, 71, 72, 82, 90, 92, 93, 94, 97], @@ -452,6 +446,6 @@ def test_read_lgnd_waveform_table_fancy_idx(lgnd_file): def test_read_compressed_lgnd_waveform_table(lgnd_file, enc_lgnd_file): store = lh5.LH5Store() - wft, _ = store.read("/geds/raw/waveform", enc_lgnd_file) + wft = store.read("/geds/raw/waveform", enc_lgnd_file) assert isinstance(wft.values, types.ArrayOfEqualSizedArrays) assert "compression" not in wft.values.attrs diff --git a/tests/lh5/test_lh5_write.py b/tests/lh5/test_lh5_write.py index fd9604d4..97065552 100644 --- a/tests/lh5/test_lh5_write.py +++ b/tests/lh5/test_lh5_write.py @@ -17,7 +17,7 @@ def test_write_compressed_lgnd_waveform_table(enc_lgnd_file): # noqa: ARG001 def test_write_with_hdf5_compression(lgnd_file, tmptestdir): store = lh5.LH5Store() - wft, n_rows = store.read("/geds/raw/waveform", lgnd_file) + wft = store.read("/geds/raw/waveform", lgnd_file) store.write( wft, "/geds/raw/waveform", @@ -55,7 +55,7 @@ def test_write_empty_vov(tmptestdir): group="/data", ) - obj, _ = store.read("/data/vov", f"{tmptestdir}/tmp-pygama-lgdo-empty-vov.lh5") + obj = store.read("/data/vov", f"{tmptestdir}/tmp-pygama-lgdo-empty-vov.lh5") assert obj == vov @@ -123,7 +123,7 @@ def test_write_object_overwrite_table_no_deletion(caplog, tmptestdir): ] # Now, check that the data were overwritten - tb_dat, _ = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") + tb_dat = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") assert np.array_equal(tb_dat["dset1"].nda, np.ones(10)) @@ -149,7 +149,7 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir): ) # Now, try to overwrite with a different field # Now, check that the data were overwritten - tb_dat, _ = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") + tb_dat = store.read("my_group", f"{tmptestdir}/write_object_overwrite_test.lh5") assert np.array_equal(tb_dat["dset2"].nda, np.ones(10)) # Also make sure that the first table's fields aren't lurking around the lh5 file! 
@@ -180,7 +180,7 @@ def test_write_object_overwrite_table_with_deletion(caplog, tmptestdir): ) # Now, try to overwrite with a different field # Now, check that the data were overwritten - tb_dat, _ = store.read( + tb_dat = store.read( "my_group/my_table", f"{tmptestdir}/write_object_overwrite_test.lh5" ) assert np.array_equal(tb_dat["dset2"].nda, np.ones(10)) @@ -236,7 +236,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): ] # Now, check that the data were overwritten - tb_dat, _ = store.read( + tb_dat = store.read( "my_group/my_table", f"{tmptestdir}/write_object_overwrite_test.lh5" ) assert np.array_equal(tb_dat["values"].nda, np.ones((10, 10))) @@ -255,7 +255,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): ) # Now, check that the data were overwritten - array_dat, _ = store.read( + array_dat = store.read( "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5" ) expected_out_array = np.append(np.zeros(5), np.ones(20)) @@ -275,7 +275,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): ) # Now, check that the data were overwritten - scalar_dat, _ = store.read( + scalar_dat = store.read( "my_scalar", f"{tmptestdir}/write_object_overwrite_test.lh5" ) @@ -294,7 +294,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): write_start=1, ) # start overwriting the second list of lists - vector_dat, _ = store.read( + vector_dat = store.read( "my_vector", f"{tmptestdir}/write_object_overwrite_test.lh5" ) @@ -385,7 +385,7 @@ def test_write_object_append_column(tmptestdir): ) # Now, check that the data were appended - tb_dat, _ = store.read( + tb_dat = store.read( "my_group/my_table", f"{tmptestdir}/write_object_append_column_test.lh5" ) assert isinstance(tb_dat, types.Table) @@ -440,7 +440,7 @@ def test_write_histogram(caplog, tmptestdir): ) # Now, check that the data were overwritten - h3, _ = store.read( + h3 = store.read( "my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5" ) assert np.array_equal(h3.weights.nda, np.array([[10, 10], [10, 10]])) @@ -508,7 +508,7 @@ def test_write_histogram_variable(caplog, tmptestdir): ) # Now, check that the data were overwritten - h3, _ = store.read( + h3 = store.read( "my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5" ) assert np.array_equal(h3.weights.nda, np.array([[10, 10], [10, 10]])) diff --git a/tests/test_cli.py b/tests/test_cli.py index ac054854..2b98b987 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -74,10 +74,10 @@ def test_lh5concat(lgnd_test_data, tmptestdir): ] store = lh5.LH5Store() - tbl1, size = store.read("ch1057600/raw", infile1) - tbl2, size = store.read("ch1057600/raw", infile2) - tbl, size = store.read("ch1057600/raw", outfile) - assert size == 20 + tbl1 = store.read("ch1057600/raw", infile1) + tbl2 = store.read("ch1057600/raw", infile2) + tbl = store.read("ch1057600/raw", outfile) + assert len(tbl) == 20 for i in range(10): assert tbl.packet_id[i] == tbl1.packet_id[i] @@ -111,7 +111,7 @@ def test_lh5concat(lgnd_test_data, tmptestdir): "ch1057600/raw/waveform/values", ] - tbl, _ = store.read("ch1057600/raw", outfile) + tbl = store.read("ch1057600/raw", outfile) assert isinstance(tbl, types.Table) arg_list[4] = "--exclude" @@ -136,10 +136,10 @@ def test_lh5concat(lgnd_test_data, tmptestdir): "ch1057600/raw/waveform/t0", ] - tbl1, size = store.read("ch1059201/raw", infile1) - tbl2, size = store.read("ch1059201/raw", infile2) - tbl, size = store.read("ch1059201/raw", outfile) - assert size == 20 + tbl1 = store.read("ch1059201/raw", infile1) + tbl2 = 
store.read("ch1059201/raw", infile2) + tbl = store.read("ch1059201/raw", outfile) + assert len(tbl) == 20 for i in range(10): assert tbl.packet_id[i] == tbl1.packet_id[i] diff --git a/tests/types/test_histogram.py b/tests/types/test_histogram.py index c77878af..d922eb75 100644 --- a/tests/types/test_histogram.py +++ b/tests/types/test_histogram.py @@ -265,9 +265,9 @@ def test_view_as_np(): def test_not_like_table(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) assert h.form_datatype() == "struct{binning,weights,isdensity}" - with pytest.raises(TypeError): + with pytest.raises(AttributeError): x = h.x # noqa: F841 - with pytest.raises(TypeError): + with pytest.raises(AttributeError): h["x"] = Scalar(1.0) with pytest.raises(TypeError): h.add_field("x", Scalar(1.0)) diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 0948c7bc..8357a5c5 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -439,5 +439,5 @@ def test_lh5_iterator_view_as(lgnd_test_data): "ch1067205/dsp/energies", ) - for obj, _, _ in it: + for obj, _ in it: assert ak.is_valid(obj.view_as("ak")) From 23a03a6272f7be6a001037ef90b49af89aa7aab5 Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 11:27:34 -0700 Subject: [PATCH 06/27] Changed table to handle capacity and resizing similar to array --- src/lgdo/lh5/core.py | 11 +++------- src/lgdo/types/table.py | 44 ++++++++++++++++++++++++++++--------- tests/lh5/test_lh5_store.py | 6 +++++ tests/types/test_table.py | 42 +++++++++++++++++++++-------------- 4 files changed, 68 insertions(+), 35 deletions(-) diff --git a/src/lgdo/lh5/core.py b/src/lgdo/lh5/core.py index fc97338d..3a2c2cb1 100644 --- a/src/lgdo/lh5/core.py +++ b/src/lgdo/lh5/core.py @@ -92,8 +92,7 @@ def read( will be set to ``True``, while the rest will default to ``False``. obj_buf Read directly into memory provided in `obj_buf`. Note: the buffer - will be expanded to accommodate the data requested. To maintain the - buffer length, send in ``n_rows = len(obj_buf)``. + will be resized to accommodate the data retrieved. obj_buf_start Start location in ``obj_buf`` for read. For concatenating data to array-like objects. @@ -106,12 +105,8 @@ def read( Returns ------- - (object, n_rows_read) - `object` is the read-out object `n_rows_read` is the number of rows - successfully read out. Essential for arrays when the amount of data - is smaller than the object buffer. For scalars and structs - `n_rows_read` will be``1``. For tables it is redundant with - ``table.loc``. If `obj_buf` is ``None``, only `object` is returned. 
From 23a03a6272f7be6a001037ef90b49af89aa7aab5 Mon Sep 17 00:00:00 2001
From: iguinn
Date: Sun, 13 Oct 2024 11:27:34 -0700
Subject: [PATCH 06/27] Changed table to handle capacity and resizing similar
 to array

---
 src/lgdo/lh5/core.py        | 11 +++-------
 src/lgdo/types/table.py     | 44 ++++++++++++++++++++++---------
 tests/lh5/test_lh5_store.py |  6 +++++
 tests/types/test_table.py   | 42 +++++++++++++++++++--------------
 4 files changed, 68 insertions(+), 35 deletions(-)

diff --git a/src/lgdo/lh5/core.py b/src/lgdo/lh5/core.py
index fc97338d..3a2c2cb1 100644
--- a/src/lgdo/lh5/core.py
+++ b/src/lgdo/lh5/core.py
@@ -92,8 +92,7 @@ def read(
         will be set to ``True``, while the rest will default to ``False``.
     obj_buf
         Read directly into memory provided in `obj_buf`. Note: the buffer
-        will be expanded to accommodate the data requested. To maintain the
-        buffer length, send in ``n_rows = len(obj_buf)``.
+        will be resized to accommodate the data retrieved.
     obj_buf_start
         Start location in ``obj_buf`` for read. For concatenating data to
         array-like objects.
@@ -106,12 +105,8 @@ def read(

     Returns
     -------
-    (object, n_rows_read)
-        `object` is the read-out object `n_rows_read` is the number of rows
-        successfully read out. Essential for arrays when the amount of data
-        is smaller than the object buffer. For scalars and structs
-        `n_rows_read` will be``1``. For tables it is redundant with
-        ``table.loc``. If `obj_buf` is ``None``, only `object` is returned.
+    object
+        the read-out object
     """
     if isinstance(lh5_file, h5py.File):
         lh5_obj = lh5_file[name]
diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py
index 81c43bf3..b1bef50a 100644
--- a/src/lgdo/types/table.py
+++ b/src/lgdo/types/table.py
@@ -100,7 +100,30 @@ def __len__(self) -> int:
         """Provides ``__len__`` for this array-like class."""
         return self.size

-    def resize(self, new_size: int | None = None, do_warn: bool = False) -> None:
+    def set_capacity(self, capacity: int | ArrayLike) -> None:
+        "Set capacity (number of rows) of internal memory buffer"
+        if isinstance(capacity, int):
+            for obj in self.values():
+                obj.set_capacity(capacity)
+        else:
+            if len(capacity) != len(self.keys()):
+                msg = "List of capacities must have same length as number of keys"
+                raise ValueError(msg)
+
+            for obj, cap in zip(self.values(), capacity):
+                obj.set_capacity(cap)
+
+    def get_capacity(self) -> list:
+        "Get list of capacities for each key"
+        return [v.get_capacity() for v in self.values()]
+
+    def trim_capacity(self) -> None:
+        "Set capacity to the minimum needed to support the current size"
+        for v in self.values():
+            v.trim_capacity()
+
+    def resize(self, new_size: int | None = None, do_warn: bool = False, trim: bool = False) -> None:
         # if new_size = None, use the size from the first field
         for field, obj in self.items():
             if new_size is None:
@@ -112,19 +135,20 @@ def resize(self, new_size: int | None = None, do_warn: bool = False) -> None:
                     f"with size {len(obj)} != {new_size}"
                 )
             if isinstance(obj, Table):
-                obj.resize(new_size)
+                obj.resize(new_size, trim)
             else:
-                obj.resize(new_size)
+                obj.resize(new_size, trim)

         self.size = new_size

-    def push_row(self) -> None:
-        self.loc += 1
-
-    def is_full(self) -> bool:
-        return self.loc >= self.size
-
-    def clear(self) -> None:
-        self.loc = 0
+    def append(self, vals: dict) -> None:
+        "Append vals to end of table. Vals is a mapping from table key to val"
+        self.insert(len(self), vals)
+
+    def insert(self, i: int, vals: dict) -> None:
+        "Insert vals into table at row i. Vals is a mapping from table key to val"
+        for k, ar in self.items():
+            ar.insert(i, vals[k])
+        self.size += 1

     def add_field(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None:
         """Add a field (column) to the table.

         Use the name "field" here to match the terminology used in
diff --git a/tests/lh5/test_lh5_store.py b/tests/lh5/test_lh5_store.py
index 63931d44..99501832 100644
--- a/tests/lh5/test_lh5_store.py
+++ b/tests/lh5/test_lh5_store.py
@@ -132,6 +132,12 @@ def test_read_vov(lh5_file):
     assert len(lh5_obj) == 3
     assert lh5_obj.attrs["myattr"] == 2

+    lh5_obj = store.read("/data/struct/vov", [lh5_file, lh5_file])
+    assert len(lh5_obj) == 6
+    assert lh5_obj == lgdo.VectorOfVectors(
+        [[3, 4, 5], [2], [4, 8, 9, 7], [3, 4, 5], [2], [4, 8, 9, 7]], attrs={"myattr": 2}
+    )
+
     with h5py.File(lh5_file) as h5f:
         assert (
             h5f["/data/struct/vov/cumulative_length"].compression
diff --git a/tests/types/test_table.py b/tests/types/test_table.py
index efbb6234..f1cadbc1 100644
--- a/tests/types/test_table.py
+++ b/tests/types/test_table.py
@@ -78,25 +78,33 @@ def test_datatype_name():
     assert tbl.datatype_name() == "table"


-def test_push_row():
-    tbl = Table()
-    tbl.push_row()
-    assert tbl.loc == 1
-
-
-def test_is_full():
-    tbl = Table(size=2)
-    tbl.push_row()
-    assert tbl.is_full() is False
-    tbl.push_row()
-    assert tbl.is_full() is True
+def test_append():
+    col_dict = {
+        "a": lgdo.Array(nda=np.array([1, 2, 3, 4])),
+        "b": lgdo.Array(nda=np.array([5, 6, 7, 8])),
+    }

+    tbl = Table(col_dict=col_dict)
+    tbl.append({"a": -1, "b": -1})
+    assert len(tbl) == 5
+    assert tbl == Table(
+        {
+            "a": lgdo.Array(nda=np.array([1, 2, 3, 4, -1])),
+            "b": lgdo.Array(nda=np.array([5, 6, 7, 8, -1])),
+        }
+    )

-def test_clear():
-    tbl = Table()
-    tbl.push_row()
-    tbl.clear()
-    assert tbl.loc == 0
+
+def test_insert():
+    col_dict = {
+        "a": lgdo.Array(nda=np.array([1, 2, 3, 4])),
+        "b": lgdo.Array(nda=np.array([5, 6, 7, 8])),
+    }
+
+    tbl = Table(col_dict=col_dict)
+    tbl.insert(1, {"a": -1, "b": -1})
+    assert len(tbl) == 5
+    assert tbl == Table(
+        {
+            "a": lgdo.Array(nda=np.array([1, -1, 2, 3, 4])),
+            "b": lgdo.Array(nda=np.array([5, -1, 6, 7, 8])),
+        }
+    )


 def test_add_field():

From 32ceef9ef6592191d2877c38da426ce26bcc0b44 Mon Sep 17 00:00:00 2001
From: iguinn
Date: Sun, 13 Oct 2024 13:34:59 -0700
Subject: [PATCH 07/27] Fixed test

---
 tests/lh5/test_lh5_store.py | 4 ++--
 tests/types/test_table.py   | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/lh5/test_lh5_store.py b/tests/lh5/test_lh5_store.py
index 99501832..30077dac 100644
--- a/tests/lh5/test_lh5_store.py
+++ b/tests/lh5/test_lh5_store.py
@@ -309,7 +309,7 @@ def test_read_wftable_encoded(lh5_file):
     lh5_obj_chain = store.read(
         "/data/struct/wftable_enc", [lh5_file, lh5_file], decompress=False
     )
-    assert len(lh5_obj) == 6
+    assert len(lh5_obj_chain) == 6
     assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEncodedEqualSizedArrays)

     lh5_obj_chain = store.read(
@@ -318,7 +318,7 @@ def test_read_wftable_encoded(lh5_file):
     assert isinstance(lh5_obj_chain.values, lgdo.ArrayOfEqualSizedArrays)
     assert np.array_equal(lh5_obj_chain.values[:3], lh5_obj.values)
     assert np.array_equal(lh5_obj_chain.values[3:], lh5_obj.values)
-    assert len(lh5_obj) == 6
+    assert len(lh5_obj_chain) == 6

     with h5py.File(lh5_file) as h5f:
         assert (
diff --git a/tests/types/test_table.py b/tests/types/test_table.py
index f1cadbc1..cfdbe23d 100644
--- a/tests/types/test_table.py
+++ b/tests/types/test_table.py
@@ -14,7 +14,6 @@ def test_init():
     tbl = Table()
     assert not tbl.size
-    assert tbl.loc == 0

     tbl = Table(size=10)
     assert tbl.size == 10
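
Patches 06 and 07 replace the Table loc/push_row bookkeeping with explicit row
operations, which patch 08 below formalizes into an interface. A small usage
sketch (column names are arbitrary; it assumes only the Table methods added in
patch 06):

    import numpy as np
    from lgdo import Array, Table

    tbl = Table(col_dict={"a": Array(np.array([1, 2])), "b": Array(np.array([3.0, 4.0]))})
    tbl.append({"a": 5, "b": 6.0})     # grows every column by one row
    tbl.insert(0, {"a": 0, "b": 0.0})  # shifts existing rows, like Array.insert
    assert len(tbl) == 4
    assert tbl.get_capacity()[0] >= 4  # per-column capacities, as a list
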
From 8d7c1eb4af133fc585cc5aa415938cdc237beaf5 Mon Sep 17 00:00:00 2001
From: iguinn
Date: Sun, 13 Oct 2024 13:36:10 -0700
Subject: [PATCH 08/27] Added abstract base class for LGDO collections

---
 src/lgdo/lh5/_serializers/read/composite.py |  4 +-
 src/lgdo/types/array.py                     | 19 ++++--
 src/lgdo/types/encoded.py                   | 43 +++++++------
 src/lgdo/types/lgdo.py                      | 70 +++++++++++++++++++++
 src/lgdo/types/table.py                     | 27 ++++----
 src/lgdo/types/vectorofvectors.py           | 10 +--
 6 files changed, 125 insertions(+), 48 deletions(-)

diff --git a/src/lgdo/lh5/_serializers/read/composite.py b/src/lgdo/lh5/_serializers/read/composite.py
index 2674499e..d4c4464f 100644
--- a/src/lgdo/lh5/_serializers/read/composite.py
+++ b/src/lgdo/lh5/_serializers/read/composite.py
@@ -370,15 +370,13 @@ def _h5_read_table(
         table = Table(col_dict=col_dict, attrs=attrs)

         # set (write) loc to end of tree
-        table.loc = n_rows_read
+        table.resize(do_warn=True)
         return table, n_rows_read

     # We have read all fields into the object buffer. Run
     # checks: All columns should be the same size. So update
     # table's size as necessary, warn if any mismatches are found
     obj_buf.resize(do_warn=True)
-    # set (write) loc to end of tree
-    obj_buf.loc = obj_buf_start + n_rows_read

     # check attributes
     utils.check_obj_buf_attrs(obj_buf.attrs, attrs, fname, oname)
diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py
index 32343074..e567977c 100644
--- a/src/lgdo/types/array.py
+++ b/src/lgdo/types/array.py
@@ -17,12 +17,12 @@
 from .. import utils
 from ..units import default_units_registry as u
-from .lgdo import LGDO
+from .lgdo import LGDOCollection

 log = logging.getLogger(__name__)


-class Array(LGDO):
+class Array(LGDOCollection):
     r"""Holds an :class:`numpy.ndarray` and attributes.

     :class:`Array` (and the other various array types) holds an `nda` instead
@@ -111,7 +111,7 @@ def dtype(self):
     def shape(self):
         return (len(self),) + self._nda.shape[1:]

-    def set_capacity(self, capacity: int) -> None:
+    def reserve_capacity(self, capacity: int) -> None:
         "Set capacity (number of rows) of internal memory buffer"
         if capacity < len(self):
             msg = "Cannot reduce capacity below Array length"
             raise ValueError(msg)
@@ -124,7 +124,7 @@ def get_capacity(self) -> int:

     def trim_capacity(self) -> None:
         "Set capacity to the minimum needed to support the Array size"
-        self.set_capacity(len(self))
+        self.reserve_capacity(len(self))

     def resize(self, new_size: int, trim=False) -> None:
         """Set size of Array in rows. Only change capacity if it must be
         increased to accommodate new rows; in this case, grow to the next
         power of two that fits. If trim is True, set capacity to match size."""

         self._size = new_size

         if trim and new_size != self.get_capacity():
-            self.set_capacity(new_size)
+            self.reserve_capacity(new_size)

         # If capacity is not big enough, set to next power of 2 big enough
         if new_size > self.get_capacity():
-            self.set_capacity(int(2 ** (np.ceil(np.log2(new_size)))))
+            self.reserve_capacity(int(2 ** (np.ceil(np.log2(new_size)))))
@@ -163,6 +163,13 @@ def insert(self, i: int, value: int | float) -> None:
             msg = f"Could not insert value with shape {value.shape} into Array with shape {self.shape}"
             raise ValueError(msg)

+    def replace(self, i: int, value: int | float) -> None:
+        "Replace value at row i"
+        if i >= len(self):
+            msg = f"index {i} is out of bounds for array with size {len(self)}"
+            raise IndexError(msg)
+        self[i] = value
+
     def __getitem__(self, key):
         return self.nda[key]
diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py
index 5fe38249..15d6dadb 100644
--- a/src/lgdo/types/encoded.py
+++ b/src/lgdo/types/encoded.py
@@ -11,12 +11,12 @@
 from .. import utils
 from .array import Array
-from .lgdo import LGDO
+from .lgdo import LGDOCollection
 from .scalar import Scalar
 from .vectorofvectors import VectorOfVectors


-class VectorOfEncodedVectors(LGDO):
+class VectorOfEncodedVectors(LGDOCollection):
     """An array of variable-length encoded arrays.

     Used to represent an encoded :class:`.VectorOfVectors`. In addition to an
@@ -92,6 +92,17 @@ def __eq__(self, other: VectorOfEncodedVectors) -> bool:

         return False

+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+        self.decoded_size.reserve_capacity(capacity[0])
+
+    def get_capacity(self) -> tuple:
+        return (self.decoded_size.get_capacity(), *self.encoded_data.get_capacity())
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+        self.decoded_size.trim_capacity()
+
     def resize(self, new_size: int) -> None:
         """Resize vector along the first axis.

@@ -102,21 +113,6 @@ def resize(self, new_size: int) -> None:
         self.encoded_data.resize(new_size)
         self.decoded_size.resize(new_size)

-    def append(self, value: tuple[NDArray, int]) -> None:
-        """Append a 1D encoded vector at the end.
-
-        Parameters
-        ----------
-        value
-            a tuple holding the encoded array and its decoded size.
-
-        See Also
-        --------
-        .VectorOfVectors.append
-        """
-        self.encoded_data.append(value[0])
-        self.decoded_size.append(value[1])
-
     def insert(self, i: int, value: tuple[NDArray, int]) -> None:
         """Insert an encoded vector at index `i`.

@@ -282,7 +278,7 @@ def view_as(
         raise ValueError(msg)


-class ArrayOfEncodedEqualSizedArrays(LGDO):
+class ArrayOfEncodedEqualSizedArrays(LGDOCollection):
     """An array of encoded arrays with equal decoded size.

     Used to represent an encoded :class:`.ArrayOfEqualSizedArrays`. In addition
@@ -349,7 +345,16 @@ def __eq__(self, other: ArrayOfEncodedEqualSizedArrays) -> bool:

         return False

+    def reserve_capacity(self, *capacity: int) -> None:
+        self.encoded_data.reserve_capacity(*capacity)
+
+    def get_capacity(self) -> tuple:
+        return self.encoded_data.get_capacity()
+
+    def trim_capacity(self) -> None:
+        self.encoded_data.trim_capacity()
+
-    def resize(self, new_size: int) -> None:
+    def resize(self, new_size: int, trim: bool = False) -> None:
         """Resize array along the first axis.

         See Also
diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py
index 4a965c04..18c7e7a4 100644
--- a/src/lgdo/types/lgdo.py
+++ b/src/lgdo/types/lgdo.py
@@ -86,3 +86,73 @@ def __str__(self) -> str:

     def __repr__(self) -> str:
         return self.__class__.__name__ + f"(attrs={self.attrs!r})"
+
+
+class LGDOCollection(LGDO):
+    """Abstract base class representing a LEGEND Collection Object (LGDO).
+    This defines the interface for classes used as table columns.
+    """
+
+    @abstractmethod
+    def __init__(self, attrs: dict[str, Any] | None = None) -> None:
+        super().__init__(attrs)
+
+    @abstractmethod
+    def __len__(self) -> int:
+        """Provides ``__len__`` for this array-like class."""
+
+    @abstractmethod
+    def reserve_capacity(self, capacity: int) -> None:
+        """Reserve capacity (in rows) for later use. Internal memory buffers
+        will have enough entries to store this many rows.
+ """ + + @abstractmethod + def get_capacity(self) -> int: + "get reserved capacity of internal memory buffers in rows" + + @abstractmethod + def trim_capacity(self) -> None: + """set capacity to only what is required to store current contents + of LGDOCollection + """ + + @abstractmethod + def resize(self, new_size: int, trim: bool = False) -> None: + """Return this LGDO's datatype attribute string.""" + + def append(self, val) -> None: + "append val to end of LGDOCollection" + self.insert(len(self), val) + + @abstractmethod + def insert(self, i: int, val) -> None: + "insert val into LGDOCollection at position i" + + @abstractmethod + def replace(self, i: int, val) -> None: + "replace item at position i with val in LGDOCollection" + + def clear(self, trim: bool = False) -> None: + "set size of LGDOCollection to zero" + self.resize(0, trim) + + def getattrs(self, datatype: bool = False) -> dict: + """Return a copy of the LGDO attributes dictionary. + + Parameters + ---------- + datatype + if ``False``, remove ``datatype`` attribute from the output + dictionary. + """ + d = dict(self.attrs) + if not datatype: + d.pop("datatype", None) + return d + + def __str__(self) -> str: + return repr(self) + + def __repr__(self) -> str: + return self.__class__.__name__ + f"(attrs={self.attrs!r})" diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index b1bef50a..4fe9103f 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -18,7 +18,7 @@ from .array import Array from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays -from .lgdo import LGDO +from .lgdo import LGDOCollection from .scalar import Scalar from .struct import Struct from .vectorofvectors import VectorOfVectors @@ -26,7 +26,7 @@ log = logging.getLogger(__name__) -class Table(Struct): +class Table(Struct, LGDOCollection): """A special struct of arrays or subtable columns of equal length. Holds onto an internal read/write location ``loc`` that is useful in @@ -42,7 +42,7 @@ class Table(Struct): def __init__( self, - col_dict: Mapping[str, LGDO] | pd.DataFrame | ak.Array | None = None, + col_dict: Mapping[str, LGDOCollection] | pd.DataFrame | ak.Array | None = None, size: int | None = None, attrs: Mapping[str, Any] | None = None, ) -> None: @@ -58,7 +58,7 @@ def __init__( col_dict instantiate this table using the supplied mapping of column names and array-like objects. Supported input types are: mapping of - strings to LGDOs, :class:`pd.DataFrame` and :class:`ak.Array`. + strings to LGDOCollections, :class:`pd.DataFrame` and :class:`ak.Array`. Note 1: no copy is performed, the objects are used directly (unless :class:`ak.Array` is provided). Note 2: if `size` is not ``None``, all arrays will be resized to match it. 
Note 3: if the arrays have @@ -86,13 +86,10 @@ def __init__( if col_dict is not None and len(col_dict) > 0: self.resize(new_size=size, do_warn=(size is None)) - # if no col_dict, just set the size (default to 1024) + # if no col_dict, just set the size else: self.size = size if size is not None else None - # always start at loc=0 - self.loc = 0 - def datatype_name(self) -> str: return "table" @@ -100,18 +97,18 @@ def __len__(self) -> int: """Provides ``__len__`` for this array-like class.""" return self.size - def set_capacity(self, capacity: int | ArrayLike) -> None: + def reserve_capacity(self, capacity: int | ArrayLike) -> None: "Set size (number of rows) of internal memory buffer" if isinstance(capacity, int): for obj in self.values(): - obj.set_capacity(capacity) + obj.reserve_capacity(capacity) else: if len(capacity) != len(self.keys()): msg = "List of capacities must have same length as number of keys" raise ValueError(msg) for obj, cap in zip(self.values(), capacity): - obj.set_capacity(cap) + obj.reserve_capacity(cap) def get_capacity(self) -> int: "Get list of capacities for each key" @@ -150,7 +147,7 @@ def insert(self, i: int, vals: Dict) -> None: ar.insert(i, vals[k]) self.size += 1 - def add_field(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None: + def add_field(self, name: str, obj: LGDOCollection, use_obj_size: bool = False) -> None: """Add a field (column) to the table. Use the name "field" here to match the terminology used in @@ -187,7 +184,7 @@ def add_field(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None: new_size = len(obj) if use_obj_size else self.size self.resize(new_size=new_size) - def add_column(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None: + def add_column(self, name: str, obj: LGDOCollection, use_obj_size: bool = False) -> None: """Alias for :meth:`.add_field` using table terminology 'column'.""" self.add_field(name, obj, use_obj_size=use_obj_size) @@ -218,8 +215,8 @@ def join( set to ``False`` to turn off warnings associated with mismatched `loc` parameter or :meth:`add_column` warnings. """ - if other_table.loc != self.loc and do_warn: - log.warning(f"other_table.loc ({other_table.loc}) != self.loc({self.loc})") + if len(other_table) != len(self) and do_warn: + log.warning(f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})") if cols is None: cols = other_table.keys() for name in cols: diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 985e9427..d28870f0 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -19,12 +19,12 @@ from . import arrayofequalsizedarrays as aoesa from . import vovutils from .array import Array -from .lgdo import LGDO +from .lgdo import LGDOCollection log = logging.getLogger(__name__) -class VectorOfVectors(LGDO): +class VectorOfVectors(LGDOCollection): """A n-dimensional variable-length 1D array of variable-length 1D arrays. If the vector is 2-dimensional, the internal representation is as two NumPy @@ -270,14 +270,14 @@ def __setitem__(self, i: int, new: NDArray) -> None: else: raise NotImplementedError - def set_capacity(self, cap_cl, *cap_args) -> None: + def reserve_capacity(self, cap_cl, *cap_args) -> None: """Set capacity of internal data arrays. Expect number of args to equal `self.n_dim`. First arg is capacity of cumulative length array. If `self.n_dim` is 2, second argument is capacity of flattened data, otherwise arguments are fed recursively to remaining dimensions. 
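(A sketch of this calling convention for a 2-dimensional vector; the capacities are illustrative and not part of the diff.)

    from lgdo import VectorOfVectors

    vov = VectorOfVectors([[1, 2], [3, 4, 5]])
    vov.reserve_capacity(10, 100)  # 10 rows of cumulative_length, 100 entries of flattened_data
    vov.append([6, 7])             # fits within the reserved buffers, no reallocation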
""" - self.cumulative_length.set_capacity(cap_cl) - self.flattened_data.set_capacity(*cap_args) + self.cumulative_length.reserve_capacity(cap_cl) + self.flattened_data.reserve_capacity(*cap_args) def get_capacity(self) -> Tuple[int]: """Get tuple containing capacity of each dimension. First dimension From 8a5dcb27be935b24a4fb11a8eb9cb49b8c08cba3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 13 Oct 2024 21:15:19 +0000 Subject: [PATCH 09/27] style: pre-commit fixes --- src/lgdo/lh5/core.py | 4 ++-- src/lgdo/lh5/store.py | 2 -- src/lgdo/types/array.py | 2 +- src/lgdo/types/encoded.py | 2 +- src/lgdo/types/table.py | 21 ++++++++++++++------- src/lgdo/types/vectorofvectors.py | 27 +++++++++++++++++---------- tests/lh5/test_lh5_store.py | 27 ++++++++------------------- tests/lh5/test_lh5_write.py | 12 +++--------- tests/types/test_table.py | 21 +++++++++++++-------- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/src/lgdo/lh5/core.py b/src/lgdo/lh5/core.py index 3a2c2cb1..b6a52dc4 100644 --- a/src/lgdo/lh5/core.py +++ b/src/lgdo/lh5/core.py @@ -118,7 +118,7 @@ def read( obj_buf.resize(obj_buf_start) else: obj_buf_start = 0 - + for i, h5f in enumerate(lh5_file): if ( isinstance(idx, (list, tuple)) @@ -148,7 +148,7 @@ def read( obj_buf = read( name, h5f, - start_row if i==0 else 0, + start_row if i == 0 else 0, n_rows_i, idx_i, use_h5idx, diff --git a/src/lgdo/lh5/store.py b/src/lgdo/lh5/store.py index a1149e6b..424e230d 100644 --- a/src/lgdo/lh5/store.py +++ b/src/lgdo/lh5/store.py @@ -5,7 +5,6 @@ from __future__ import annotations -import bisect import logging import os import sys @@ -14,7 +13,6 @@ from typing import Any import h5py -import numpy as np from numpy.typing import ArrayLike from .. import types diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index e567977c..c1c720ff 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -149,7 +149,7 @@ def insert(self, i: int, value: int | float) -> None: if i > len(self): msg = f"index {i} is out of bounds for array with size {len(self)}" raise IndexError(msg) - + value = np.array(value) if value.shape == self.shape[1:]: self.resize(len(self) + 1) diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index 15d6dadb..b4caf7f8 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -102,7 +102,7 @@ def get_capacity(self) -> Tuple: def trim_capacity(self) -> None: self.encoded_data.trim_capacity() self.decoded_size.trim_capacity() - + def resize(self, new_size: int) -> None: """Resize vector along the first axis. 
diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 4fe9103f..5fbe7c6b 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -106,21 +106,22 @@ def reserve_capacity(self, capacity: int | ArrayLike) -> None: if len(capacity) != len(self.keys()): msg = "List of capacities must have same length as number of keys" raise ValueError(msg) - + for obj, cap in zip(self.values(), capacity): obj.reserve_capacity(cap) def get_capacity(self) -> int: "Get list of capacities for each key" - return [ v.get_capacity() for v in self.values() ] + return [v.get_capacity() for v in self.values()] def trim_capacity(self) -> int: "Set capacity to be minimum needed to support Array size" for v in self.values(): v.trim_capacity() - - def resize(self, new_size: int | None = None, do_warn: bool = False, trim: bool = False) -> None: + def resize( + self, new_size: int | None = None, do_warn: bool = False, trim: bool = False + ) -> None: # if new_size = None, use the size from the first field for field, obj in self.items(): if new_size is None: @@ -147,7 +148,9 @@ def insert(self, i: int, vals: Dict) -> None: ar.insert(i, vals[k]) self.size += 1 - def add_field(self, name: str, obj: LGDOCollection, use_obj_size: bool = False) -> None: + def add_field( + self, name: str, obj: LGDOCollection, use_obj_size: bool = False + ) -> None: """Add a field (column) to the table. Use the name "field" here to match the terminology used in @@ -184,7 +187,9 @@ def add_field(self, name: str, obj: LGDOCollection, use_obj_size: bool = False) new_size = len(obj) if use_obj_size else self.size self.resize(new_size=new_size) - def add_column(self, name: str, obj: LGDOCollection, use_obj_size: bool = False) -> None: + def add_column( + self, name: str, obj: LGDOCollection, use_obj_size: bool = False + ) -> None: """Alias for :meth:`.add_field` using table terminology 'column'.""" self.add_field(name, obj, use_obj_size=use_obj_size) @@ -216,7 +221,9 @@ def join( `loc` parameter or :meth:`add_column` warnings. """ if len(other_table) != len(self) and do_warn: - log.warning(f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})") + log.warning( + f"len(other_table) ({len(other_table)}) != len(self) ({len(self)})" + ) if cols is None: cols = other_table.keys() for name in cols: diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index d28870f0..5bed511e 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -213,8 +213,10 @@ def __init__( @property def ndim(self): - return 1 + (1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim) - + return 1 + ( + 1 if isinstance(self.flattened_data, Array) else self.flattened_data.ndim + ) + @property def dtype(self) -> np.dtype: return self.flattened_data.dtype @@ -283,7 +285,10 @@ def get_capacity(self) -> Tuple[int]: """Get tuple containing capacity of each dimension. First dimension is cumulative length array. Last dimension is flattened data. """ - return (self.cumulative_length.get_capacity(), *self.flattened_data.get_capacity()) + return ( + self.cumulative_length.get_capacity(), + *self.flattened_data.get_capacity(), + ) def trim_capacity(self) -> None: "Set capacity for all dimensions to minimum needed to hold data" @@ -299,7 +304,7 @@ def resize(self, new_size: int, trim: bool = False) -> None: If `new_size` is larger than the current vector length, `self.cumulative_length` is padded with its last element. This corresponds to appending empty vectors. 
- + If `trim` is ``True``, resize capacity to match new size Examples @@ -327,7 +332,7 @@ def resize(self, new_size: int, trim: bool = False) -> None: # if new_size > size, new elements are filled with zeros, let's fix # that if new_size > old_s: - self.cumulative_length[old_s:] = self.cumulative_length[old_s-1] + self.cumulative_length[old_s:] = self.cumulative_length[old_s - 1] # then resize the data array # if dlen > 0 this has no effect @@ -370,7 +375,7 @@ def insert(self, i: int, new: NDArray) -> None: msg = f"index {i} is out of bounds for vector with size {len(self)}" raise IndexError(msg) - i_start = 0 if i==0 else self.cumulative_length[i-1] + i_start = 0 if i == 0 else self.cumulative_length[i - 1] self.flattened_data.insert(i_start, new) self.cumulative_length.insert(i, i_start) self.cumulative_length[i:] += np.uint32(len(new)) @@ -405,11 +410,13 @@ def replace(self, i: int, new: NDArray) -> None: # move the subsequent entries vidx[i:] += dlen self.flattened_data.resize(vidx[-1]) - self.flattened_data._nda[vidx[i]:vidx[-1]] = self.flattened_data._nda[vidx[i]-dlen:vidx[-1]-dlen] - + self.flattened_data._nda[vidx[i] : vidx[-1]] = self.flattened_data._nda[ + vidx[i] - dlen : vidx[-1] - dlen + ] + # set the already allocated indices - start = vidx[i - 1] if i>0 else 0 - self.flattened_data[start:vidx[i]] = new + start = vidx[i - 1] if i > 0 else 0 + self.flattened_data[start : vidx[i]] = new else: raise NotImplementedError diff --git a/tests/lh5/test_lh5_store.py b/tests/lh5/test_lh5_store.py index 30077dac..711182e5 100644 --- a/tests/lh5/test_lh5_store.py +++ b/tests/lh5/test_lh5_store.py @@ -90,9 +90,7 @@ def test_read_array(lh5_file): def test_read_array_slice(lh5_file): store = lh5.LH5Store() - lh5_obj = store.read( - "/data/struct_full/array", lh5_file, start_row=1, n_rows=3 - ) + lh5_obj = store.read("/data/struct_full/array", lh5_file, start_row=1, n_rows=3) assert isinstance(lh5_obj, types.Array) assert len(lh5_obj) == 3 assert lh5_obj == lgdo.Array([2, 3, 4]) @@ -135,7 +133,8 @@ def test_read_vov(lh5_file): lh5_obj = store.read("/data/struct/vov", [lh5_file, lh5_file]) assert len(lh5_obj) == 6 assert lh5_obj == lgdo.VectorOfVectors( - [[3, 4, 5], [2], [4, 8, 9, 7], [3, 4, 5], [2], [4, 8, 9, 7]], attrs={"myattr": 2} + [[3, 4, 5], [2], [4, 8, 9, 7], [3, 4, 5], [2], [4, 8, 9, 7]], + attrs={"myattr": 2}, ) with h5py.File(lh5_file) as h5f: @@ -188,9 +187,7 @@ def test_read_voev(lh5_file): assert len(lh5_obj) == 3 - lh5_obj = store.read( - "/data/struct/voev", [lh5_file, lh5_file], decompress=False - ) + lh5_obj = store.read("/data/struct/voev", [lh5_file, lh5_file], decompress=False) assert isinstance(lh5_obj, types.VectorOfEncodedVectors) assert len(lh5_obj) == 6 @@ -344,14 +341,10 @@ def test_read_with_field_mask(lh5_file): lh5_obj = store.read("/data/struct_full", lh5_file, field_mask=["array"]) assert list(lh5_obj.keys()) == ["array"] - lh5_obj = store.read( - "/data/struct_full", lh5_file, field_mask=("array", "table") - ) + lh5_obj = store.read("/data/struct_full", lh5_file, field_mask=("array", "table")) assert sorted(lh5_obj.keys()) == ["array", "table"] - lh5_obj = store.read( - "/data/struct_full", lh5_file, field_mask={"array": True} - ) + lh5_obj = store.read("/data/struct_full", lh5_file, field_mask={"array": True}) assert list(lh5_obj.keys()) == ["array"] lh5_obj = store.read( @@ -384,9 +377,7 @@ def test_read_lgnd_array(lgnd_file): def test_read_lgnd_array_fancy_idx(lgnd_file): store = lh5.LH5Store() - lh5_obj = store.read( - "/geds/raw/baseline", lgnd_file, 
idx=[2, 4, 6, 9, 11, 16, 68] - ) + lh5_obj = store.read("/geds/raw/baseline", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68]) assert isinstance(lh5_obj, types.Array) assert len(lh5_obj) == 7 assert (lh5_obj.nda == [13508, 14353, 14525, 14341, 15079, 11675, 13995]).all() @@ -403,9 +394,7 @@ def test_read_lgnd_vov(lgnd_file): def test_read_lgnd_vov_fancy_idx(lgnd_file): store = lh5.LH5Store() - lh5_obj = store.read( - "/geds/raw/tracelist", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68] - ) + lh5_obj = store.read("/geds/raw/tracelist", lgnd_file, idx=[2, 4, 6, 9, 11, 16, 68]) assert isinstance(lh5_obj, types.VectorOfVectors) assert len(lh5_obj) == 7 assert (lh5_obj.cumulative_length.nda == [1, 2, 3, 4, 5, 6, 7]).all() diff --git a/tests/lh5/test_lh5_write.py b/tests/lh5/test_lh5_write.py index 97065552..28a5bd2e 100644 --- a/tests/lh5/test_lh5_write.py +++ b/tests/lh5/test_lh5_write.py @@ -255,9 +255,7 @@ def test_write_object_overwrite_lgdo(caplog, tmptestdir): ) # Now, check that the data were overwritten - array_dat = store.read( - "my_array", f"{tmptestdir}/write_object_overwrite_test.lh5" - ) + array_dat = store.read("my_array", f"{tmptestdir}/write_object_overwrite_test.lh5") expected_out_array = np.append(np.zeros(5), np.ones(20)) assert np.array_equal(array_dat.nda, expected_out_array) @@ -440,9 +438,7 @@ def test_write_histogram(caplog, tmptestdir): ) # Now, check that the data were overwritten - h3 = store.read( - "my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5" - ) + h3 = store.read("my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5") assert np.array_equal(h3.weights.nda, np.array([[10, 10], [10, 10]])) assert h3.binning[0].edges[0] == 2 assert h3.binning[1].edges[-1] == 7 @@ -508,9 +504,7 @@ def test_write_histogram_variable(caplog, tmptestdir): ) # Now, check that the data were overwritten - h3 = store.read( - "my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5" - ) + h3 = store.read("my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5") assert np.array_equal(h3.weights.nda, np.array([[10, 10], [10, 10]])) assert np.array_equal(h3.binning[0].edges, np.array([2, 3.5, 4])) with pytest.raises(TypeError): diff --git a/tests/types/test_table.py b/tests/types/test_table.py index cfdbe23d..06e70a83 100644 --- a/tests/types/test_table.py +++ b/tests/types/test_table.py @@ -86,10 +86,13 @@ def test_append(): tbl = Table(col_dict=col_dict) tbl.append({"a": -1, "b": -1}) assert len(tbl) == 5 - assert tbl == Table( { - "a": lgdo.Array(nda=np.array([1, 2, 3, 4, -1])), - "b": lgdo.Array(nda=np.array([5, 6, 7, 8, -1])), - } ) + assert tbl == Table( + { + "a": lgdo.Array(nda=np.array([1, 2, 3, 4, -1])), + "b": lgdo.Array(nda=np.array([5, 6, 7, 8, -1])), + } + ) + def test_insert(): col_dict = { @@ -100,10 +103,12 @@ def test_insert(): tbl = Table(col_dict=col_dict) tbl.insert(1, {"a": -1, "b": -1}) assert len(tbl) == 5 - assert tbl == Table( { - "a": lgdo.Array(nda=np.array([1, -1, 2, 3, 4])), - "b": lgdo.Array(nda=np.array([5, -1, 6, 7, 8])), - } ) + assert tbl == Table( + { + "a": lgdo.Array(nda=np.array([1, -1, 2, 3, 4])), + "b": lgdo.Array(nda=np.array([5, -1, 6, 7, 8])), + } + ) def test_add_field(): From 8dd3d75a20630dc637e48de43d61ab56e78c76f1 Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 14:24:43 -0700 Subject: [PATCH 10/27] Appease the pre-commit bot --- src/lgdo/lh5/core.py | 5 ++--- src/lgdo/types/encoded.py | 6 +++--- src/lgdo/types/table.py | 8 ++++---- src/lgdo/types/vectorofvectors.py | 2 +- tests/compression/conftest.py | 
3 +-- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/lgdo/lh5/core.py b/src/lgdo/lh5/core.py index b6a52dc4..4acd3ebc 100644 --- a/src/lgdo/lh5/core.py +++ b/src/lgdo/lh5/core.py @@ -4,6 +4,7 @@ import inspect import sys from collections.abc import Mapping, Sequence +from contextlib import suppress from typing import Any import h5py @@ -177,10 +178,8 @@ def read( obj_buf_start=obj_buf_start, decompress=decompress, ) - try: + with suppress(AttributeError): obj.resize(obj_buf_start + n_rows_read) - except AttributeError: - pass return obj diff --git a/src/lgdo/types/encoded.py b/src/lgdo/types/encoded.py index b4caf7f8..fe7b522e 100644 --- a/src/lgdo/types/encoded.py +++ b/src/lgdo/types/encoded.py @@ -96,7 +96,7 @@ def reserve_capacity(self, *capacity: int) -> None: self.encoded_data.reserve_capacity(*capacity) self.decoded_size.reserve_capacity(capacity[0]) - def get_capacity(self) -> Tuple: + def get_capacity(self) -> tuple: return (self.decoded_size.get_capacity, *self.encoded_data.get_capacity()) def trim_capacity(self) -> None: @@ -348,7 +348,7 @@ def __eq__(self, other: ArrayOfEncodedEqualSizedArrays) -> bool: def reserve_capacity(self, *capacity: int) -> None: self.encoded_data.reserve_capacity(capacity) - def get_capacity(self) -> Tuple: + def get_capacity(self) -> tuple: return self.encoded_data.get_capacity() def trim_capacity(self) -> None: @@ -361,7 +361,7 @@ def resize(self, new_size: int, trim: bool = False) -> None: -------- .VectorOfVectors.resize """ - self.encoded_data.resize(new_size) + self.encoded_data.resize(new_size, trim) def append(self, value: NDArray) -> None: """Append a 1D encoded array at the end. diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 5fbe7c6b..43e7347d 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -18,7 +18,7 @@ from .array import Array from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays -from .lgdo import LGDOCollection +from .lgdo import LGDO, LGDOCollection from .scalar import Scalar from .struct import Struct from .vectorofvectors import VectorOfVectors @@ -97,7 +97,7 @@ def __len__(self) -> int: """Provides ``__len__`` for this array-like class.""" return self.size - def reserve_capacity(self, capacity: int | ArrayLike) -> None: + def reserve_capacity(self, capacity: int | list) -> None: "Set size (number of rows) of internal memory buffer" if isinstance(capacity, int): for obj in self.values(): @@ -138,11 +138,11 @@ def resize( obj.resize(new_size, trim) self.size = new_size - def append(self, vals: Dict) -> None: + def append(self, vals: dict) -> None: "Append vals to end of table. Vals is a mapping from table key to val" self.insert(len(self), vals) - def insert(self, i: int, vals: Dict) -> None: + def insert(self, i: int, vals: dict) -> None: "Insert vals into table at row i. Vals is a mapping from table key to val" for k, ar in self.items(): ar.insert(i, vals[k]) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 5bed511e..51675631 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -281,7 +281,7 @@ def reserve_capacity(self, cap_cl, *cap_args) -> None: self.cumulative_length.reserve_capacity(cap_cl) self.flattened_data.reserve_capacity(*cap_args) - def get_capacity(self) -> Tuple[int]: + def get_capacity(self) -> tuple[int]: """Get tuple containing capacity of each dimension. First dimension is cumulative length array. Last dimension is flattened data. 
""" diff --git a/tests/compression/conftest.py b/tests/compression/conftest.py index cb96d622..75ab953a 100644 --- a/tests/compression/conftest.py +++ b/tests/compression/conftest.py @@ -8,8 +8,7 @@ @pytest.fixture() def wftable(lgnd_test_data): store = lh5.LH5Store() - wft = store.read( + return store.read( "/geds/raw/waveform", lgnd_test_data.get_path("lh5/LDQTA_r117_20200110T105115Z_cal_geds_raw.lh5"), ) - return wft From 5a2e402cfd5bdd1ed263ff08408fd61f118ff4c4 Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 17:46:40 -0700 Subject: [PATCH 11/27] Fixed tutorial --- docs/source/notebooks/LH5Files.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/notebooks/LH5Files.ipynb b/docs/source/notebooks/LH5Files.ipynb index 7cb97cf3..390d0efb 100644 --- a/docs/source/notebooks/LH5Files.ipynb +++ b/docs/source/notebooks/LH5Files.ipynb @@ -189,8 +189,8 @@ "source": [ "from lgdo.lh5 import LH5Iterator\n", "\n", - "for lh5_obj, entry, n_rows in LH5Iterator(lh5_file, \"geds/raw/energy\", buffer_len=20):\n", - " print(f\"entry {entry}, energy = {lh5_obj} ({n_rows} rows)\")" + "for lh5_obj, entry in LH5Iterator(lh5_file, \"geds/raw/energy\", buffer_len=20):\n", + " print(f\"entry {entry}, energy = {lh5_obj} ({len(lh5_obj)} rows)\")" ] }, { @@ -211,7 +211,7 @@ "from lgdo.lh5 import LH5Store\n", "\n", "store = LH5Store(keep_open=True) # with keep_open=True, files are kept open inside the store\n", - "store.read(\"geds/raw\", lh5_file) # returns a tuple: (obj, n_rows_read)" + "store.read(\"geds/raw\", lh5_file)\n" ] }, { From 0a24cf998015bc944bbd192c444e6fcbdd15419f Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 17:56:38 -0700 Subject: [PATCH 12/27] Fixed docstring error --- src/lgdo/lh5/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/lh5/core.py b/src/lgdo/lh5/core.py index 4acd3ebc..c4e76296 100644 --- a/src/lgdo/lh5/core.py +++ b/src/lgdo/lh5/core.py @@ -107,7 +107,7 @@ def read( Returns ------- object - `the read-out object + the read-out object """ if isinstance(lh5_file, h5py.File): lh5_obj = lh5_file[name] From 0fb6adf2e86a87b1521468302a2e83b8a19be785 Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 21:29:52 -0700 Subject: [PATCH 13/27] Added tests for capacity and fixed bugs --- src/lgdo/types/array.py | 4 ++-- src/lgdo/types/lgdo.py | 22 +--------------------- src/lgdo/types/table.py | 11 ++--------- src/lgdo/types/vectorofvectors.py | 11 +++++++---- tests/types/test_table.py | 28 ++++++++++++++++++++++++++++ tests/types/test_vectorofvectors.py | 22 +++++++++++++++++++++- 6 files changed, 61 insertions(+), 37 deletions(-) diff --git a/src/lgdo/types/array.py b/src/lgdo/types/array.py index c1c720ff..51540396 100644 --- a/src/lgdo/types/array.py +++ b/src/lgdo/types/array.py @@ -131,6 +131,8 @@ def resize(self, new_size: int, trim=False) -> None: increased to accommodate new rows; in this case double capacity. 
If trim is True, capacity will be set to match size.""" + self._size = new_size + if trim and new_size != self.get_capacity: self.reserve_capacity(new_size) @@ -138,8 +140,6 @@ def resize(self, new_size: int, trim=False) -> None: if new_size > self.get_capacity(): self.reserve_capacity(int(2 ** (np.ceil(np.log2(new_size))))) - self._size = new_size - def append(self, value: np.ndarray) -> None: "Append value to end of array (with copy)" self.insert(len(self), value) diff --git a/src/lgdo/types/lgdo.py b/src/lgdo/types/lgdo.py index 18c7e7a4..be8a9c85 100644 --- a/src/lgdo/types/lgdo.py +++ b/src/lgdo/types/lgdo.py @@ -135,24 +135,4 @@ def replace(self, i: int, val) -> None: def clear(self, trim: bool = False) -> None: "set size of LGDOCollection to zero" - self.resize(0, trim) - - def getattrs(self, datatype: bool = False) -> dict: - """Return a copy of the LGDO attributes dictionary. - - Parameters - ---------- - datatype - if ``False``, remove ``datatype`` attribute from the output - dictionary. - """ - d = dict(self.attrs) - if not datatype: - d.pop("datatype", None) - return d - - def __str__(self) -> str: - return repr(self) - - def __repr__(self) -> str: - return self.__class__.__name__ + f"(attrs={self.attrs!r})" + self.resize(0, trim=trim) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 43e7347d..58689249 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -29,10 +29,6 @@ class Table(Struct, LGDOCollection): """A special struct of arrays or subtable columns of equal length. - Holds onto an internal read/write location ``loc`` that is useful in - managing table I/O using functions like :meth:`push_row`, :meth:`is_full`, - and :meth:`clear`. - Note ---- If you write to a table and don't fill it up to its total size, be sure to @@ -78,7 +74,8 @@ def __init__( col_dict = _ak_to_lgdo_or_col_dict(col_dict) # call Struct constructor - super().__init__(obj_dict=col_dict, attrs=attrs) + Struct.__init__(self, obj_dict=col_dict) + LGDOCollection.__init__(self, attrs=attrs) # if col_dict is not empty, set size according to it # if size is also supplied, resize all fields to match it @@ -138,10 +135,6 @@ def resize( obj.resize(new_size, trim) self.size = new_size - def append(self, vals: dict) -> None: - "Append vals to end of table. Vals is a mapping from table key to val" - self.insert(len(self), vals) - def insert(self, i: int, vals: dict) -> None: "Insert vals into table at row i. Vals is a mapping from table key to val" for k, ar in self.items(): diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 51675631..513804d0 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -285,10 +285,11 @@ def get_capacity(self) -> tuple[int]: """Get tuple containing capacity of each dimension. First dimension is cumulative length array. Last dimension is flattened data. 
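(A sketch of the resulting return values, assuming nested-list construction as in the tests added by this patch.)

    from lgdo import VectorOfVectors

    v2 = VectorOfVectors([[1, 2], [3]])           # flattened_data is an Array
    print(v2.get_capacity())                      # 2-tuple: (cumulative_length cap, flattened_data cap)

    v3 = VectorOfVectors([[[1], [2, 3]], [[4]]])  # flattened_data is itself a VectorOfVectors
    print(v3.get_capacity())                      # 3-tuple, one entry per dimension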
""" - return ( - self.cumulative_length.get_capacity(), - *self.flattened_data.get_capacity(), - ) + fd_cap = self.flattened_data.get_capacity() + if isinstance(fd_cap, int): + return (self.cumulative_length.get_capacity(), fd_cap) + else: + return (self.cumulative_length.get_capacity(), *fd_cap) def trim_capacity(self) -> None: "Set capacity for all dimensions to minimum needed to hold data" @@ -338,6 +339,8 @@ def resize(self, new_size: int, trim: bool = False) -> None: # if dlen > 0 this has no effect if len(self.cumulative_length) > 0: self.flattened_data.resize(self.cumulative_length[-1], trim) + else: + self.flattened_data.resize(0, trim) def append(self, new: NDArray) -> None: """Append a 1D vector `new` at the end. diff --git a/tests/types/test_table.py b/tests/types/test_table.py index 06e70a83..f5ec04f3 100644 --- a/tests/types/test_table.py +++ b/tests/types/test_table.py @@ -76,6 +76,34 @@ def test_datatype_name(): tbl = Table() assert tbl.datatype_name() == "table" +def test_resize_and_capacity(): + col_dict = { + "a": lgdo.Array(nda=np.array([1, 2, 3, 4])), + "b": lgdo.Array(nda=np.array([5, 6, 7, 8])), + } + tbl = Table(col_dict=col_dict) + + assert(len(tbl) == 4) + assert(tbl.get_capacity() == [4, 4]) + + tbl.reserve_capacity([5, 7]) + assert(len(tbl) == 4) + assert(tbl.get_capacity() == [5, 7]) + + tbl.resize(6) + assert(len(tbl) == 6) + assert(tbl.get_capacity()[0] >= 6 and tbl.get_capacity()[1] == 7) + + tbl.trim_capacity() + assert(len(tbl) == 6) + assert(tbl.get_capacity() == [6, 6]) + + with pytest.raises(ValueError): + tbl.reserve_capacity(3) + + tbl.clear(trim=True) + assert(len(tbl) == 0) + assert(tbl.get_capacity() == [0, 0]) def test_append(): col_dict = { diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 8357a5c5..7df645ea 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -173,25 +173,45 @@ def test_getitem(testvov): assert np.array_equal(v[-1], [1, 2]) -def test_resize(testvov): +def test_resize_and_capacity(testvov): vov = testvov.v2d + assert vov.get_capacity() == (5, 13) + vov.resize(3) assert ak.is_valid(vov.view_as("ak")) + assert vov.get_capacity() == (5, 13) assert len(vov.cumulative_length) == 3 assert len(vov.flattened_data) == vov.cumulative_length[-1] assert vov == VectorOfVectors([[1, 2], [3, 4, 5], [2]]) + vov.trim_capacity() + assert ak.is_valid(vov.view_as("ak")) + assert vov.get_capacity() == (3, 6) + assert len(vov.cumulative_length) == 3 + assert len(vov.flattened_data) == vov.cumulative_length[-1] + assert vov == VectorOfVectors([[1, 2], [3, 4, 5], [2]]) + + vov.reserve_capacity(5, 10) vov.resize(5) assert ak.is_valid(vov.view_as("ak")) + assert vov.get_capacity()[0] >= 5 and vov.get_capacity()[1] >= 7 assert len(vov) == 5 assert len(vov[3]) == 0 assert len(vov[4]) == 0 assert vov == VectorOfVectors([[1, 2], [3, 4, 5], [2], [], []]) + vov.clear(trim=True) + assert ak.is_valid(vov.view_as("ak")) + assert vov.get_capacity() == (0, 0) + assert len(vov) == 0 + vov = testvov.v3d + assert vov.get_capacity() == (3, 5, 13) + vov.resize(3) + assert vov.get_capacity() == (3, 5, 13) assert ak.is_valid(vov.view_as("ak")) assert len(vov.cumulative_length) == 3 assert len(vov.flattened_data) == vov.cumulative_length[-1] From 03f5ce7cda9e8ed399214f0845b772b9994816ef Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 14 Oct 2024 04:30:16 +0000 Subject: [PATCH 14/27] style: pre-commit fixes --- 
tests/types/test_table.py | 24 +++++++++++++----------- tests/types/test_vectorofvectors.py | 6 +++--- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tests/types/test_table.py b/tests/types/test_table.py index f5ec04f3..eabbb525 100644 --- a/tests/types/test_table.py +++ b/tests/types/test_table.py @@ -76,6 +76,7 @@ def test_datatype_name(): tbl = Table() assert tbl.datatype_name() == "table" + def test_resize_and_capacity(): col_dict = { "a": lgdo.Array(nda=np.array([1, 2, 3, 4])), @@ -83,27 +84,28 @@ def test_resize_and_capacity(): } tbl = Table(col_dict=col_dict) - assert(len(tbl) == 4) - assert(tbl.get_capacity() == [4, 4]) + assert len(tbl) == 4 + assert tbl.get_capacity() == [4, 4] tbl.reserve_capacity([5, 7]) - assert(len(tbl) == 4) - assert(tbl.get_capacity() == [5, 7]) + assert len(tbl) == 4 + assert tbl.get_capacity() == [5, 7] tbl.resize(6) - assert(len(tbl) == 6) - assert(tbl.get_capacity()[0] >= 6 and tbl.get_capacity()[1] == 7) + assert len(tbl) == 6 + assert tbl.get_capacity()[0] >= 6 and tbl.get_capacity()[1] == 7 tbl.trim_capacity() - assert(len(tbl) == 6) - assert(tbl.get_capacity() == [6, 6]) + assert len(tbl) == 6 + assert tbl.get_capacity() == [6, 6] with pytest.raises(ValueError): tbl.reserve_capacity(3) - + tbl.clear(trim=True) - assert(len(tbl) == 0) - assert(tbl.get_capacity() == [0, 0]) + assert len(tbl) == 0 + assert tbl.get_capacity() == [0, 0] + def test_append(): col_dict = { diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 7df645ea..15a69390 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -177,7 +177,7 @@ def test_resize_and_capacity(testvov): vov = testvov.v2d assert vov.get_capacity() == (5, 13) - + vov.resize(3) assert ak.is_valid(vov.view_as("ak")) assert vov.get_capacity() == (5, 13) @@ -195,7 +195,7 @@ def test_resize_and_capacity(testvov): vov.reserve_capacity(5, 10) vov.resize(5) assert ak.is_valid(vov.view_as("ak")) - assert vov.get_capacity()[0] >= 5 and vov.get_capacity()[1] >= 7 + assert vov.get_capacity()[0] >= 5 and vov.get_capacity()[1] >= 7 assert len(vov) == 5 assert len(vov[3]) == 0 assert len(vov[4]) == 0 @@ -205,7 +205,7 @@ def test_resize_and_capacity(testvov): assert ak.is_valid(vov.view_as("ak")) assert vov.get_capacity() == (0, 0) assert len(vov) == 0 - + vov = testvov.v3d assert vov.get_capacity() == (3, 5, 13) From a1cf2b2113014ca8092fb26ea50506732cae4fc2 Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 21:36:15 -0700 Subject: [PATCH 15/27] Appease pre-commit bot --- src/lgdo/types/vectorofvectors.py | 3 +-- tests/types/test_table.py | 3 ++- tests/types/test_vectorofvectors.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index 513804d0..3fa1d59a 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -288,8 +288,7 @@ def get_capacity(self) -> tuple[int]: fd_cap = self.flattened_data.get_capacity() if isinstance(fd_cap, int): return (self.cumulative_length.get_capacity(), fd_cap) - else: - return (self.cumulative_length.get_capacity(), *fd_cap) + return (self.cumulative_length.get_capacity(), *fd_cap) def trim_capacity(self) -> None: "Set capacity for all dimensions to minimum needed to hold data" diff --git a/tests/types/test_table.py b/tests/types/test_table.py index eabbb525..d60f00d9 100644 --- a/tests/types/test_table.py +++ b/tests/types/test_table.py @@ -93,7 +93,8 @@ def 
test_resize_and_capacity(): tbl.resize(6) assert len(tbl) == 6 - assert tbl.get_capacity()[0] >= 6 and tbl.get_capacity()[1] == 7 + assert tbl.get_capacity()[0] >= 6 + assert tbl.get_capacity()[1] == 7 tbl.trim_capacity() assert len(tbl) == 6 diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index 15a69390..c28dee56 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -195,7 +195,8 @@ def test_resize_and_capacity(testvov): vov.reserve_capacity(5, 10) vov.resize(5) assert ak.is_valid(vov.view_as("ak")) - assert vov.get_capacity()[0] >= 5 and vov.get_capacity()[1] >= 7 + assert vov.get_capacity()[0] >= 5 + assert vov.get_capacity()[1] >= 7 assert len(vov) == 5 assert len(vov[3]) == 0 assert len(vov[4]) == 0 From 0a0bffb70163f82968332e88e5a4a41bafbabfe1 Mon Sep 17 00:00:00 2001 From: iguinn Date: Sun, 13 Oct 2024 21:44:44 -0700 Subject: [PATCH 16/27] Improve test coverage --- tests/types/test_array.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tests/types/test_array.py b/tests/types/test_array.py index b055aa34..26dcd51f 100644 --- a/tests/types/test_array.py +++ b/tests/types/test_array.py @@ -26,17 +26,45 @@ def test_init(): assert array.attrs == attrs | {"datatype": "array<1>{real}"} -def test_resize(): +def test_resize_and_capacity(): array = Array(nda=np.array([1, 2, 3, 4])) + assert array.get_capacity() == 4 + array.resize(3) + assert array.get_capacity() == 4 assert (array.nda == np.array([1, 2, 3])).all() + array.resize(5) + assert array.get_capacity() >= 5 + + array.clear(trim=True) + assert array.get_capacity() == 0 + assert len(array) == 0 + def test_insert(): a = Array(np.array([1, 2, 3, 4])) a.insert(2, [-1, -1]) assert a == Array([1, 2, -1, -1, 3, 4]) + with pytest.raises(IndexError): + a.insert(10, 10) + + +def test_append(): + a = Array(np.array([1, 2, 3, 4])) + a.append(-1) + assert a == Array([1, 2, 3, 4, -1]) + + +def test_replace(): + a = Array(np.array([1, 2, 3, 4])) + a.replace(2, -1) + assert a == Array([1, 2, -1, 4]) + + with pytest.raises(IndexError): + a.replace(10, 10) + def test_view(): a = Array(np.array([1, 2, 3, 4]), attrs={"units": "m"}) From ae4979f818ab010b49e88b5546b012c1c0e45ba5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:25:02 +0000 Subject: [PATCH 17/27] style: pre-commit fixes --- docs/source/notebooks/LH5Files.ipynb | 6 ++++-- tests/lh5/test_lh5_store.py | 4 +--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/notebooks/LH5Files.ipynb b/docs/source/notebooks/LH5Files.ipynb index a9a24dcb..a03e29b0 100644 --- a/docs/source/notebooks/LH5Files.ipynb +++ b/docs/source/notebooks/LH5Files.ipynb @@ -212,8 +212,10 @@ "source": [ "from lgdo.lh5 import LH5Store\n", "\n", - "store = LH5Store(keep_open=True) # with keep_open=True, files are kept open inside the store\n", - "store.read(\"geds/raw\", lh5_file)\n" + "store = LH5Store(\n", + " keep_open=True\n", + ") # with keep_open=True, files are kept open inside the store\n", + "store.read(\"geds/raw\", lh5_file)" ] }, { diff --git a/tests/lh5/test_lh5_store.py b/tests/lh5/test_lh5_store.py index 87c033ca..f46a5d74 100644 --- a/tests/lh5/test_lh5_store.py +++ b/tests/lh5/test_lh5_store.py @@ -134,9 +134,7 @@ def test_read_array_fancy_idx(lh5_file): assert lh5_obj == lgdo.Array([1, 4, 5, 1, 4, 5]) # Test with out of range index - lh5_obj = store.read( - 
"/data/struct_full/array", lh5_file, idx=[0, 3, 4, 100] - ) + lh5_obj = store.read("/data/struct_full/array", lh5_file, idx=[0, 3, 4, 100]) assert isinstance(lh5_obj, types.Array) assert len(lh5_obj) == 3 assert lh5_obj == lgdo.Array([1, 4, 5]) From 7afab7228f1fa19b29ef4cbbde8d833ebed16c11 Mon Sep 17 00:00:00 2001 From: iguinn Date: Mon, 25 Nov 2024 12:24:59 -0800 Subject: [PATCH 18/27] Fixed test --- tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 36bce2d8..c5bd80dc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -164,6 +164,6 @@ def test_lh5concat(lgnd_test_data, tmptestdir): outfile = f"{tmptestdir}/concat_test_struct_out.lh5" cli.lh5concat(["--output", outfile, "--", infile1, infile2]) - out_stp = store.read("stp", outfile)[0] + out_stp = store.read("stp", outfile) assert out_stp.attrs["datatype"] == "struct{x}" assert np.all(out_stp.x["col"].nda == np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])) From 74cb2817e7db0bb4c7bb4b06212e0428a243877e Mon Sep 17 00:00:00 2001 From: iguinn Date: Tue, 26 Nov 2024 20:01:01 -0800 Subject: [PATCH 19/27] When filling VoV from AoesA, if length of V is longer than A use filler value --- src/lgdo/types/vectorofvectors.py | 10 +++++++++- src/lgdo/types/vovutils.py | 18 ++++++++++++++---- tests/types/test_vectorofvectors.py | 11 ++++++++++- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/lgdo/types/vectorofvectors.py b/src/lgdo/types/vectorofvectors.py index a27461ce..5e7203c6 100644 --- a/src/lgdo/types/vectorofvectors.py +++ b/src/lgdo/types/vectorofvectors.py @@ -468,7 +468,15 @@ def _set_vector_unsafe( cum_lens = np.add(start, lens.cumsum(), dtype=int) # fill with fast vectorized routine - vovutils._nb_fill(vec, lens, self.flattened_data.nda[start : cum_lens[-1]]) + if np.issubdtype(self.flattened_data.dtype, np.unsignedinteger): + nan_val = np.iinfo(self.flattened_data.dtype).max + if np.issubdtype(self.flattened_data.dtype, np.integer): + nan_val = np.iinfo(self.flattened_data.dtype).min + else: + nan_val = np.nan + vovutils._nb_fill( + vec, lens, nan_val, self.flattened_data.nda[start : cum_lens[-1]] + ) # add new vector(s) length to cumulative_length self.cumulative_length[i : i + len(lens)] = cum_lens diff --git a/src/lgdo/types/vovutils.py b/src/lgdo/types/vovutils.py index c3862eec..abae760b 100644 --- a/src/lgdo/types/vovutils.py +++ b/src/lgdo/types/vovutils.py @@ -81,7 +81,7 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND @numba.guvectorize( [ - f"{data_type}[:,:],{size_type}[:],{data_type}[:]" + f"{data_type}[:,:],{size_type}[:],{data_type},{data_type}[:]" for data_type in [ "b1", "i1", @@ -99,10 +99,12 @@ def _nb_build_cl(sorted_array_in: NDArray, cumulative_length_out: NDArray) -> ND ] for size_type in ["i4", "i8", "u4", "u8"] ], - "(l,m),(l),(n)", + "(l,m),(l),(),(n)", **nb_kwargs, ) -def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray): +def _nb_fill( + aoa_in: NDArray, len_in: NDArray, nan_val: int | float, flattened_array_out: NDArray +): """Vectorized function to fill flattened array from array of arrays and lengths. Values in aoa_in past lengths will not be copied. @@ -112,6 +114,9 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray): array of arrays containing values to be copied len_in array of vector lengths for each row of aoa_in + nan_val + value to use when len_in is longer than aoa_in. Should use + np.nan for floating point, and 0xfff... 
for integer types flattened_array_out flattened array to copy values into. Must be longer than sum of lengths in len_in @@ -122,9 +127,14 @@ def _nb_fill(aoa_in: NDArray, len_in: NDArray, flattened_array_out: NDArray): raise ValueError(msg) start = 0 + max_len = aoa_in.shape[1] for i, ll in enumerate(len_in): stop = start + ll - flattened_array_out[start:stop] = aoa_in[i, :ll] + if ll > max_len: + flattened_array_out[start : start + max_len] = aoa_in[i, :] + flattened_array_out[start + max_len : stop] = nan_val + else: + flattened_array_out[start:stop] = aoa_in[i, :ll] start = stop diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index c03c1f24..f1b2e1b1 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -389,7 +389,7 @@ def test_set_vector_unsafe(testvov): np.array([4, 8, 9, 7], dtype=testvov.dtype), np.array([5, 3, 1], dtype=testvov.dtype), ] - desired_aoa = np.zeros(shape=(5, 5), dtype=testvov.dtype) + desired_aoa = np.zeros(shape=(5, 4), dtype=testvov.dtype) desired_lens = np.array([len(arr) for arr in desired]) # test sequential filling @@ -404,6 +404,15 @@ def test_set_vector_unsafe(testvov): third_vov._set_vector_unsafe(0, desired_aoa, desired_lens) assert testvov == third_vov + # test vectorized filling when len is longer than array + fourth_vov = lgdo.VectorOfVectors(shape_guess=(5, 5), dtype=testvov.dtype) + desired_lens[3] = 10 + fourth_vov._set_vector_unsafe(0, desired_aoa, desired_lens) + exp_entry_w_overflow = np.concatenate( + [desired[3], np.array([np.iinfo(testvov.dtype).min] * 6)] + ) + assert np.all(fourth_vov[3] == exp_entry_w_overflow) + def test_iter(testvov): testvov = testvov.v2d From 51137a72acd5f6e93433e01bf98e237d368b0dcf Mon Sep 17 00:00:00 2001 From: iguinn Date: Fri, 20 Dec 2024 10:31:58 -0800 Subject: [PATCH 20/27] Do not return current_i_entry when iterating --- src/lgdo/lh5/iterator.py | 8 +++++--- tests/lh5/test_lh5_iterator.py | 14 ++++++++------ tests/types/test_vectorofvectors.py | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/lgdo/lh5/iterator.py b/src/lgdo/lh5/iterator.py index 0bae1d01..83793bc8 100644 --- a/src/lgdo/lh5/iterator.py +++ b/src/lgdo/lh5/iterator.py @@ -25,7 +25,7 @@ class LH5Iterator(typing.Iterator): This can be used as an iterator: - >>> for lh5_obj, entry in LH5Iterator(...): + >>> for lh5_obj in LH5Iterator(...): >>> # do the thing! This is intended for if you are reading a large quantity of data. This @@ -43,6 +43,8 @@ class LH5Iterator(typing.Iterator): In addition to accessing requested data via ``lh5_obj``, several properties exist to tell you where that data came from: + - lh5_it.current_i_entry: get the index within the entry list of the + first entry that is currently read - lh5_it.current_local_entries: get the entry numbers relative to the file the data came from - lh5_it.current_global_entries: get the entry number relative to the @@ -52,7 +54,7 @@ class LH5Iterator(typing.Iterator): This class can also be used for random access: - >>> lh5_obj, n_rows = lh5_it.read(i_entry) + >>> lh5_obj = lh5_it.read(i_entry) to read the block of entries starting at i_entry. 
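(A sketch of the iteration pattern after this change; the file and group names are hypothetical.)

    from lgdo.lh5 import LH5Iterator

    it = LH5Iterator("file.lh5", "geds/raw/energy", buffer_len=100)
    for lh5_obj in it:           # the loop now yields only the buffer
        i0 = it.current_i_entry  # the block's first entry is queried from the iterator
        print(i0, len(lh5_obj))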
In case of multiple files or the use of an event selection, i_entry refers to a global event index @@ -492,4 +494,4 @@ def __next__(self) -> tuple[LGDO, int, int]: if len(buf) == 0: raise StopIteration self.next_i_entry = self.current_i_entry + len(buf) - return (buf, self.current_i_entry) + return buf diff --git a/tests/lh5/test_lh5_iterator.py b/tests/lh5/test_lh5_iterator.py index a5f2f689..273612cf 100644 --- a/tests/lh5/test_lh5_iterator.py +++ b/tests/lh5/test_lh5_iterator.py @@ -31,7 +31,8 @@ def test_basics(lgnd_file): lh5_obj["baseline"].nda == np.array([14353, 14254, 14525, 11656, 13576]) ).all() - for lh5_obj, entry in lh5_it: + for lh5_obj in lh5_it: + entry = lh5_it.current_i_entry assert len(lh5_obj) == 5 assert entry % 5 == 0 assert all(lh5_it.current_local_entries == np.arange(entry, entry + 5)) @@ -161,8 +162,9 @@ def test_iterate(more_lgnd_files): ], ] - for lh5_out, entry in lh5_it: + for lh5_out in lh5_it: assert set(lh5_out.keys()) == {"is_valid_0vbb", "timestamp", "zacEmax_ctc_cal"} + entry = lh5_it.current_i_entry assert entry % 5 == 0 assert len(lh5_out) == 5 assert all(lh5_it.current_local_entries == exp_loc_entries[entry // 5]) @@ -178,9 +180,9 @@ def test_iterate(more_lgnd_files): buffer_len=5, ) - for lh5_out, entry in lh5_it: + for lh5_out in lh5_it: assert set(lh5_out.keys()) == {"is_valid_0vbb", "timestamp", "zacEmax_ctc_cal"} - assert entry % 5 == 0 + assert lh5_it.current_i_entry % 5 == 0 assert len(lh5_out) == 5 print(lh5_it.get_global_entrylist()) assert all( @@ -197,9 +199,9 @@ def test_iterate(more_lgnd_files): buffer_len=5, ) - for lh5_out, entry in lh5_it: + for lh5_out in lh5_it: assert set(lh5_out.keys()) == {"is_valid_0vbb", "timestamp", "zacEmax_ctc_cal"} - assert entry % 5 == 0 + assert lh5_it.current_i_entry % 5 == 0 assert len(lh5_out) == 5 with pytest.raises(ValueError): diff --git a/tests/types/test_vectorofvectors.py b/tests/types/test_vectorofvectors.py index f1b2e1b1..59b0f2fc 100644 --- a/tests/types/test_vectorofvectors.py +++ b/tests/types/test_vectorofvectors.py @@ -469,5 +469,5 @@ def test_lh5_iterator_view_as(lgnd_test_data): "ch1067205/dsp/energies", ) - for obj, _ in it: + for obj in it: assert ak.is_valid(obj.view_as("ak")) From 2e0f5973aab238ae9271a8b995a65caee75a4c72 Mon Sep 17 00:00:00 2001 From: iguinn Date: Fri, 20 Dec 2024 10:35:34 -0800 Subject: [PATCH 21/27] Fixed tests --- tests/lh5/test_lh5_store.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/lh5/test_lh5_store.py b/tests/lh5/test_lh5_store.py index 75352772..15082dcf 100644 --- a/tests/lh5/test_lh5_store.py +++ b/tests/lh5/test_lh5_store.py @@ -292,10 +292,9 @@ def test_read_table_fancy_idx(lh5_file): assert isinstance(lh5_obj, types.Table) assert len(lh5_obj) == 2 - lh5_obj, n_rows = store.read("/data/struct/table", lh5_file, idx=[]) + lh5_obj = store.read("/data/struct/table", lh5_file, idx=[]) assert isinstance(lh5_obj, types.Table) - assert n_rows == 0 - + assert len(lh5_obj) == 0 def test_read_empty_struct(lh5_file): store = lh5.LH5Store() @@ -460,14 +459,12 @@ def test_read_lgnd_vov_fancy_idx(lgnd_file): assert (lh5_obj.cumulative_length.nda == [1, 2, 3, 4, 5, 6, 7]).all() assert (lh5_obj.flattened_data.nda == [40, 60, 64, 60, 64, 28, 60]).all() - lh5_obj, n_rows = store.read("/geds/raw/tracelist", lgnd_file, idx=[]) + lh5_obj = store.read("/geds/raw/tracelist", lgnd_file, idx=[]) assert isinstance(lh5_obj, types.VectorOfVectors) - assert n_rows == 0 assert len(lh5_obj) == 0 - lh5_obj, n_rows = 
store.read("/geds/raw/tracelist", [lgnd_file] * 3, idx=[250]) + lh5_obj = store.read("/geds/raw/tracelist", [lgnd_file] * 3, idx=[250]) assert isinstance(lh5_obj, types.VectorOfVectors) - assert n_rows == 1 assert len(lh5_obj) == 1 From 10193a973b7c1f8e484e9ac99cfed580e1195d88 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 18:35:55 +0000 Subject: [PATCH 22/27] style: pre-commit fixes --- tests/lh5/test_lh5_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/lh5/test_lh5_store.py b/tests/lh5/test_lh5_store.py index 15082dcf..01958928 100644 --- a/tests/lh5/test_lh5_store.py +++ b/tests/lh5/test_lh5_store.py @@ -296,6 +296,7 @@ def test_read_table_fancy_idx(lh5_file): assert isinstance(lh5_obj, types.Table) assert len(lh5_obj) == 0 + def test_read_empty_struct(lh5_file): store = lh5.LH5Store() lh5_obj = store.read("/data/struct/empty_struct", lh5_file) From 847c19cf5780bbd557b98bb9c12db840eb781bfc Mon Sep 17 00:00:00 2001 From: iguinn Date: Tue, 14 Jan 2025 11:10:13 -0800 Subject: [PATCH 23/27] Fixed broken test --- src/lgdo/types/histogram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py index dc6cf1c2..061a1d1e 100644 --- a/src/lgdo/types/histogram.py +++ b/src/lgdo/types/histogram.py @@ -424,7 +424,7 @@ def __setitem__(self, name: str, obj: LGDO) -> None: dict.__setitem__(self, name, obj) else: msg = "histogram fields cannot be mutated " - raise TypeError(msg) + raise AttributeError(msg) def __getattr__(self, name: str) -> None: # do not allow for new attributes on this From 582960c448dee905a8530c1969dff6a96371e079 Mon Sep 17 00:00:00 2001 From: iguinn Date: Fri, 24 Jan 2025 09:06:39 -0800 Subject: [PATCH 24/27] Fixed tutorial notebook --- docs/source/notebooks/LH5Files.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/notebooks/LH5Files.ipynb b/docs/source/notebooks/LH5Files.ipynb index a03e29b0..1a13dffb 100644 --- a/docs/source/notebooks/LH5Files.ipynb +++ b/docs/source/notebooks/LH5Files.ipynb @@ -191,8 +191,8 @@ "source": [ "from lgdo.lh5 import LH5Iterator\n", "\n", - "for lh5_obj, entry in LH5Iterator(lh5_file, \"geds/raw/energy\", buffer_len=20):\n", - " print(f\"entry {entry}, energy = {lh5_obj} ({len(lh5_obj)} rows)\")" + "for lh5_obj in LH5Iterator(lh5_file, \"geds/raw/energy\", buffer_len=20):\n", + " print(f\"energy = {lh5_obj} ({len(lh5_obj)} rows)\")" ] }, { From d1e92e8811eab5d95995362e331a22cf7a94f547 Mon Sep 17 00:00:00 2001 From: iguinn Date: Fri, 14 Feb 2025 11:42:07 -0800 Subject: [PATCH 25/27] Added ability to specify start and number of entries for iteration --- src/lgdo/lh5/iterator.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/lgdo/lh5/iterator.py b/src/lgdo/lh5/iterator.py index 83793bc8..e023b614 100644 --- a/src/lgdo/lh5/iterator.py +++ b/src/lgdo/lh5/iterator.py @@ -68,6 +68,8 @@ def __init__( base_path: str = "", entry_list: list[int] | list[list[int]] | None = None, entry_mask: list[bool] | list[list[bool]] | None = None, + i_start: int = 0, + n_entries: int | None = None, field_mask: dict[str, bool] | list[str] | tuple[str] | None = None, buffer_len: int = "100*MB", file_cache: int = 10, @@ -92,6 +94,10 @@ def __init__( entry_mask mask of entries to read. If a list of arrays is provided, expect one for each file. Ignore if a selection list is provided. 
+ i_start + index of first entry to start at when iterating + n_entries + number of entries to read before terminating iteration field_mask mask of which fields to read. See :meth:`LH5Store.read` for more details. @@ -186,6 +192,8 @@ def __init__( msg = f"can't open any files from {lh5_files}" raise RuntimeError(msg) + self.i_start = i_start + self.n_entries = n_entries self.current_i_entry = 0 self.next_i_entry = 0 @@ -319,13 +327,22 @@ def get_global_entrylist(self) -> np.ndarray: ) return self.global_entry_list - def read(self, i_entry: int) -> LGDO: + def read(self, i_entry: int, n_entries: int | None = None) -> LGDO: "Read the next local chunk of events, starting at entry." - i_file = np.searchsorted(self.entry_map, i_entry, "right") self.lh5_buffer.resize(0) + + if n_entries is None: + n_entries = self.buffer_len + elif n_entries==0: + return self.lh5_buffer + elif n_entries > self.buffer_len: + msg = "n_entries cannot be larger than buffer_len" + raise ValueError(msg) + # if file hasn't been opened yet, search through files # sequentially until we find the right one + i_file = np.searchsorted(self.entry_map, i_entry, "right") if i_file < len(self.lh5_files) and self.entry_map[i_file] == np.iinfo("q").max: while i_file < len(self.lh5_files) and i_entry >= self._get_file_cumentries( i_file @@ -336,7 +353,7 @@ return self.lh5_buffer local_i_entry = i_entry - self._get_file_cumentries(i_file - 1) - while len(self.lh5_buffer) < self.buffer_len and i_file < len(self.file_map): + while len(self.lh5_buffer) < n_entries and i_file < len(self.file_map): # Loop through files local_idx = self.get_file_entrylist(i_file) if local_idx is not None and len(local_idx) == 0: @@ -349,7 +366,7 @@ self.groups[i_file], self.lh5_files[i_file], start_row=i_local, - n_rows=self.buffer_len - len(self.lh5_buffer), + n_rows=n_entries - len(self.lh5_buffer), idx=local_idx, field_mask=self.field_mask, obj_buf=self.lh5_buffer, @@ -485,12 +502,16 @@ def __len__(self) -> int: def __iter__(self) -> typing.Iterator: """Loop through entries in blocks of size buffer_len.""" self.current_i_entry = 0 - self.next_i_entry = 0 + self.next_i_entry = self.i_start return self def __next__(self) -> tuple[LGDO, int, int]: """Read next buffer_len entries and return lh5_table and iterator entry.""" - buf = self.read(self.next_i_entry) + n_entries = self.n_entries + if n_entries is not None: + n_entries = min(self.buffer_len, n_entries+self.i_start-self.next_i_entry) + + buf = self.read(self.next_i_entry, n_entries) if len(buf) == 0: raise StopIteration self.next_i_entry = self.current_i_entry + len(buf)
From 58d7872179c002731d79b2eb6e6d07b7d4538192 Mon Sep 17 00:00:00 2001 From: iguinn Date: Fri, 14 Feb 2025 11:42:53 -0800 Subject: [PATCH 26/27] Test use of start and n_entries for iterator --- tests/lh5/test_lh5_iterator.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/lh5/test_lh5_iterator.py b/tests/lh5/test_lh5_iterator.py index 273612cf..f3de6183 100644 --- a/tests/lh5/test_lh5_iterator.py +++ b/tests/lh5/test_lh5_iterator.py @@ -219,3 +219,37 @@ def test_iterate(more_lgnd_files): field_mask=["is_valid_0vbb", "timestamp", "zacEmax_ctc_cal"], buffer_len=5, ) + + +def test_range(lgnd_file): + lh5_it = lh5.LH5Iterator( + lgnd_file, + "/geds/raw", + field_mask=["baseline"], + buffer_len=5, + i_start = 7, + n_entries = 13 + ) + + # Test error when n_entries > buffer_len + with pytest.raises(ValueError): + lh5_obj = lh5_it.read(4, n_entries=7) + + lh5_obj = lh5_it.read(4, n_entries=3) + assert len(lh5_obj) == 3 + assert isinstance(lh5_obj, lgdo.Table) + assert list(lh5_obj.keys()) == ["baseline"] + assert ( + lh5_obj["baseline"].nda == np.array([14353, 14254, 14525]) + ).all() + + exp_i_entries = [7, 12, 17] + exp_lens = [5, 5, 3] + for lh5_obj, exp_i, exp_len in zip(lh5_it, exp_i_entries, exp_lens): + entry = lh5_it.current_i_entry + assert len(lh5_obj) == exp_len + assert entry == exp_i + assert all(lh5_it.current_local_entries == np.arange(entry, entry + exp_len)) + assert all(lh5_it.current_global_entries == np.arange(entry, entry + exp_len)) + assert all(lh5_it.current_files == [lgnd_file] * exp_len) + assert all(lh5_it.current_groups == ["/geds/raw"] * exp_len)
From ce1b6fe268e69ec71932368f3830abf791bf1d0e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 19:43:25 +0000 Subject: [PATCH 27/27] style: pre-commit fixes --- src/lgdo/lh5/iterator.py | 11 ++++++----- tests/lh5/test_lh5_iterator.py | 10 ++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/lgdo/lh5/iterator.py b/src/lgdo/lh5/iterator.py index e023b614..6f4d4d46 100644 --- a/src/lgdo/lh5/iterator.py +++ b/src/lgdo/lh5/iterator.py @@ -330,15 +330,14 @@ def get_global_entrylist(self) -> np.ndarray: def read(self, i_entry: int, n_entries: int | None = None) -> LGDO: "Read the next local chunk of events, starting at entry." self.lh5_buffer.resize(0) - + if n_entries is None: n_entries = self.buffer_len - elif n_entries==0: + elif n_entries == 0: return self.lh5_buffer elif n_entries > self.buffer_len: msg = "n_entries cannot be larger than buffer_len" raise ValueError(msg) - # if file hasn't been opened yet, search through files # sequentially until we find the right one @@ -509,8 +508,10 @@ def __next__(self) -> tuple[LGDO, int, int]: """Read next buffer_len entries and return lh5_table and iterator entry.""" n_entries = self.n_entries if n_entries is not None: - n_entries = min(self.buffer_len, n_entries+self.i_start-self.next_i_entry) - + n_entries = min( + self.buffer_len, n_entries + self.i_start - self.next_i_entry + ) + buf = self.read(self.next_i_entry, n_entries) diff --git a/tests/lh5/test_lh5_iterator.py b/tests/lh5/test_lh5_iterator.py index f3de6183..6d5e3e98 100644 --- a/tests/lh5/test_lh5_iterator.py +++ b/tests/lh5/test_lh5_iterator.py @@ -227,19 +227,19 @@ def test_range(lgnd_file): "/geds/raw", field_mask=["baseline"], buffer_len=5, - i_start = 7, - n_entries = 13 + i_start=7, + n_entries=13, ) # Test error when n_entries > buffer_len with pytest.raises(ValueError): lh5_obj = lh5_it.read(4, n_entries=7) - + lh5_obj = lh5_it.read(4, n_entries=3) assert len(lh5_obj) == 3 assert isinstance(lh5_obj, lgdo.Table) assert list(lh5_obj.keys()) == ["baseline"] - assert ( - lh5_obj["baseline"].nda == np.array([14353, 14254, 14525]) - ).all() + assert (lh5_obj["baseline"].nda == np.array([14353, 14254, 14525])).all() exp_i_entries = [7, 12, 17] exp_lens = [5, 5, 3]
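Taken together, patches 25-27 let an iterator cover a sub-range of entries. A closing usage sketch (hypothetical file; the values mirror test_range above):

    from lgdo.lh5 import LH5Iterator

    it = LH5Iterator(
        "file.lh5",
        "geds/raw",
        field_mask=["baseline"],
        buffer_len=5,
        i_start=7,
        n_entries=13,
    )
    for lh5_obj in it:
        print(it.current_i_entry, len(lh5_obj))  # (7, 5), (12, 5), (17, 3)

    buf = it.read(4, n_entries=3)  # random access: 3 entries starting at entry 4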