Skip to content

Commit

Permalink
Warn when adding ragged arrays to DynamicTable without index argument (
Browse files Browse the repository at this point in the history
…#1066)

* add detection of ragged array inputs to table

* add tests for ragged array inputs to table

* add warnings for ragged inputs to table

* update CHANGELOG.md

* check only lists and tuples for raggedness

* add flag to turn off ragged data checks

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
stephprince and pre-commit-ci[bot] authored Mar 14, 2024
1 parent e9c2737 commit f092cbb
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Enhancements
- Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069)
- Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. @stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066)

## HDMF 3.12.2 (February 9, 2024)

Expand Down
26 changes: 23 additions & 3 deletions src/hdmf/common/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from . import register_class, EXP_NAMESPACE
from ..container import Container, Data
from ..data_utils import DataIO, AbstractDataChunkIterator
from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type
from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged
from ..term_set import TermSetWrapper


Expand Down Expand Up @@ -639,12 +639,16 @@ def __len__(self):
{'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None},
{'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique',
'default': False},
{'name': 'check_ragged', 'type': bool, 'default': True,
'doc': ('whether or not to check for ragged arrays when adding data to the table. '
'Set to False to avoid checking every element if performance issues occur.')},
allow_extra=True)
def add_row(self, **kwargs):
"""
Add a row to the table. If *id* is not provided, it will auto-increment.
"""
data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs)
data, row_id, enforce_unique_id, check_ragged = popargs('data', 'id', 'enforce_unique_id', 'check_ragged',
kwargs)
data = data if data is not None else kwargs

bad_data = []
Expand Down Expand Up @@ -709,6 +713,11 @@ def add_row(self, **kwargs):
c.add_vector(data[colname])
else:
c.add_row(data[colname])
if check_ragged and is_ragged(c.data):
warn(("Data has elements with different lengths and therefore cannot be coerced into an "
"N-dimensional array. Use the 'index' argument when creating a column to add rows "
"with different lengths."),
stacklevel=2)

def __eq__(self, other):
"""Compare if the two DynamicTables contain the same data.
Expand Down Expand Up @@ -748,6 +757,9 @@ def __eq__(self, other):
'doc': ('class to use to represent the column data. If table=True, this field is ignored and a '
'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData '
'object is used.')},
{'name': 'check_ragged', 'type': bool, 'default': True,
'doc': ('whether or not to check for ragged arrays when adding data to the table. '
'Set to False to avoid checking every element if performance issues occur.')},
allow_extra=True)
def add_column(self, **kwargs): # noqa: C901
"""
Expand All @@ -760,7 +772,7 @@ def add_column(self, **kwargs): # noqa: C901
:raises ValueError: if the column has already been added to the table
"""
name, data = getargs('name', 'data', kwargs)
index, table, enum, col_cls= popargs('index', 'table', 'enum', 'col_cls', kwargs)
index, table, enum, col_cls, check_ragged = popargs('index', 'table', 'enum', 'col_cls', 'check_ragged', kwargs)

if isinstance(index, VectorIndex):
warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be "
Expand Down Expand Up @@ -823,6 +835,14 @@ def add_column(self, **kwargs): # noqa: C901
# once we have created the column
create_vector_index = None
if ckwargs.get('data', None) is not None:

# if no index was provided, check that data is not ragged
if index is False and check_ragged and is_ragged(data):
warn(("Data has elements with different lengths and therefore cannot be coerced into an "
"N-dimensional array. Use the 'index' argument when adding a column of data with "
"different lengths."),
stacklevel=2)

# Check that we are asked to create an index
if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0:
# Iteratively flatten the data we use for the column based on the depth of the index to generate.
Expand Down
14 changes: 14 additions & 0 deletions src/hdmf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,20 @@ def to_uint_array(arr):
raise ValueError('Cannot convert array of dtype %s to uint.' % arr.dtype)


def is_ragged(data):
"""
Test whether a list of lists or array is ragged / jagged
"""
if isinstance(data, (list, tuple)):
lengths = [len(sub_data) if isinstance(sub_data, (list, tuple)) else 1 for sub_data in data]
if len(set(lengths)) > 1:
return True # ragged at this level

return any(is_ragged(sub_data) for sub_data in data) # check next level

return False


class LabelledDict(dict):
"""A dict wrapper that allows querying by an attribute of the values and running a callable on removed items.
Expand Down
68 changes: 68 additions & 0 deletions tests/unit/common/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,74 @@ def test_add_column_multi_index(self):
]
)

def test_add_column_without_required_index(self):
"""
Add a column with different element lengths without specifying an index parameter
"""
table = self.with_spec()
table.add_row(foo=5, bar=50.0, baz='lizard')
table.add_row(foo=5, bar=50.0, baz='lizard')

# testing adding column without a necessary index parameter
lol_data = [[1, 2, 3], [1, 2, 3, 4]]
str_data = [['a', 'b'], ['a', 'b', 'c']]
empty_data = [[1, 2], []]
multi_nested_data = [[[1, 2, 3], [1, 2, 3, 4]], [1, 2]]
tuple_data = ((1, 2, 3), (1, 2, 3, 4))

msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional "
"array. Use the 'index' argument when adding a column of data with different lengths.")
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col1', description='', data=lol_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col2', description='', data=str_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col3', description='', data=empty_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col4', description='', data=multi_nested_data,)
with self.assertWarnsWith(UserWarning, msg):
table.add_column(name='col5', description='', data=tuple_data,)

def test_add_column_without_required_index_and_no_ragged_check(self):
"""
Add a column with different element lengths without checking for raggedness
"""
lol_data = [[1, 2, 3], [1, 2, 3, 4]]
table = self.with_spec()
table.add_row(foo=5, bar=50.0, baz='lizard')
table.add_row(foo=5, bar=50.0, baz='lizard')
table.add_column(name='col1', description='', data=lol_data, check_ragged=False)

def test_add_row_without_required_index(self):
"""
Add rows with different element lengths without specifying an index parameter
"""

# test adding row of list data with different lengths without index parameter
msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional "
"array. Use the 'index' argument when creating a column to add rows with different lengths.")
table = self.with_spec()
table.add_column(name='qux', description='qux column')
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3])
with self.assertWarnsWith(UserWarning, msg):
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4])

# test adding row of tuple/str data with different lengths without index parameter
table = self.with_spec()
table.add_column(name='qux', description='qux column')
table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b'))
with self.assertWarnsWith(UserWarning, msg):
table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b', 'c'))

def test_add_row_without_required_index_and_no_ragged_check(self):
"""
Add rows with different element lengths without checking for raggedness
"""
table = self.with_spec()
table.add_column(name='qux', description='qux column')
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3])
table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4], check_ragged=False)

def test_add_column_auto_index_int(self):
"""
Add a column as a list of lists after we have already added data so that we need to create a single VectorIndex
Expand Down

0 comments on commit f092cbb

Please sign in to comment.