Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update CSRMatrix and push coverage to 100%, add to_uint_array util #485

Merged
merged 12 commits into from
Dec 4, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
- Add testing for Python 3.9 and using pre-release packages. @ajtritt, @rly (#459, #472)
- Improve contributing guide. @rly (#474)
- Add citation information to documentation and support for duecredit tool. @rly (#477, #488)
- Add type checking and conversion in `CSRMatrix`. @rly (#485)
- Clean up unreachable validator code. @rly (#483)

### Bug fixes
Expand Down
39 changes: 22 additions & 17 deletions src/hdmf/common/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from . import register_class
from ..container import Container
from ..utils import docval, getargs, call_docval_func
from ..utils import docval, getargs, call_docval_func, to_uint_array


@register_class('CSRMatrix')
Expand All @@ -22,32 +22,37 @@ def __init__(self, **kwargs):
data = getargs('data', kwargs)
if isinstance(data, (np.ndarray, h5py.Dataset)):
if data.ndim == 2:
data = sps.csr_matrix(self.data)
elif data.ndim == 1:
data = sps.csr_matrix(data)
elif data.ndim < 2:
indptr, indices, shape = getargs('indptr', 'indices', 'shape', kwargs)
if any(_ is None for _ in (indptr, indices, shape)):
raise ValueError("must specify indptr, indices, and shape when passing data array")
self.__check_ind(indptr, 'indptr')
self.__check_ind(indices, 'indices')
raise ValueError("Must specify 'indptr', 'indices', and 'shape' arguments when passing data array.")
indptr = self.__check_arr(indptr, 'indptr')
indices = self.__check_arr(indices, 'indices')
shape = self.__check_arr(shape, 'shape')
if len(shape) != 2:
raise ValueError('shape must specify two and only two dimensions')
raise ValueError("'shape' argument must specify two and only two dimensions.")
data = sps.csr_matrix((data, indices, indptr), shape=shape)
else:
raise ValueError("cannot use ndarray of dimensionality > 2")
raise ValueError("'data' argument cannot be ndarray of dimensionality > 2.")
self.__data = data
self.__shape = data.shape

@staticmethod
def __check_ind(ar, arg):
if not (ar.ndim == 1 or np.issubdtype(ar.dtype, int)):
raise ValueError('%s must be a 1D array of integers' % arg)
def __check_arr(ar, arg):
    """
    Convert an index-like constructor argument to a 1D array of unsigned integers.

    :param ar: the array or array-like value to convert
    :param arg: the name of the argument, used only to build error messages
    :returns: the result of passing ``ar`` through ``to_uint_array``
    :raises ValueError: if ``to_uint_array`` rejects ``ar`` or the converted array is not 1D
    """
    try:
        converted = to_uint_array(ar)
    except ValueError as conversion_error:
        failure_msg = "Cannot convert '%s' to an array of unsigned integers." % arg
        # chain the original error so the underlying reason stays visible in the traceback
        raise ValueError(failure_msg) from conversion_error
    if converted.ndim == 1:
        return converted
    raise ValueError("'%s' must be a 1D array of unsigned integers." % arg)

def __getattr__(self, val):
return getattr(self.__data, val)

@property
def shape(self):
oruebel marked this conversation as resolved.
Show resolved Hide resolved
return self.__shape
# NOTE: this provides access to self.data, self.indices, self.indptr, self.shape
attr = getattr(self.__data, val)
if val in ('indices', 'indptr', 'shape'): # needed because sps.csr_matrix may contain int arrays for these
attr = to_uint_array(attr)
return attr

def to_spmat(self):
return self.__data
19 changes: 19 additions & 0 deletions src/hdmf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,25 @@ def pystr(s):
return s


def to_uint_array(arr):
    """
    Return the input as a numpy array with an unsigned integer dtype.

    Inputs that are not already numpy arrays are first passed through ``np.array``. Arrays that already have an
    unsigned integer dtype are returned as is. Arrays with a signed integer dtype are cast to the unsigned dtype
    with the same item size (e.g., int32 becomes uint32), provided that no value is negative.

    :param arr: a numpy array or array-like object
    :returns: a numpy array with an unsigned integer dtype
    :raises ValueError: if the array has a negative integer value or a dtype that is not an integer dtype
    """
    values = arr if isinstance(arr, np.ndarray) else np.array(arr)
    source_dtype = values.dtype

    if np.issubdtype(source_dtype, np.unsignedinteger):
        return values
    if not np.issubdtype(source_dtype, np.integer):
        raise ValueError('Cannot convert array of dtype %s to uint.' % source_dtype)
    if (values < 0).any():
        raise ValueError('Cannot convert negative integer values to uint.')

    # same number of bits as the signed source dtype, so precision is kept
    unsigned_dtype = np.dtype('uint%d' % (source_dtype.itemsize * 8))
    return values.astype(unsigned_dtype)


class LabelledDict(dict):
"""A dict wrapper that allows querying by an attribute of the values and running a callable on removed items.

Expand Down
72 changes: 68 additions & 4 deletions tests/unit/common/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,87 @@ def test_from_sparse_matrix(self):
data = np.array([1, 2, 3, 4, 5, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
indptr = np.array([0, 2, 3, 6])
expected = CSRMatrix(data, indices, indptr, (3, 3))
shape = (3, 3)
expected = CSRMatrix(data, indices, indptr, shape)

sps_mat = sps.csr_matrix((data, indices, indptr), shape=(3, 3))
sps_mat = sps.csr_matrix((data, indices, indptr), shape=shape)
received = CSRMatrix(sps_mat)
self.assertContainerEqual(received, expected, ignore_hdmf_attrs=True)

def test_2d_data(self):
    """Check that a dense 2D array given as data yields the same stored values as scipy's csr_matrix."""
    dense = np.array([[1, 0, 2], [0, 0, 3], [4, 5, 6]])
    actual_values = CSRMatrix(dense).data
    expected_values = sps.csr_matrix(dense).data
    np.testing.assert_array_equal(actual_values, expected_values)

def test_getattrs(self):
    """Check that data, indices, indptr, and shape read back equal to the inputs, with unsigned index dtypes."""
    values = np.array([1, 2, 3, 4, 5, 6])
    indices_in = np.array([0, 2, 2, 0, 1, 2], dtype=np.int32)
    indptr_in = np.array([0, 2, 3, 6], dtype=np.int32)
    dims = (3, 3)
    matrix = CSRMatrix(values, indices_in, indptr_in, dims)

    np.testing.assert_array_equal(values, matrix.data)
    np.testing.assert_array_equal(indices_in, matrix.indices)
    np.testing.assert_array_equal(indptr_in, matrix.indptr)
    np.testing.assert_array_equal(dims, matrix.shape)

    # int32 inputs are expected to come back as uint32
    for index_attr in (matrix.indices, matrix.indptr):
        self.assertEqual(index_attr.dtype.type, np.uint32)

    # NOTE: scipy.sparse keeps shape as a tuple of ints; once converted to an ndarray its precision differs
    # by OS, so only the kind of dtype (unsigned integer) is checked here
    shape_is_unsigned = np.issubdtype(matrix.shape.dtype.type, np.unsignedinteger)
    self.assertTrue(shape_is_unsigned)

def test_to_spmat(self):
data = np.array([1, 2, 3, 4, 5, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
indptr = np.array([0, 2, 3, 6])
csr_mat = CSRMatrix(data, indices, indptr, (3, 3))
shape = (3, 3)
csr_mat = CSRMatrix(data, indices, indptr, shape)
spmat_array = csr_mat.to_spmat().toarray()

expected = np.asarray([[1, 0, 2], [0, 0, 3], [4, 5, 6]])
np.testing.assert_array_equal(spmat_array, expected)

# TODO more unit tests are needed for CSRMatrix
def test_constructor_indices_missing(self):
    """Check that a 1D data array given without indptr, indices, and shape raises a ValueError."""
    values = np.array([1, 2, 3, 4, 5, 6])
    expected_msg = "Must specify 'indptr', 'indices', and 'shape' arguments when passing data array."
    with self.assertRaisesWith(ValueError, expected_msg):
        CSRMatrix(values)

def test_constructor_bad_indices(self):
    """Check that an indices array containing a negative value raises a ValueError."""
    ctor_args = (
        np.array([1, 2, 3, 4, 5, 6]),  # data
        np.array([0, -2, 2, 0, 1, 2]),  # indices, with one negative entry
        np.array([0, 2, 3, 6]),  # indptr
        (3, 3),  # shape
    )
    expected_msg = "Cannot convert 'indices' to an array of unsigned integers."
    with self.assertRaisesWith(ValueError, expected_msg):
        CSRMatrix(*ctor_args)

def test_constructor_bad_indices_dim(self):
    """Check that a 2D indices array raises a ValueError."""
    ctor_args = (
        np.array([1, 2, 3, 4, 5, 6]),  # data
        np.array([[0, 2, 2, 0, 1, 2]]),  # indices, nested one level too deep
        np.array([0, 2, 3, 6]),  # indptr
        (3, 3),  # shape
    )
    expected_msg = "'indices' must be a 1D array of unsigned integers."
    with self.assertRaisesWith(ValueError, expected_msg):
        CSRMatrix(*ctor_args)

def test_constructor_bad_shape(self):
    """Check that a shape with a single dimension raises a ValueError."""
    ctor_args = (
        np.array([1, 2, 3, 4, 5, 6]),  # data
        np.array([0, 2, 2, 0, 1, 2]),  # indices
        np.array([0, 2, 3, 6]),  # indptr
        (3, ),  # shape, with only one dimension
    )
    expected_msg = "'shape' argument must specify two and only two dimensions."
    with self.assertRaisesWith(ValueError, expected_msg):
        CSRMatrix(*ctor_args)

def test_array_bad_dim(self):
    """Check that a 3D data array raises a ValueError."""
    values_3d = np.array([[[1, 2], [3, 4], [5, 6]]])
    indices_in = np.array([0, 2, 2, 0, 1, 2])
    indptr_in = np.array([0, 2, 3, 6])
    expected_msg = "'data' argument cannot be ndarray of dimensionality > 2."
    with self.assertRaisesWith(ValueError, expected_msg):
        CSRMatrix(values_3d, indices_in, indptr_in, (3, 3))


class TestCSRMatrixRoundTrip(H5RoundTripMixin, TestCase):
Expand Down
41 changes: 40 additions & 1 deletion tests/unit/utils_test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from hdmf.data_utils import DataChunkIterator, DataIO
from hdmf.testing import TestCase
from hdmf.utils import get_data_shape
from hdmf.utils import get_data_shape, to_uint_array


class TestGetDataShape(TestCase):
Expand Down Expand Up @@ -165,3 +165,42 @@ def test_strict_no_data_load(self):

res = get_data_shape(((1, 2), (3, 4), (5, 6)), strict_no_data_load=True)
self.assertTupleEqual(res, (3, 2))


class TestToUintArray(TestCase):
    """Unit tests for to_uint_array."""

    def test_ndarray_uint(self):
        """Test that an ndarray of unsigned integers is accepted and its values are unchanged."""
        arr = np.array([0, 1, 2], dtype=np.uint)
        res = to_uint_array(arr)
        np.testing.assert_array_equal(res, arr)

    def test_ndarray_int(self):
        """Test that an ndarray of non-negative signed integers is converted to unsigned integers."""
        # NOTE: the builtin int is used because the np.int alias was deprecated in NumPy 1.20 and removed in 1.24
        arr = np.array([0, 1, 2], dtype=int)
        res = to_uint_array(arr)
        np.testing.assert_array_equal(res, arr)
        # assert_array_equal does not compare dtypes, so check the conversion explicitly
        self.assertTrue(np.issubdtype(res.dtype, np.unsignedinteger))
        self.assertEqual(res.dtype.itemsize, arr.dtype.itemsize)

    def test_ndarray_int_neg(self):
        """Test that an ndarray of signed integers with a negative value raises a ValueError."""
        arr = np.array([0, -1, 2], dtype=int)
        with self.assertRaisesWith(ValueError, 'Cannot convert negative integer values to uint.'):
            to_uint_array(arr)

    def test_ndarray_float(self):
        """Test that an ndarray of floats raises a ValueError."""
        # NOTE: the builtin float is used because the np.float alias was deprecated in NumPy 1.20 and removed in 1.24
        arr = np.array([0, 1, 2], dtype=float)
        with self.assertRaisesWith(ValueError, 'Cannot convert array of dtype float64 to uint.'):
            to_uint_array(arr)

    def test_list_int(self):
        """Test that a list of non-negative integers is converted to an ndarray of unsigned integers."""
        arr = [0, 1, 2]
        res = to_uint_array(arr)
        expected = np.array([0, 1, 2], dtype=np.uint)
        np.testing.assert_array_equal(res, expected)
        # assert_array_equal does not compare dtypes, so check the conversion explicitly
        self.assertTrue(np.issubdtype(res.dtype, np.unsignedinteger))

    def test_list_int_neg(self):
        """Test that a list of integers with a negative value raises a ValueError."""
        arr = [0, -1, 2]
        with self.assertRaisesWith(ValueError, 'Cannot convert negative integer values to uint.'):
            to_uint_array(arr)

    def test_list_float(self):
        """Test that a list of floats raises a ValueError."""
        arr = [0., 1., 2.]
        with self.assertRaisesWith(ValueError, 'Cannot convert array of dtype float64 to uint.'):
            to_uint_array(arr)