From b07cad363eebe7215815c109272f30d52be4e7e8 Mon Sep 17 00:00:00 2001
From: Vivian Nguyen
Date: Wed, 27 Sep 2023 12:11:40 -0500
Subject: [PATCH 1/4] Additional Support For Nullables

---
 tiledb/core.cc                   |  3 +-
 tiledb/libtiledb.pyx             | 25 ++++++++++----
 tiledb/tests/test_enumeration.py | 56 ++++++++++----------------------
 3 files changed, 38 insertions(+), 46 deletions(-)

diff --git a/tiledb/core.cc b/tiledb/core.cc
index c6b8625e7a..43771cad85 100644
--- a/tiledb/core.cc
+++ b/tiledb/core.cc
@@ -1183,7 +1183,8 @@ class PyQuery {
     py::dict results;
     for (auto &buffer_name : buffers_order_) {
       auto bp = buffers_.at(buffer_name);
-      results[py::str(buffer_name)] = py::make_tuple(bp.data, bp.offsets);
+      results[py::str(buffer_name)] =
+          py::make_tuple(bp.data, bp.offsets, bp.validity);
     }
     return results;
   }
diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
index f0e0b181ad..c1ba173c31 100644
--- a/tiledb/libtiledb.pyx
+++ b/tiledb/libtiledb.pyx
@@ -1932,7 +1932,12 @@ cdef class DenseArrayImpl(Array):
                 enum_label = attr.enum_label
                 if enum_label is not None:
                     values = self.enum(enum_label).values()
-                    result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
+                    result[attr.name] = np.array(
+                        [
+                            values[idx] if idx < len(values) else None
+                            for idx in result[attr.name]
+                        ]
+                    )
             return result
 
     def __repr__(self):
@@ -2769,17 +2774,18 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
             if attr.isvar:
                 # ensure that the value is array-convertible, for example: pandas.Series
                 attr_val = np.asarray(attr_val)
+                if attr.isnullable and attr.name not in nullmaps:
+                    nullmaps[attr.name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
             else:
                 if (np.issubdtype(attr.dtype, np.string_) and not
                     (np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
                     raise ValueError("Cannot write a string value to non-string "
                                      "typed attribute '{}'!".format(name))
-
+
+                if attr.isnullable and attr.name not in nullmaps:
+                    nullmaps[attr.name] = ~np.ma.masked_invalid(attr_val).mask
                 attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
 
-            if attr.isnullable and attr.name not in nullmaps:
-                nullmaps[attr.name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
-
         except Exception as exc:
             raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
@@ -2919,7 +2925,11 @@ cdef class SparseArrayImpl(Array):
             enum_label = attr.enum_label
             if enum_label is not None:
                 values = self.enum(enum_label).values()
-                result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
+                if attr.isnullable:
+                    data = np.array([values[idx] for idx in result[attr.name].data])
+                    result[attr.name] = np.ma.array(data, mask=~result[attr.name].mask)
+                else:
+                    result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
         return result
 
     def query(self, attrs=None, cond=None, attr_cond=None, dims=None,
@@ -3207,6 +3217,9 @@ cdef class SparseArrayImpl(Array):
             else:
                 arr.dtype = el_dtype
             out[final_name] = arr
+
+            if self.schema.has_attr(final_name) and self.attr(final_name).isnullable:
+                out[final_name] = np.ma.array(out[final_name], mask=results[name][2])
 
         return out
 
diff --git a/tiledb/tests/test_enumeration.py b/tiledb/tests/test_enumeration.py
index 5b7486e026..5a4cd2d5e4 100644
--- a/tiledb/tests/test_enumeration.py
+++ b/tiledb/tests/test_enumeration.py
@@ -4,7 +4,7 @@
 
 import tiledb
 
-from .common import DiskTestCase, has_pandas
+from .common import DiskTestCase, has_pandas, has_pyarrow
 
 
 class EnumerationTest(DiskTestCase):
@@ -82,47 +82,25 @@ def test_array_schema_enumeration(self):
             assert_array_equal(A.df[:]["attr1"], A[:]["attr1"])
             assert_array_equal(A.df[:]["attr2"], A[:]["attr2"])
 
+    @pytest.mark.skipif(
+        not has_pyarrow() or not has_pandas(),
+        reason="pyarrow and/or pandas not installed",
+    )
     def test_array_schema_enumeration_nullable(self):
-        uri = self.path("test_array_schema_enumeration")
-        dom = tiledb.Domain(tiledb.Dim(domain=(1, 8), tile=1))
-        enum1 = tiledb.Enumeration("enmr1", False, np.arange(3) * 10)
-        enum2 = tiledb.Enumeration("enmr2", False, ["a", "bb", "ccc"])
-        attr1 = tiledb.Attr("attr1", dtype=np.int32, enum_label="enmr1")
-        attr2 = tiledb.Attr("attr2", dtype=np.int32, enum_label="enmr2")
-        attr3 = tiledb.Attr("attr3", dtype=np.int32)
-        schema = tiledb.ArraySchema(
-            domain=dom, attrs=(attr1, attr2, attr3), enums=(enum1, enum2)
-        )
-        tiledb.Array.create(uri, schema)
+        import pyarrow as pa
 
-        data1 = np.random.randint(0, 3, 8)
-        data2 = np.random.randint(0, 3, 8)
-        data3 = np.random.randint(0, 3, 8)
+        uri = self.path("test_array_schema_enumeration_nullable")
+        enmr = tiledb.Enumeration("e", False, ["alpha", "beta", "gamma"])
+        dom = tiledb.Domain(tiledb.Dim("d", domain=(0, 2147483646), dtype="int64"))
+        att = tiledb.Attr("a", dtype="int8", nullable=True, enum_label="e")
+        schema = tiledb.ArraySchema(domain=dom, attrs=[att], enums=[enmr], sparse=True)
+        tiledb.Array.create(uri, schema)
 
         with tiledb.open(uri, "w") as A:
-            A[:] = {"attr1": data1, "attr2": data2, "attr3": data3}
+            dims = pa.array([1, 2, 3, 4, 5])
+            data = pa.array([1.0, 2.0, None, 0, 1.0])
+            A[dims] = data
 
         with tiledb.open(uri, "r") as A:
-            assert A.enum("enmr1") == enum1
-            assert attr1.enum_label == "enmr1"
-            assert A.attr("attr1").enum_label == "enmr1"
-
-            assert A.enum("enmr2") == enum2
-            assert attr2.enum_label == "enmr2"
-            assert A.attr("attr2").enum_label == "enmr2"
-
-            with self.assertRaises(tiledb.TileDBError) as excinfo:
-                assert A.enum("enmr3") == []
-                assert " No enumeration named 'enmr3'" in str(excinfo.value)
-            assert attr3.enum_label is None
-            assert A.attr("attr3").enum_label is None
-
-            if has_pandas():
-                assert_array_equal(A.df[:]["attr1"].cat.codes, data1)
-                assert_array_equal(A.df[:]["attr2"].cat.codes, data2)
-
-                assert_array_equal(A.df[:]["attr1"], A.multi_index[:]["attr1"])
-                assert_array_equal(A.df[:]["attr2"], A.multi_index[:]["attr2"])
-
-                assert_array_equal(A.df[:]["attr1"], A[:]["attr1"])
-                assert_array_equal(A.df[:]["attr2"], A[:]["attr2"])
+            assert_array_equal(A[:]["a"].mask, [False, False, True, False, False])
+            assert_array_equal(A[:]["a"], A.df[:]["a"])
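
For illustration (not part of the patch): with this change every buffer entry returned from core carries a (data, offsets, validity) tuple, and the Python layer converts the validity bytes into NumPy masked arrays. A minimal sketch of that relationship, with made-up values:

import numpy as np

# validity bytes as delivered per cell: 1 = valid, 0 = null (illustrative values)
data = np.array([10, 20, 30], dtype=np.int32)
validity = np.array([1, 0, 1], dtype=np.uint8)

# cells whose validity byte is 0 end up behind the mask
masked = np.ma.array(data, mask=~validity.astype(bool))
print(masked)       # [10 -- 30]
print(masked.mask)  # [False  True False]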
From 40f17c711bb9948acdb53680f3cd8f1b05283a0c Mon Sep 17 00:00:00 2001
From: Vivian Nguyen
Date: Wed, 27 Sep 2023 16:05:48 -0500
Subject: [PATCH 2/4] Support Writing Pandas df And On Dense and Sparse Arrays

---
 tiledb/libtiledb.pyx             | 107 +++++++++++++++++++++++--------
 tiledb/tests/test_enumeration.py |  24 +++++--
 tiledb/tests/test_libtiledb.py   |  33 ++++++++++
 3 files changed, 130 insertions(+), 34 deletions(-)

diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
index c1ba173c31..2293834a8d 100644
--- a/tiledb/libtiledb.pyx
+++ b/tiledb/libtiledb.pyx
@@ -1925,20 +1925,26 @@ cdef class DenseArrayImpl(Array):
         if self.view_attr:
             result = self.subarray(selection, attrs=(self.view_attr,))
             return result[self.view_attr]
-        else:
-            result = self.subarray(selection)
-            for i in range(self.schema.nattr):
-                attr = self.schema.attr(i)
-                enum_label = attr.enum_label
-                if enum_label is not None:
-                    values = self.enum(enum_label).values()
+
+        result = self.subarray(selection)
+        for i in range(self.schema.nattr):
+            attr = self.schema.attr(i)
+            enum_label = attr.enum_label
+            if enum_label is not None:
+                values = self.enum(enum_label).values()
+                if attr.isnullable:
+                    data = np.array([values[idx] for idx in result[attr.name].data])
+                    result[attr.name] = np.ma.array(
+                        data, mask=~result[attr.name].mask)
+                else:
                     result[attr.name] = np.array(
-                        [
-                            values[idx] if idx < len(values) else None
-                            for idx in result[attr.name]
-                        ]
-                    )
-            return result
+                        [values[idx] for idx in result[attr.name]])
+            else:
+                if attr.isnullable:
+                    result[attr.name] = np.ma.array(result[attr.name].data,
+                                                    mask=~result[attr.name].mask)
+
+        return result
 
     def __repr__(self):
@@ -2187,6 +2193,10 @@ cdef class DenseArrayImpl(Array):
                 arr.shape = np.prod(output_shape)
 
             out[name] = arr
+
+            if self.schema.has_attr(name) and self.attr(name).isnullable:
+                out[name] = np.ma.array(out[name], mask=results[name][2].astype(bool))
+
         return out
 
     def __setitem__(self, object selection, object val):
@@ -2277,14 +2287,33 @@ cdef class DenseArrayImpl(Array):
             # Create list of attribute names and values
             for attr_idx in range(self.schema.nattr):
                 attr = self.schema.attr(attr_idx)
-                k = attr.name
-                v = val[k]
-                attr = self.schema.attr(k)
+                name = attr.name
+                attr_val = val[name]
+
                 attributes.append(attr._internal_name)
                 # object arrays are var-len and handled later
-                if type(v) is np.ndarray and v.dtype is not np.dtype('O'):
-                    v = np.ascontiguousarray(v, dtype=attr.dtype)
-                values.append(v)
+                if type(attr_val) is np.ndarray and attr_val.dtype is not np.dtype('O'):
+                    attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
+
+                try:
+                    if attr.isvar:
+                        # ensure that the value is array-convertible, for example: pandas.Series
+                        attr_val = np.asarray(attr_val)
+                        if attr.isnullable and name not in nullmaps:
+                            nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
+                    else:
+                        if (np.issubdtype(attr.dtype, np.string_) and not
+                            (np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
+                            raise ValueError("Cannot write a string value to non-string "
+                                             "typed attribute '{}'!".format(name))
+
+                        if attr.isnullable and name not in nullmaps:
+                            nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
+                        attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
+                except Exception as exc:
+                    raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
+
+                values.append(attr_val)
 
         elif np.isscalar(val):
             for i in range(self.schema.nattr):
@@ -2295,10 +2324,28 @@ cdef class DenseArrayImpl(Array):
                 values.append(A)
         elif self.schema.nattr == 1:
             attr = self.schema.attr(0)
+            name = attr.name
             attributes.append(attr._internal_name)
             # object arrays are var-len and handled later
            if type(val) is np.ndarray and val.dtype is not np.dtype('O'):
                 val = np.ascontiguousarray(val, dtype=attr.dtype)
+            try:
+                if attr.isvar:
+                    # ensure that the value is array-convertible, for example: pandas.Series
+                    val = np.asarray(val)
+                    if attr.isnullable and name not in nullmaps:
+                        nullmaps[name] = np.array([int(v is not None) for v in val], dtype=np.uint8)
+                else:
+                    if (np.issubdtype(attr.dtype, np.string_) and not
+                        (np.issubdtype(val.dtype, np.string_) or val.dtype == np.dtype('O'))):
+                        raise ValueError("Cannot write a string value to non-string "
+                                         "typed attribute '{}'!".format(name))
+
+                    if attr.isnullable and name not in nullmaps:
+                        nullmaps[name] = ~np.ma.masked_invalid(val).mask
+                    val = np.ascontiguousarray(val, dtype=attr.dtype)
+            except Exception as exc:
+                raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
             values.append(val)
         elif self.view_attr is not None:
             # Support single-attribute assignment for multi-attr array
@@ -2334,9 +2381,6 @@ cdef class DenseArrayImpl(Array):
             if not isinstance(val, np.ndarray):
                 raise TypeError(f"Expected NumPy array for attribute '{key}' "
                                 f"validity bitmap, got {type(val)}")
-            if val.dtype != np.uint8:
-                raise TypeError(f"Expected NumPy uint8 array for attribute '{key}' "
-                                f"validity bitmap, got {val.dtype}")
 
         _write_array(
             ctx_ptr,
@@ -2774,16 +2818,16 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
             if attr.isvar:
                 # ensure that the value is array-convertible, for example: pandas.Series
                 attr_val = np.asarray(attr_val)
-                if attr.isnullable and attr.name not in nullmaps:
-                    nullmaps[attr.name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
+                if attr.isnullable and name not in nullmaps:
+                    nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
             else:
                 if (np.issubdtype(attr.dtype, np.string_) and not
                     (np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
                     raise ValueError("Cannot write a string value to non-string "
                                      "typed attribute '{}'!".format(name))
 
-                if attr.isnullable and attr.name not in nullmaps:
-                    nullmaps[attr.name] = ~np.ma.masked_invalid(attr_val).mask
+                if attr.isnullable and name not in nullmaps:
+                    nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
                 attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
 
         except Exception as exc:
@@ -2927,9 +2971,16 @@ cdef class SparseArrayImpl(Array):
                 values = self.enum(enum_label).values()
                 if attr.isnullable:
                     data = np.array([values[idx] for idx in result[attr.name].data])
-                    result[attr.name] = np.ma.array(data, mask=~result[attr.name].mask)
+                    result[attr.name] = np.ma.array(
+                        data, mask=~result[attr.name].mask)
                 else:
-                    result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
+                    result[attr.name] = np.array(
+                        [values[idx] for idx in result[attr.name]])
+            else:
+                if attr.isnullable:
+                    result[attr.name] = np.ma.array(result[attr.name].data,
+                                                    mask=~result[attr.name].mask)
+
         return result
 
     def query(self, attrs=None, cond=None, attr_cond=None, dims=None,
diff --git a/tiledb/tests/test_enumeration.py b/tiledb/tests/test_enumeration.py
index 5a4cd2d5e4..8ad3c40725 100644
--- a/tiledb/tests/test_enumeration.py
+++ b/tiledb/tests/test_enumeration.py
@@ -86,21 +86,33 @@ def test_array_schema_enumeration(self):
         not has_pyarrow() or not has_pandas(),
         reason="pyarrow and/or pandas not installed",
     )
-    def test_array_schema_enumeration_nullable(self):
+    @pytest.mark.parametrize("sparse", [True, False])
+    @pytest.mark.parametrize("pass_df", [True, False])
+    def test_array_schema_enumeration_nullable(self, sparse, pass_df):
         import pyarrow as pa
 
         uri = self.path("test_array_schema_enumeration_nullable")
         enmr = tiledb.Enumeration("e", False, ["alpha", "beta", "gamma"])
-        dom = tiledb.Domain(tiledb.Dim("d", domain=(0, 2147483646), dtype="int64"))
+        dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
         att = tiledb.Attr("a", dtype="int8", nullable=True, enum_label="e")
-        schema = tiledb.ArraySchema(domain=dom, attrs=[att], enums=[enmr], sparse=True)
+        schema = tiledb.ArraySchema(
+            domain=dom, attrs=[att], enums=[enmr], sparse=sparse
+        )
         tiledb.Array.create(uri, schema)
 
         with tiledb.open(uri, "w") as A:
             dims = pa.array([1, 2, 3, 4, 5])
             data = pa.array([1.0, 2.0, None, 0, 1.0])
-            A[dims] = data
+            if pass_df:
+                dims = dims.to_pandas()
+                data = data.to_pandas()
+
+            if sparse:
+                A[dims] = data
+            else:
+                A[:] = data
 
         with tiledb.open(uri, "r") as A:
-            assert_array_equal(A[:]["a"].mask, [False, False, True, False, False])
-            assert_array_equal(A[:]["a"], A.df[:]["a"])
+            expected_validity = [False, False, True, False, False]
+            assert_array_equal(A[:]["a"].mask, expected_validity)
+            assert_array_equal(A.df[:]["a"].isna(), expected_validity)
diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py
index 6698e8baa6..a6c584b394 100644
--- a/tiledb/tests/test_libtiledb.py
+++ b/tiledb/tests/test_libtiledb.py
@@ -27,6 +27,7 @@
     assert_unordered_equal,
     fx_sparse_cell_order,  # noqa: F401
     has_pandas,
+    has_pyarrow,
     rand_ascii,
     rand_ascii_bytes,
     rand_utf8,
@@ -381,6 +382,38 @@ def test_array_delete(self):
 
         assert tiledb.array_exists(uri) is False
 
+    @pytest.mark.skipif(
+        not has_pyarrow() or not has_pandas(),
+        reason="pyarrow and/or pandas not installed",
+    )
+    @pytest.mark.parametrize("sparse", [True, False])
+    @pytest.mark.parametrize("pass_df", [True, False])
+    def test_array_write_nullable(self, sparse, pass_df):
+        import pyarrow as pa
+
+        uri = self.path("test_array_write_nullable")
+        dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
+        att = tiledb.Attr("a", dtype="int8", nullable=True)
+        schema = tiledb.ArraySchema(domain=dom, attrs=[att], sparse=sparse)
+        tiledb.Array.create(uri, schema)
+
+        with tiledb.open(uri, "w") as A:
+            dims = pa.array([1, 2, 3, 4, 5])
+            data = pa.array([1.0, 2.0, None, 0, 1.0])
+            if pass_df:
+                dims = dims.to_pandas()
+                data = data.to_pandas()
+
+            if sparse:
+                A[dims] = data
+            else:
+                A[:] = data
+
+        with tiledb.open(uri, "r") as A:
+            expected_validity = [False, False, True, False, False]
+            assert_array_equal(A[:]["a"].mask, expected_validity)
+            assert_array_equal(A.df[:]["a"].isna(), expected_validity)
+
 
 class DenseArrayTest(DiskTestCase):
     def test_array_1d(self):
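
For illustration (not part of the patch): the two null-map derivations added in this commit behave as follows on small made-up inputs.

import numpy as np

# fixed-size attributes: invalid floats (NaN/inf) are treated as null cells
vals = np.array([1.0, np.nan, 3.0])
print(~np.ma.masked_invalid(vals).mask)  # [ True False  True]

# var-size attributes: None entries are treated as null cells
var_vals = np.array(["a", None, "ccc"], dtype=object)
print(np.array([int(v is not None) for v in var_vals], dtype=np.uint8))  # [1 0 1]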
From 4520eb283eb7c6b5d04783eb226cb42f253879de Mon Sep 17 00:00:00 2001
From: Vivian Nguyen
Date: Wed, 27 Sep 2023 17:05:44 -0500
Subject: [PATCH 3/4] Use `nan_to_num` To Convert Invalid Values

---
 tiledb/libtiledb.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
index 2293834a8d..86d9e71381 100644
--- a/tiledb/libtiledb.pyx
+++ b/tiledb/libtiledb.pyx
@@ -2309,7 +2309,7 @@ cdef class DenseArrayImpl(Array):
 
                         if attr.isnullable and name not in nullmaps:
                             nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
-                        attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
+                        attr_val = np.ascontiguousarray(np.nan_to_num(attr_val), dtype=attr.dtype)
                 except Exception as exc:
                     raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
 
@@ -2342,8 +2342,8 @@ cdef class DenseArrayImpl(Array):
                                          "typed attribute '{}'!".format(name))
 
                     if attr.isnullable and name not in nullmaps:
-                        nullmaps[name] = ~np.ma.masked_invalid(val).mask
-                    val = np.ascontiguousarray(val, dtype=attr.dtype)
+                        nullmaps[name] = ~np.ma.fix_invalid(val).mask
+                    val = np.ascontiguousarray(np.nan_to_num(val), dtype=attr.dtype)
             except Exception as exc:
                 raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
             values.append(val)
@@ -2828,7 +2828,7 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
 
                 if attr.isnullable and name not in nullmaps:
                     nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
-                attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
+                attr_val = np.ascontiguousarray(np.nan_to_num(attr_val), dtype=attr.dtype)
 
         except Exception as exc:
             raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc

From ff35eda58097eccbc87cab067a0702807139c419 Mon Sep 17 00:00:00 2001
From: Vivian Nguyen
Date: Wed, 27 Sep 2023 17:37:29 -0500
Subject: [PATCH 4/4] Only Use `nan_to_num` For Nullable Attrs

---
 tiledb/libtiledb.pyx | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
index 86d9e71381..8cac1c804b 100644
--- a/tiledb/libtiledb.pyx
+++ b/tiledb/libtiledb.pyx
@@ -2309,7 +2309,8 @@ cdef class DenseArrayImpl(Array):
 
                         if attr.isnullable and name not in nullmaps:
                             nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
-                        attr_val = np.ascontiguousarray(np.nan_to_num(attr_val), dtype=attr.dtype)
+                            attr_val = np.nan_to_num(attr_val)
+                        attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
                 except Exception as exc:
                     raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
 
@@ -2343,7 +2344,8 @@ cdef class DenseArrayImpl(Array):
 
                     if attr.isnullable and name not in nullmaps:
                         nullmaps[name] = ~np.ma.fix_invalid(val).mask
-                    val = np.ascontiguousarray(np.nan_to_num(val), dtype=attr.dtype)
+                        val = np.nan_to_num(val)
+                    val = np.ascontiguousarray(val, dtype=attr.dtype)
             except Exception as exc:
                 raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
             values.append(val)
@@ -2828,7 +2830,8 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
 
                 if attr.isnullable and name not in nullmaps:
                     nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
-                attr_val = np.ascontiguousarray(np.nan_to_num(attr_val), dtype=attr.dtype)
+                    attr_val = np.nan_to_num(attr_val)
+                attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
 
         except Exception as exc:
             raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
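
For illustration (not part of the patches): an end-to-end sketch of the behavior the series implements, mirroring the new tests but using a plain pandas Series instead of pyarrow and a throwaway temporary directory for the array URI.

import tempfile

import numpy as np
import pandas as pd

import tiledb

uri = tempfile.mkdtemp() + "/nullable_demo"
dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
att = tiledb.Attr("a", dtype="int8", nullable=True)
tiledb.Array.create(uri, tiledb.ArraySchema(domain=dom, attrs=[att], sparse=True))

with tiledb.open(uri, "w") as A:
    # None becomes NaN in the float Series; the write path records it in the
    # validity map and nan_to_num zeroes it before the cast to int8
    A[np.arange(1, 6)] = pd.Series([1.0, 2.0, None, 0.0, 1.0])

with tiledb.open(uri, "r") as A:
    a = A[:]["a"]   # read back as a NumPy masked array
    print(a.mask)   # [False False  True False False]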