Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional Support For Nullable Attributes #1836

Merged
merged 4 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion tiledb/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1183,7 +1183,8 @@ class PyQuery {
py::dict results;
for (auto &buffer_name : buffers_order_) {
auto bp = buffers_.at(buffer_name);
results[py::str(buffer_name)] = py::make_tuple(bp.data, bp.offsets);
results[py::str(buffer_name)] =
py::make_tuple(bp.data, bp.offsets, bp.validity);
}
return results;
}
Expand Down
113 changes: 90 additions & 23 deletions tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1925,15 +1925,26 @@ cdef class DenseArrayImpl(Array):
if self.view_attr:
result = self.subarray(selection, attrs=(self.view_attr,))
return result[self.view_attr]
else:
result = self.subarray(selection)
for i in range(self.schema.nattr):
attr = self.schema.attr(i)
enum_label = attr.enum_label
if enum_label is not None:
values = self.enum(enum_label).values()
result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
return result

result = self.subarray(selection)
for i in range(self.schema.nattr):
attr = self.schema.attr(i)
enum_label = attr.enum_label
if enum_label is not None:
values = self.enum(enum_label).values()
if attr.isnullable:
data = np.array([values[idx] for idx in result[attr.name].data])
result[attr.name] = np.ma.array(
data, mask=~result[attr.name].mask)
else:
result[attr.name] = np.array(
[values[idx] for idx in result[attr.name]])
else:
if attr.isnullable:
result[attr.name] = np.ma.array(result[attr.name].data,
mask=~result[attr.name].mask)

return result

def __repr__(self):
if self.isopen:
Expand Down Expand Up @@ -2182,6 +2193,10 @@ cdef class DenseArrayImpl(Array):
arr.shape = np.prod(output_shape)

out[name] = arr

if self.schema.has_attr(name) and self.attr(name).isnullable:
out[name] = np.ma.array(out[name], mask=results[name][2].astype(bool))

return out

def __setitem__(self, object selection, object val):
Expand Down Expand Up @@ -2272,14 +2287,34 @@ cdef class DenseArrayImpl(Array):
# Create list of attribute names and values
for attr_idx in range(self.schema.nattr):
attr = self.schema.attr(attr_idx)
k = attr.name
v = val[k]
attr = self.schema.attr(k)
name = attr.name
attr_val = val[name]

attributes.append(attr._internal_name)
# object arrays are var-len and handled later
if type(v) is np.ndarray and v.dtype is not np.dtype('O'):
v = np.ascontiguousarray(v, dtype=attr.dtype)
values.append(v)
if type(attr_val) is np.ndarray and attr_val.dtype is not np.dtype('O'):
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)

try:
if attr.isvar:
# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)
if attr.isnullable and name not in nullmaps:
nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
else:
if (np.issubdtype(attr.dtype, np.string_) and not
(np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
raise ValueError("Cannot write a string value to non-string "
"typed attribute '{}'!".format(name))

if attr.isnullable and name not in nullmaps:
nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
attr_val = np.nan_to_num(attr_val)
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
except Exception as exc:
raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc

values.append(attr_val)

elif np.isscalar(val):
for i in range(self.schema.nattr):
Expand All @@ -2290,10 +2325,29 @@ cdef class DenseArrayImpl(Array):
values.append(A)
elif self.schema.nattr == 1:
attr = self.schema.attr(0)
name = attr.name
attributes.append(attr._internal_name)
# object arrays are var-len and handled later
if type(val) is np.ndarray and val.dtype is not np.dtype('O'):
val = np.ascontiguousarray(val, dtype=attr.dtype)
try:
if attr.isvar:
# ensure that the value is array-convertible, for example: pandas.Series
val = np.asarray(val)
if attr.isnullable and name not in nullmaps:
nullmaps[name] = np.array([int(v is not None) for v in val], dtype=np.uint8)
else:
if (np.issubdtype(attr.dtype, np.string_) and not
(np.issubdtype(val.dtype, np.string_) or val.dtype == np.dtype('O'))):
raise ValueError("Cannot write a string value to non-string "
"typed attribute '{}'!".format(name))

if attr.isnullable and name not in nullmaps:
nullmaps[name] = ~np.ma.fix_invalid(val).mask
val = np.nan_to_num(val)
val = np.ascontiguousarray(val, dtype=attr.dtype)
except Exception as exc:
raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
values.append(val)
elif self.view_attr is not None:
# Support single-attribute assignment for multi-attr array
Expand Down Expand Up @@ -2329,9 +2383,6 @@ cdef class DenseArrayImpl(Array):
if not isinstance(val, np.ndarray):
raise TypeError(f"Expected NumPy array for attribute '{key}' "
f"validity bitmap, got {type(val)}")
if val.dtype != np.uint8:
raise TypeError(f"Expected NumPy uint8 array for attribute '{key}' "
f"validity bitmap, got {val.dtype}")

_write_array(
ctx_ptr,
Expand Down Expand Up @@ -2769,17 +2820,19 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
if attr.isvar:
# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)
if attr.isnullable and name not in nullmaps:
nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
else:
if (np.issubdtype(attr.dtype, np.string_) and not
(np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
raise ValueError("Cannot write a string value to non-string "
"typed attribute '{}'!".format(name))


if attr.isnullable and name not in nullmaps:
nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
attr_val = np.nan_to_num(attr_val)
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)

if attr.isnullable and attr.name not in nullmaps:
nullmaps[attr.name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)

except Exception as exc:
raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc

Expand Down Expand Up @@ -2919,7 +2972,18 @@ cdef class SparseArrayImpl(Array):
enum_label = attr.enum_label
if enum_label is not None:
values = self.enum(enum_label).values()
result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
if attr.isnullable:
data = np.array([values[idx] for idx in result[attr.name].data])
result[attr.name] = np.ma.array(
data, mask=~result[attr.name].mask)
else:
result[attr.name] = np.array(
[values[idx] for idx in result[attr.name]])
else:
if attr.isnullable:
result[attr.name] = np.ma.array(result[attr.name].data,
mask=~result[attr.name].mask)

return result

def query(self, attrs=None, cond=None, attr_cond=None, dims=None,
Expand Down Expand Up @@ -3207,6 +3271,9 @@ cdef class SparseArrayImpl(Array):
else:
arr.dtype = el_dtype
out[final_name] = arr

if self.schema.has_attr(final_name) and self.attr(final_name).isnullable:
out[final_name] = np.ma.array(out[final_name], mask=results[name][2])

return out

Expand Down
66 changes: 28 additions & 38 deletions tiledb/tests/test_enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import tiledb

from .common import DiskTestCase, has_pandas
from .common import DiskTestCase, has_pandas, has_pyarrow


class EnumerationTest(DiskTestCase):
Expand Down Expand Up @@ -82,47 +82,37 @@ def test_array_schema_enumeration(self):
assert_array_equal(A.df[:]["attr1"], A[:]["attr1"])
assert_array_equal(A.df[:]["attr2"], A[:]["attr2"])

def test_array_schema_enumeration_nullable(self):
uri = self.path("test_array_schema_enumeration")
dom = tiledb.Domain(tiledb.Dim(domain=(1, 8), tile=1))
enum1 = tiledb.Enumeration("enmr1", False, np.arange(3) * 10)
enum2 = tiledb.Enumeration("enmr2", False, ["a", "bb", "ccc"])
attr1 = tiledb.Attr("attr1", dtype=np.int32, enum_label="enmr1")
attr2 = tiledb.Attr("attr2", dtype=np.int32, enum_label="enmr2")
attr3 = tiledb.Attr("attr3", dtype=np.int32)
@pytest.mark.skipif(
not has_pyarrow() or not has_pandas(),
reason="pyarrow and/or pandas not installed",
)
@pytest.mark.parametrize("sparse", [True, False])
@pytest.mark.parametrize("pass_df", [True, False])
def test_array_schema_enumeration_nullable(self, sparse, pass_df):
import pyarrow as pa

uri = self.path("test_array_schema_enumeration_nullable")
enmr = tiledb.Enumeration("e", False, ["alpha", "beta", "gamma"])
dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
att = tiledb.Attr("a", dtype="int8", nullable=True, enum_label="e")
schema = tiledb.ArraySchema(
domain=dom, attrs=(attr1, attr2, attr3), enums=(enum1, enum2)
domain=dom, attrs=[att], enums=[enmr], sparse=sparse
)
tiledb.Array.create(uri, schema)

data1 = np.random.randint(0, 3, 8)
data2 = np.random.randint(0, 3, 8)
data3 = np.random.randint(0, 3, 8)

with tiledb.open(uri, "w") as A:
A[:] = {"attr1": data1, "attr2": data2, "attr3": data3}

with tiledb.open(uri, "r") as A:
assert A.enum("enmr1") == enum1
assert attr1.enum_label == "enmr1"
assert A.attr("attr1").enum_label == "enmr1"
dims = pa.array([1, 2, 3, 4, 5])
data = pa.array([1.0, 2.0, None, 0, 1.0])
if pass_df:
dims = dims.to_pandas()
data = data.to_pandas()

assert A.enum("enmr2") == enum2
assert attr2.enum_label == "enmr2"
assert A.attr("attr2").enum_label == "enmr2"

with self.assertRaises(tiledb.TileDBError) as excinfo:
assert A.enum("enmr3") == []
assert " No enumeration named 'enmr3'" in str(excinfo.value)
assert attr3.enum_label is None
assert A.attr("attr3").enum_label is None
if sparse:
A[dims] = data
else:
A[:] = data

if has_pandas():
assert_array_equal(A.df[:]["attr1"].cat.codes, data1)
assert_array_equal(A.df[:]["attr2"].cat.codes, data2)

assert_array_equal(A.df[:]["attr1"], A.multi_index[:]["attr1"])
assert_array_equal(A.df[:]["attr2"], A.multi_index[:]["attr2"])

assert_array_equal(A.df[:]["attr1"], A[:]["attr1"])
assert_array_equal(A.df[:]["attr2"], A[:]["attr2"])
with tiledb.open(uri, "r") as A:
expected_validity = [False, False, True, False, False]
assert_array_equal(A[:]["a"].mask, expected_validity)
assert_array_equal(A.df[:]["a"].isna(), expected_validity)
33 changes: 33 additions & 0 deletions tiledb/tests/test_libtiledb.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
assert_unordered_equal,
fx_sparse_cell_order, # noqa: F401
has_pandas,
has_pyarrow,
rand_ascii,
rand_ascii_bytes,
rand_utf8,
Expand Down Expand Up @@ -381,6 +382,38 @@ def test_array_delete(self):

assert tiledb.array_exists(uri) is False

    @pytest.mark.skipif(
        not has_pyarrow() or not has_pandas(),
        reason="pyarrow and/or pandas not installed",
    )
    @pytest.mark.parametrize("sparse", [True, False])
    @pytest.mark.parametrize("pass_df", [True, False])
    def test_array_write_nullable(self, sparse, pass_df):
        """Write pyarrow (or pandas) data containing a null to a nullable
        attribute and verify the validity information round-trips on read.

        Covers all four combinations of (sparse|dense) array and
        (pyarrow.Array | pandas.Series) input, via parametrization.
        """
        # Imported lazily so collection works without pyarrow; the skipif
        # above guarantees it is available when the test body runs.
        import pyarrow as pa

        uri = self.path("test_array_write_nullable")
        # 1-d int64 domain of size 5 with a single nullable int8 attribute "a".
        dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
        att = tiledb.Attr("a", dtype="int8", nullable=True)
        schema = tiledb.ArraySchema(domain=dom, attrs=[att], sparse=sparse)
        tiledb.Array.create(uri, schema)

        with tiledb.open(uri, "w") as A:
            # Index 2 is null; it must come back masked/NA on read.
            dims = pa.array([1, 2, 3, 4, 5])
            data = pa.array([1.0, 2.0, None, 0, 1.0])
            if pass_df:
                # Exercise the pandas.Series code path instead of pyarrow.Array.
                dims = dims.to_pandas()
                data = data.to_pandas()

            if sparse:
                # Sparse writes require explicit coordinates.
                A[dims] = data
            else:
                A[:] = data

        with tiledb.open(uri, "r") as A:
            # Mask is True exactly where the written value was null.
            expected_validity = [False, False, True, False, False]
            assert_array_equal(A[:]["a"].mask, expected_validity)
            # The pandas (.df) path reports the same nulls via isna().
            assert_array_equal(A.df[:]["a"].isna(), expected_validity)


class DenseArrayTest(DiskTestCase):
def test_array_1d(self):
Expand Down