From 4a4f4b19aa237fb47bd7c904df140f53d5c08207 Mon Sep 17 00:00:00 2001 From: Agisilaos Kounelis <36283973+kounelisagis@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:36:49 +0300 Subject: [PATCH] Move `_setitem_impl_sparse` to Python (#2092) --- tiledb/aggregation.py | 2 +- tiledb/dataframe_.py | 2 +- tiledb/libtiledb.pxd | 1 - tiledb/libtiledb.pyx | 186 +--------------------------------------- tiledb/sparse_array.py | 190 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 194 insertions(+), 187 deletions(-) diff --git a/tiledb/aggregation.py b/tiledb/aggregation.py index a19864d023..d1ba5dd98c 100644 --- a/tiledb/aggregation.py +++ b/tiledb/aggregation.py @@ -41,7 +41,7 @@ def __getitem__(self, selection): q = PyAgg(array.ctx, array, layout, self.attr_to_aggs) - from .libtiledb import ( + from .array import ( index_as_tuple, index_domain_subarray, replace_ellipsis, diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index 489bb10f5d..d030c4d8f0 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -743,7 +743,7 @@ def _write_array( else: coords.append(df.index.get_level_values(k)) # TODO ensure correct col/dim ordering - tiledb.libtiledb._setitem_impl_sparse( + tiledb.sparse_array._setitem_impl_sparse( A, tuple(coords), write_dict, nullmaps ) diff --git a/tiledb/libtiledb.pxd b/tiledb/libtiledb.pxd index fe49b3c5c8..ba22ce9dc9 100644 --- a/tiledb/libtiledb.pxd +++ b/tiledb/libtiledb.pxd @@ -1101,7 +1101,6 @@ cdef extern from "tiledb/tiledb_experimental.h": # Free helper functions cpdef unicode ustring(object s) -cpdef check_error(object ctx, int rc) cdef _raise_tiledb_error(tiledb_error_t* err_ptr) cdef _raise_ctx_err(tiledb_ctx_t* ctx_ptr, int rc) diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx index efa98fda8c..d7876c9b00 100644 --- a/tiledb/libtiledb.pyx +++ b/tiledb/libtiledb.pyx @@ -2,25 +2,22 @@ #cython: embedsignature=True #cython: auto_pickle=False -from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_IsValid, PyCapsule_New +from cpython.pycapsule cimport PyCapsule_GetPointer from cpython.version cimport PY_MAJOR_VERSION -from .domain_indexer import DomainIndexer include "common.pxi" include "indexing.pyx" include "libmetadata.pyx" -import io -import warnings import collections.abc -from collections import OrderedDict -from json import dumps as json_dumps, loads as json_loads +from json import loads as json_loads -from ._generated_version import version_tuple as tiledbpy_version from .cc import TileDBError from .ctx import Config, Ctx, default_ctx +from .domain_indexer import DomainIndexer from .vfs import VFS from .sparse_array import SparseArrayImpl from .dense_array import DenseArrayImpl +from .array import Array ############################################################################### # Numpy initialization code (critical) # @@ -353,12 +350,6 @@ cdef _raise_ctx_err(tiledb_ctx_t* ctx_ptr, int rc): _raise_tiledb_error(err_ptr) -cpdef check_error(object ctx, int rc): - cdef tiledb_ctx_t* ctx_ptr = PyCapsule_GetPointer( - ctx.__capsule__(), "ctx") - _raise_ctx_err(ctx_ptr, rc) - - cpdef unicode ustring(object s): """Coerce a python object to a unicode string""" @@ -384,7 +375,6 @@ cdef bytes unicode_path(object path): # # ############################################################################### -from .array import _tiledb_datetime_extent, index_as_tuple, replace_ellipsis, replace_scalars_slice, check_for_floats, index_domain_subarray, Array cdef class Query(object): """ @@ -758,171 +748,3 @@ def write_direct_dense(self: Array, np.ndarray array not None, **kw): tiledb_subarray_free(&subarray_ptr) tiledb_query_free(&query_ptr) return - -# point query index a tiledb array (zips) columnar index vectors -def index_domain_coords(dom, idx, check_ndim): - """ - Returns a (zipped) coordinate array representation - given coordinate indices in numpy's point indexing format - """ - ndim = len(idx) - - if check_ndim: - if ndim != dom.ndim: - raise IndexError("sparse index ndim must match domain ndim: " - "{0!r} != {1!r}".format(ndim, dom.ndim)) - - domain_coords = [] - for dim, sel in zip(dom, idx): - dim_is_string = (np.issubdtype(dim.dtype, np.str_) or - np.issubdtype(dim.dtype, np.bytes_)) - - if dim_is_string: - try: - # ensure strings contain only ASCII characters - domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1)) - except Exception as exc: - raise TileDBError(f'Dim\' strings may only contain ASCII characters') - else: - domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1)) - - idx = tuple(domain_coords) - - # check that all sparse coordinates are the same size and dtype - dim0 = dom.dim(0) - dim0_type = dim0.dtype - len0 = len(idx[0]) - for dim_idx in range(ndim): - dim_dtype = dom.dim(dim_idx).dtype - if len(idx[dim_idx]) != len0: - raise IndexError("sparse index dimension length mismatch") - - if np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_): - if not (np.issubdtype(idx[dim_idx].dtype, np.str_) or \ - np.issubdtype(idx[dim_idx].dtype, np.bytes_)): - raise IndexError("sparse index dimension dtype mismatch") - elif idx[dim_idx].dtype != dim_dtype: - raise IndexError("sparse index dimension dtype mismatch") - - return idx - -def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps): - cdef tiledb_ctx_t* ctx_ptr = safe_ctx_ptr(self.ctx) - cdef tiledb_array_t* array_ptr = PyCapsule_GetPointer(self.array.__capsule__(), "array") - cdef dict labels = dict() - - if not self.isopen or self.mode != 'w': - raise TileDBError("SparseArray is not opened for writing") - - set_dims_only = val is None - sparse_attributes = list() - sparse_values = list() - idx = index_as_tuple(selection) - sparse_coords = list(index_domain_coords(self.schema.domain, idx, not set_dims_only)) - - if set_dims_only: - _write_array( - ctx_ptr, - array_ptr, - self, - None, - sparse_coords, - sparse_attributes, - sparse_values, - labels, - nullmaps, - self.last_fragment_info, - True, - ) - return - - if not isinstance(val, dict): - if self.nattr > 1: - raise ValueError("Expected dict-like object {name: value} for multi-attribute " - "array.") - val = dict({self.attr(0).name: val}) - - # Create dictionary for label names and values from the dictionary - labels = { - name: - (data - if not type(data) is np.ndarray or data.dtype is np.dtype('O') - else np.ascontiguousarray(data, dtype=self.schema.dim_label(name).dtype)) - for name, data in val.items() - if self.schema.has_dim_label(name) - } - - # must iterate in Attr order to ensure that value order matches - for attr_idx in range(self.schema.nattr): - attr = self.attr(attr_idx) - name = attr.name - attr_val = val[name] - - try: - # ensure that the value is array-convertible, for example: pandas.Series - attr_val = np.asarray(attr_val) - - if attr.isvar: - if attr.isnullable and name not in nullmaps: - nullmaps[name] = np.array( - [int(v is not None) for v in attr_val], dtype=np.uint8) - else: - if (np.issubdtype(attr.dtype, np.bytes_) - and not (np.issubdtype(attr_val.dtype, np.bytes_) - or attr_val.dtype == np.dtype('O'))): - raise ValueError("Cannot write a string value to non-string " - "typed attribute '{}'!".format(name)) - - if attr.isnullable and name not in nullmaps: - try: - nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask - except Exception as exc: - nullmaps[name] = np.array( - [int(v is not None) for v in attr_val], dtype=np.uint8) - - if np.issubdtype(attr.dtype, np.bytes_): - attr_val = np.array(["" if v is None else v for v in attr_val]) - else: - attr_val = np.nan_to_num(attr_val) - attr_val = np.array([0 if v is None else v for v in attr_val]) - attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype) - - except Exception as exc: - raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc - - # set nullmap if nullable attribute does not have a nullmap already set - if attr.isnullable and attr.name not in nullmaps: - nullmaps[attr.name] = np.ones(attr_val.shape) - - # if dtype is ASCII, ensure all characters are valid - if attr.isascii: - try: - np.asarray(attr_val, dtype=np.bytes_) - except Exception as exc: - raise TileDBError(f'dtype of attr {attr.name} is "ascii" but attr_val contains invalid ASCII characters') - - ncells = sparse_coords[0].shape[0] - if attr_val.size != ncells: - raise ValueError("value length ({}) does not match " - "coordinate length ({})".format(attr_val.size, ncells)) - sparse_attributes.append(attr._internal_name) - sparse_values.append(attr_val) - - if (len(sparse_attributes) + len(labels) != len(val.keys())) \ - or (len(sparse_values) + len(labels) != len(val.values())): - raise TileDBError("Sparse write input data count does not match number of attributes") - - _write_array( - ctx_ptr, - array_ptr, - self, - None, - sparse_coords, - sparse_attributes, - sparse_values, - labels, - nullmaps, - self.last_fragment_info, - True, - ) - return diff --git a/tiledb/sparse_array.py b/tiledb/sparse_array.py index 7268834e05..94dc8e209e 100644 --- a/tiledb/sparse_array.py +++ b/tiledb/sparse_array.py @@ -14,6 +14,194 @@ ) +# point query index a tiledb array (zips) columnar index vectors +def index_domain_coords(dom, idx, check_ndim): + """ + Returns a (zipped) coordinate array representation + given coordinate indices in numpy's point indexing format + """ + ndim = len(idx) + + if check_ndim: + if ndim != dom.ndim: + raise IndexError( + "sparse index ndim must match domain ndim: " + "{0!r} != {1!r}".format(ndim, dom.ndim) + ) + + domain_coords = [] + for dim, sel in zip(dom, idx): + dim_is_string = np.issubdtype(dim.dtype, np.str_) or np.issubdtype( + dim.dtype, np.bytes_ + ) + + if dim_is_string: + try: + # ensure strings contain only ASCII characters + domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1)) + except Exception as exc: + raise tiledb.TileDBError( + f"Dim' strings may only contain ASCII characters" + ) + else: + domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1)) + + idx = tuple(domain_coords) + + # check that all sparse coordinates are the same size and dtype + dim0 = dom.dim(0) + dim0_type = dim0.dtype + len0 = len(idx[0]) + for dim_idx in range(ndim): + dim_dtype = dom.dim(dim_idx).dtype + if len(idx[dim_idx]) != len0: + raise IndexError("sparse index dimension length mismatch") + + if np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_): + if not ( + np.issubdtype(idx[dim_idx].dtype, np.str_) + or np.issubdtype(idx[dim_idx].dtype, np.bytes_) + ): + raise IndexError("sparse index dimension dtype mismatch") + elif idx[dim_idx].dtype != dim_dtype: + raise IndexError("sparse index dimension dtype mismatch") + + return idx + + +def _setitem_impl_sparse(self, selection, val, nullmaps: dict): + labels = dict() + + if not self.isopen or self.mode != "w": + raise tiledb.TileDBError("SparseArray is not opened for writing") + + set_dims_only = val is None + sparse_attributes = list() + sparse_values = list() + idx = index_as_tuple(selection) + sparse_coords = list( + index_domain_coords(self.schema.domain, idx, not set_dims_only) + ) + + from .libtiledb import _write_array_wrapper + + if set_dims_only: + _write_array_wrapper( + self, + None, + sparse_coords, + sparse_attributes, + sparse_values, + labels, + nullmaps, + True, + ) + return + + if not isinstance(val, dict): + if self.nattr > 1: + raise ValueError( + "Expected dict-like object {name: value} for multi-attribute " "array." + ) + val = dict({self.attr(0).name: val}) + + # Create dictionary for label names and values from the dictionary + labels = { + name: ( + data + if not type(data) is np.ndarray or data.dtype is np.dtype("O") + else np.ascontiguousarray(data, dtype=self.schema.dim_label(name).dtype) + ) + for name, data in val.items() + if self.schema.has_dim_label(name) + } + + # must iterate in Attr order to ensure that value order matches + for attr_idx in range(self.schema.nattr): + attr = self.attr(attr_idx) + name = attr.name + attr_val = val[name] + + try: + # ensure that the value is array-convertible, for example: pandas.Series + attr_val = np.asarray(attr_val) + + if attr.isvar: + if attr.isnullable and name not in nullmaps: + nullmaps[name] = np.array( + [int(v is not None) for v in attr_val], dtype=np.uint8 + ) + else: + if np.issubdtype(attr.dtype, np.bytes_) and not ( + np.issubdtype(attr_val.dtype, np.bytes_) + or attr_val.dtype == np.dtype("O") + ): + raise ValueError( + "Cannot write a string value to non-string " + "typed attribute '{}'!".format(name) + ) + + if attr.isnullable and name not in nullmaps: + try: + nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask + except Exception as exc: + nullmaps[name] = np.array( + [int(v is not None) for v in attr_val], dtype=np.uint8 + ) + + if np.issubdtype(attr.dtype, np.bytes_): + attr_val = np.array(["" if v is None else v for v in attr_val]) + else: + attr_val = np.nan_to_num(attr_val) + attr_val = np.array([0 if v is None else v for v in attr_val]) + attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype) + + except Exception as exc: + raise ValueError( + f"NumPy array conversion check failed for attr '{name}'" + ) from exc + + # set nullmap if nullable attribute does not have a nullmap already set + if attr.isnullable and attr.name not in nullmaps: + nullmaps[attr.name] = np.ones(attr_val.shape) + + # if dtype is ASCII, ensure all characters are valid + if attr.isascii: + try: + np.asarray(attr_val, dtype=np.bytes_) + except Exception as exc: + raise tiledb.TileDBError( + f'dtype of attr {attr.name} is "ascii" but attr_val contains invalid ASCII characters' + ) + + ncells = sparse_coords[0].shape[0] + if attr_val.size != ncells: + raise ValueError( + "value length ({}) does not match " + "coordinate length ({})".format(attr_val.size, ncells) + ) + sparse_attributes.append(attr._internal_name) + sparse_values.append(attr_val) + + if (len(sparse_attributes) + len(labels) != len(val.keys())) or ( + len(sparse_values) + len(labels) != len(val.values()) + ): + raise tiledb.TileDBError( + "Sparse write input data count does not match number of attributes" + ) + + _write_array_wrapper( + self, + None, + sparse_coords, + sparse_attributes, + sparse_values, + labels, + nullmaps, + True, + ) + + class SparseArrayImpl(Array): """Class representing a sparse TileDB array (internal). @@ -60,8 +248,6 @@ def __setitem__(self, selection, val): ... "a2": np.array([3, 4])} """ - from .libtiledb import _setitem_impl_sparse - _setitem_impl_sparse(self, selection, val, dict()) def __getitem__(self, selection):