Skip to content

Commit

Permalink
Move _setitem_impl_sparse to Python (#2092)
Browse files Browse the repository at this point in the history
  • Loading branch information
kounelisagis committed Oct 23, 2024
1 parent 19e7812 commit 4a4f4b1
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 187 deletions.
2 changes: 1 addition & 1 deletion tiledb/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __getitem__(self, selection):

q = PyAgg(array.ctx, array, layout, self.attr_to_aggs)

from .libtiledb import (
from .array import (
index_as_tuple,
index_domain_subarray,
replace_ellipsis,
Expand Down
2 changes: 1 addition & 1 deletion tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ def _write_array(
else:
coords.append(df.index.get_level_values(k))
# TODO ensure correct col/dim ordering
tiledb.libtiledb._setitem_impl_sparse(
tiledb.sparse_array._setitem_impl_sparse(
A, tuple(coords), write_dict, nullmaps
)

Expand Down
1 change: 0 additions & 1 deletion tiledb/libtiledb.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1101,7 +1101,6 @@ cdef extern from "tiledb/tiledb_experimental.h":

# Free helper functions
cpdef unicode ustring(object s)
cpdef check_error(object ctx, int rc)
cdef _raise_tiledb_error(tiledb_error_t* err_ptr)
cdef _raise_ctx_err(tiledb_ctx_t* ctx_ptr, int rc)

Expand Down
186 changes: 4 additions & 182 deletions tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,22 @@
#cython: embedsignature=True
#cython: auto_pickle=False

from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_IsValid, PyCapsule_New
from cpython.pycapsule cimport PyCapsule_GetPointer
from cpython.version cimport PY_MAJOR_VERSION
from .domain_indexer import DomainIndexer

include "common.pxi"
include "indexing.pyx"
include "libmetadata.pyx"
import io
import warnings
import collections.abc
from collections import OrderedDict
from json import dumps as json_dumps, loads as json_loads
from json import loads as json_loads

from ._generated_version import version_tuple as tiledbpy_version
from .cc import TileDBError
from .ctx import Config, Ctx, default_ctx
from .domain_indexer import DomainIndexer
from .vfs import VFS
from .sparse_array import SparseArrayImpl
from .dense_array import DenseArrayImpl
from .array import Array

###############################################################################
# Numpy initialization code (critical) #
Expand Down Expand Up @@ -353,12 +350,6 @@ cdef _raise_ctx_err(tiledb_ctx_t* ctx_ptr, int rc):
_raise_tiledb_error(err_ptr)


cpdef check_error(object ctx, int rc):
cdef tiledb_ctx_t* ctx_ptr = <tiledb_ctx_t*>PyCapsule_GetPointer(
ctx.__capsule__(), "ctx")
_raise_ctx_err(ctx_ptr, rc)


cpdef unicode ustring(object s):
"""Coerce a python object to a unicode string"""

Expand All @@ -384,7 +375,6 @@ cdef bytes unicode_path(object path):
# #
###############################################################################

from .array import _tiledb_datetime_extent, index_as_tuple, replace_ellipsis, replace_scalars_slice, check_for_floats, index_domain_subarray, Array

cdef class Query(object):
"""
Expand Down Expand Up @@ -758,171 +748,3 @@ def write_direct_dense(self: Array, np.ndarray array not None, **kw):
tiledb_subarray_free(&subarray_ptr)
tiledb_query_free(&query_ptr)
return

# point query index a tiledb array (zips) columnar index vectors
def index_domain_coords(dom, idx, check_ndim):
"""
Returns a (zipped) coordinate array representation
given coordinate indices in numpy's point indexing format
"""
ndim = len(idx)

if check_ndim:
if ndim != dom.ndim:
raise IndexError("sparse index ndim must match domain ndim: "
"{0!r} != {1!r}".format(ndim, dom.ndim))

domain_coords = []
for dim, sel in zip(dom, idx):
dim_is_string = (np.issubdtype(dim.dtype, np.str_) or
np.issubdtype(dim.dtype, np.bytes_))

if dim_is_string:
try:
# ensure strings contain only ASCII characters
domain_coords.append(np.array(sel, dtype=np.bytes_, ndmin=1))
except Exception as exc:
raise TileDBError(f'Dim\' strings may only contain ASCII characters')
else:
domain_coords.append(np.array(sel, dtype=dim.dtype, ndmin=1))

idx = tuple(domain_coords)

# check that all sparse coordinates are the same size and dtype
dim0 = dom.dim(0)
dim0_type = dim0.dtype
len0 = len(idx[0])
for dim_idx in range(ndim):
dim_dtype = dom.dim(dim_idx).dtype
if len(idx[dim_idx]) != len0:
raise IndexError("sparse index dimension length mismatch")

if np.issubdtype(dim_dtype, np.str_) or np.issubdtype(dim_dtype, np.bytes_):
if not (np.issubdtype(idx[dim_idx].dtype, np.str_) or \
np.issubdtype(idx[dim_idx].dtype, np.bytes_)):
raise IndexError("sparse index dimension dtype mismatch")
elif idx[dim_idx].dtype != dim_dtype:
raise IndexError("sparse index dimension dtype mismatch")

return idx

def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
cdef tiledb_ctx_t* ctx_ptr = safe_ctx_ptr(self.ctx)
cdef tiledb_array_t* array_ptr = <tiledb_array_t*>PyCapsule_GetPointer(self.array.__capsule__(), "array")
cdef dict labels = dict()

if not self.isopen or self.mode != 'w':
raise TileDBError("SparseArray is not opened for writing")

set_dims_only = val is None
sparse_attributes = list()
sparse_values = list()
idx = index_as_tuple(selection)
sparse_coords = list(index_domain_coords(self.schema.domain, idx, not set_dims_only))

if set_dims_only:
_write_array(
ctx_ptr,
array_ptr,
self,
None,
sparse_coords,
sparse_attributes,
sparse_values,
labels,
nullmaps,
self.last_fragment_info,
True,
)
return

if not isinstance(val, dict):
if self.nattr > 1:
raise ValueError("Expected dict-like object {name: value} for multi-attribute "
"array.")
val = dict({self.attr(0).name: val})

# Create dictionary for label names and values from the dictionary
labels = {
name:
(data
if not type(data) is np.ndarray or data.dtype is np.dtype('O')
else np.ascontiguousarray(data, dtype=self.schema.dim_label(name).dtype))
for name, data in val.items()
if self.schema.has_dim_label(name)
}

# must iterate in Attr order to ensure that value order matches
for attr_idx in range(self.schema.nattr):
attr = self.attr(attr_idx)
name = attr.name
attr_val = val[name]

try:
# ensure that the value is array-convertible, for example: pandas.Series
attr_val = np.asarray(attr_val)

if attr.isvar:
if attr.isnullable and name not in nullmaps:
nullmaps[name] = np.array(
[int(v is not None) for v in attr_val], dtype=np.uint8)
else:
if (np.issubdtype(attr.dtype, np.bytes_)
and not (np.issubdtype(attr_val.dtype, np.bytes_)
or attr_val.dtype == np.dtype('O'))):
raise ValueError("Cannot write a string value to non-string "
"typed attribute '{}'!".format(name))

if attr.isnullable and name not in nullmaps:
try:
nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
except Exception as exc:
nullmaps[name] = np.array(
[int(v is not None) for v in attr_val], dtype=np.uint8)

if np.issubdtype(attr.dtype, np.bytes_):
attr_val = np.array(["" if v is None else v for v in attr_val])
else:
attr_val = np.nan_to_num(attr_val)
attr_val = np.array([0 if v is None else v for v in attr_val])
attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)

except Exception as exc:
raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc

# set nullmap if nullable attribute does not have a nullmap already set
if attr.isnullable and attr.name not in nullmaps:
nullmaps[attr.name] = np.ones(attr_val.shape)

# if dtype is ASCII, ensure all characters are valid
if attr.isascii:
try:
np.asarray(attr_val, dtype=np.bytes_)
except Exception as exc:
raise TileDBError(f'dtype of attr {attr.name} is "ascii" but attr_val contains invalid ASCII characters')

ncells = sparse_coords[0].shape[0]
if attr_val.size != ncells:
raise ValueError("value length ({}) does not match "
"coordinate length ({})".format(attr_val.size, ncells))
sparse_attributes.append(attr._internal_name)
sparse_values.append(attr_val)

if (len(sparse_attributes) + len(labels) != len(val.keys())) \
or (len(sparse_values) + len(labels) != len(val.values())):
raise TileDBError("Sparse write input data count does not match number of attributes")

_write_array(
ctx_ptr,
array_ptr,
self,
None,
sparse_coords,
sparse_attributes,
sparse_values,
labels,
nullmaps,
self.last_fragment_info,
True,
)
return
Loading

0 comments on commit 4a4f4b1

Please sign in to comment.