Skip to content

Commit

Permalink
[Backport release-1.15][c++] Fix offsets for nullable columns (#3659)
Browse files Browse the repository at this point in the history
* merge

* std::format -> fmt::format for C++17

---------

Co-authored-by: nguyenv <[email protected]>
  • Loading branch information
johnkerl and nguyenv authored Feb 3, 2025
1 parent 9843f18 commit 100a660
Show file tree
Hide file tree
Showing 10 changed files with 232 additions and 91 deletions.
2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/managed_query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ void load_managed_query(py::module& m) {
data.size(),
(const void*)data_info.ptr,
static_cast<uint64_t*>(nullptr),
static_cast<uint8_t*>(nullptr));
std::nullopt);
py::gil_scoped_acquire acquire;
})
.def(
Expand Down
105 changes: 89 additions & 16 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pyarrow as pa
import pytest
import somacore
from numpy.testing import assert_array_equal
from pandas.api.types import union_categoricals

import tiledbsoma as soma
Expand Down Expand Up @@ -256,8 +257,8 @@ def test_dataframe_with_enumeration(tmp_path):

with soma.DataFrame.open(tmp_path.as_posix()) as sdf:
df = sdf.read().concat()
np.testing.assert_array_equal(df["myint"].chunk(0).dictionary, enums["enmr1"])
np.testing.assert_array_equal(df["myfloat"].chunk(0).dictionary, enums["enmr2"])
assert_array_equal(df["myint"].chunk(0).dictionary, enums["enmr1"])
assert_array_equal(df["myfloat"].chunk(0).dictionary, enums["enmr2"])


@pytest.fixture
Expand Down Expand Up @@ -2025,28 +2026,100 @@ def test_arrow_table_sliced_writer(tmp_path):
sdf.write(table[:])

with soma.DataFrame.open(uri) as sdf:
pdf = sdf.read().concat().to_pandas()
pdf = sdf.read().concat()

np.testing.assert_array_equal(pdf["myint"], pydict["myint"])
np.testing.assert_array_equal(pdf["mystring"], pydict["mystring"])
np.testing.assert_array_equal(pdf["mybool"], pydict["mybool"])
assert_array_equal(pdf["myint"], pydict["myint"])
assert_array_equal(pdf["mystring"], pydict["mystring"])
assert_array_equal(pdf["mybool"], pydict["mybool"])

np.testing.assert_array_equal(pdf["myenumint"], pydict["myenumint"])
np.testing.assert_array_equal(pdf["myenumstr"], pydict["myenumstr"])
np.testing.assert_array_equal(pdf["myenumbool"], pydict["myenumbool"])
assert_array_equal(pdf["myenumint"], pydict["myenumint"])
assert_array_equal(pdf["myenumstr"], pydict["myenumstr"])
assert_array_equal(pdf["myenumbool"], pydict["myenumbool"])

with soma.DataFrame.open(uri, mode="w") as sdf:
mid = num_rows // 2
sdf.write(table[:mid])
sdf.write(table[mid:])

with soma.DataFrame.open(uri) as sdf:
pdf = sdf.read().concat().to_pandas()
pdf = sdf.read().concat()

np.testing.assert_array_equal(pdf["myint"], pydict["myint"])
np.testing.assert_array_equal(pdf["mystring"], pydict["mystring"])
np.testing.assert_array_equal(pdf["mybool"], pydict["mybool"])
assert_array_equal(pdf["myint"], pydict["myint"])
assert_array_equal(pdf["mystring"], pydict["mystring"])
assert_array_equal(pdf["mybool"], pydict["mybool"])

np.testing.assert_array_equal(pdf["myenumint"], pydict["myenumint"])
np.testing.assert_array_equal(pdf["myenumstr"], pydict["myenumstr"])
np.testing.assert_array_equal(pdf["myenumbool"], pydict["myenumbool"])
assert_array_equal(pdf["myenumint"], pydict["myenumint"])
assert_array_equal(pdf["myenumstr"], pydict["myenumstr"])
assert_array_equal(pdf["myenumbool"], pydict["myenumbool"])


def test_arrow_table_validity_with_slicing(tmp_path):
    # Nulls in every supported column type must survive both a whole-table
    # write and a write split into two slices (the sliced write exercises
    # offset handling in the Arrow validity bitmaps).
    uri = tmp_path.as_posix()
    num_rows = 10
    domain = ((0, np.iinfo(np.int64).max - 2050),)

    schema = pa.schema(
        [
            ("myint", pa.int32()),
            ("mystring", pa.large_string()),
            ("mybool", pa.bool_()),
            ("mydatetime", pa.timestamp("s")),
            ("myenum", pa.dictionary(pa.int64(), pa.large_string())),
        ]
    )

    soma.DataFrame.create(uri, schema=schema, domain=domain)

    pydict = {
        "soma_joinid": [None, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        "myint": [1, 2, 3, 4, 5, 6, None, 8, None, None],
        "mystring": ["g1", "g2", "g3", None, "g2", "g3", "g1", None, "g3", "g1"],
        "mybool": [True, True, True, False, True, False, None, False, None, None],
        # NaT at positions 0, 3 and 8; every other entry equals its index.
        "mydatetime": [
            np.datetime64("NaT", "s") if i in (0, 3, 8) else np.datetime64(i, "s")
            for i in range(num_rows)
        ],
        "myenum": pd.Categorical(
            ["g1", "g2", "g3", None, "g2", "g3", "g1", None, "g3", "g1"]
        ),
    }
    table = pa.Table.from_pydict(pydict)

    with soma.DataFrame.open(uri, "w") as A:
        with raises_no_typeguard(soma.SOMAError):
            # soma_joinid cannot be nullable
            A.write(table)

    pydict["soma_joinid"] = list(range(num_rows))
    table = pa.Table.from_pydict(pydict)

    def verify_roundtrip():
        # Read everything back and compare each column against the source.
        with soma.DataFrame.open(uri) as A:
            readback = A.read().concat()
            for col in ("myint", "mystring", "mybool", "mydatetime", "myenum"):
                assert_array_equal(readback[col], table[col])

    with soma.DataFrame.open(uri, "w") as A:
        A.write(table)
    verify_roundtrip()

    with soma.DataFrame.open(uri, "w") as A:
        mid = num_rows // 2
        A.write(table[:mid])
        A.write(table[mid:])
    verify_roundtrip()
35 changes: 35 additions & 0 deletions apis/python/tests/test_sparse_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2007,3 +2007,38 @@ def test(path, tiledb_config):
},
)
gc.collect()


def test_sparse_nd_array_null(tmp_path):
    # Null handling for sparse ND arrays: null coordinates are rejected,
    # while nulls in the non-nullable soma_data attribute are replaced by
    # the TileDB fill value on write.
    uri = tmp_path.as_posix()

    pydict = {
        "soma_dim_0": pa.array([None, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        "soma_data": pa.array(
            [None, 0, None, 1, 2, None, None, 3, 4, 5], type=pa.float64()
        ),
    }
    table = pa.Table.from_pydict(pydict)

    soma.SparseNDArray.create(uri, type=pa.int64(), shape=(10,))

    with soma.SparseNDArray.open(uri, "w") as arr:
        with raises_no_typeguard(soma.SOMAError):
            # Dimension (coordinate) columns may not contain nulls; the
            # first slice has a null in soma_dim_0, so the write must fail.
            arr.write(table[:5])
        arr.write(table[5:])

    pydict["soma_dim_0"] = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    table = pa.Table.from_pydict(pydict)

    with soma.SparseNDArray.open(uri, "w") as arr:
        arr.write(table[:5])
        arr.write(table[5:])

    with soma.SparseNDArray.open(uri) as arr:
        readback = arr.read().tables().concat()

    # soma_data is a non-nullable attribute. In ManagedQuery.set_array_data,
    # any null values present in non-nullable attributes get casted to
    # fill values. In the case for float64, the fill value is 0.
    np.testing.assert_array_equal(
        readback["soma_data"], table["soma_data"].fill_null(0)
    )
15 changes: 12 additions & 3 deletions libtiledbsoma/src/soma/column_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,14 +165,23 @@ ColumnBuffer::~ColumnBuffer() {

void ColumnBuffer::attach(Query& query, std::optional<Subarray> subarray) {
    // Dense-array writes address dimension columns through a subarray
    // rather than through dimension data buffers; every other case attaches
    // the column's data/offsets/validity buffers directly to the query.
    auto schema = query.array().schema();
    bool is_write = query.query_type() == TILEDB_WRITE;
    bool is_dense = schema.array_type() == TILEDB_DENSE;
    bool is_dim = schema.domain().has_dimension(name_);
    bool use_subarray = is_write && is_dense && is_dim;

    if (use_subarray && !subarray.has_value()) {
        throw TileDBSOMAError(
            "[ColumnBuffer::attach] Subarray must be provided to ColumnBuffer "
            "to attach to Query");
    }

    // Dimensions are never nullable in TileDB, so a populated validity
    // buffer on a dimension column indicates a caller error.
    if (!validity_.empty() && is_dim) {
        throw TileDBSOMAError(fmt::format(
            "[ColumnBuffer::attach] Validity buffer passed for dimension '{}'",
            name_));
    }

    if (use_subarray) {
        attach_subarray(*subarray);
    } else {
        attach_buffer(query);
    }
}

Expand Down
14 changes: 5 additions & 9 deletions libtiledbsoma/src/soma/column_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class ColumnBuffer {
uint64_t num_elems,
const void* data,
T* offsets,
uint8_t* validity = nullptr) {
const std::optional<std::vector<uint8_t>>& validity = std::nullopt) {
num_cells_ = num_elems;

// Ensure the offset type is either uint32_t* or uint64_t*
Expand All @@ -146,14 +146,10 @@ class ColumnBuffer {
(std::byte*)data, (std::byte*)data + num_elems * type_size_);
}

if (is_nullable_) {
if (validity != nullptr) {
for (uint64_t i = 0; i < num_elems; ++i) {
uint8_t byte = validity[i / 8];
uint8_t bit = (byte >> (i % 8)) & 0x01;
validity_.push_back(bit);
}
} else {
if (validity.has_value()) {
validity_ = *validity;
} else {
if (is_nullable_) {
validity_.assign(num_elems, 1); // Default all to valid (1)
}
}
Expand Down
54 changes: 32 additions & 22 deletions libtiledbsoma/src/soma/managed_query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include "utils/common.h"
#include "utils/logger.h"
#include "utils/util.h"

namespace tiledbsoma {

using namespace tiledb;
Expand Down Expand Up @@ -805,7 +806,7 @@ void ManagedQuery::_cast_dictionary_values(
array->length,
(const void*)index_to_value.data(),
(uint64_t*)nullptr,
(uint8_t*)value_array->buffers[0]);
std::nullopt); // validities are set by index column
}

template <>
Expand Down Expand Up @@ -862,7 +863,7 @@ void ManagedQuery::_cast_dictionary_values<std::string>(
value_offsets.size() - 1,
(const void*)index_to_value.data(),
(uint64_t*)value_offsets.data(),
(uint8_t*)value_array->buffers[0]);
std::nullopt); // validities are set by index column
}

template <>
Expand All @@ -875,8 +876,7 @@ void ManagedQuery::_cast_dictionary_values<bool>(
auto value_array = array->dictionary;

std::vector<int64_t> indexes = _get_index_vector(schema, array);
std::vector<uint8_t> values = util::cast_bit_to_uint8(
value_schema, value_array);
std::vector<uint8_t> values = _cast_bool_data(value_schema, value_array);
std::vector<uint8_t> index_to_value;

for (auto i : indexes) {
Expand All @@ -888,7 +888,7 @@ void ManagedQuery::_cast_dictionary_values<bool>(
array->length,
(const void*)index_to_value.data(),
(uint64_t*)nullptr,
(uint8_t*)value_array->buffers[0]);
std::nullopt); // validities are set by index column
}

template <typename UserType>
Expand Down Expand Up @@ -981,13 +981,9 @@ bool ManagedQuery::_cast_column_aux<std::string>(
array->n_buffers));
}

const char* data = (const char*)array->buffers[2];
uint8_t* validity = (uint8_t*)array->buffers[0];
const void* data = array->buffers[2];
std::optional<std::vector<uint8_t>> validity = _cast_validity_buffer(array);

// If this is a table-slice, slice into the validity buffer.
if (validity != nullptr) {
validity += array->offset;
}
// If this is a table-slice, do *not* slice into the data
// buffer since it is indexed via offsets, which we slice
// into below.
Expand All @@ -996,14 +992,12 @@ bool ManagedQuery::_cast_column_aux<std::string>(
(strcmp(schema->format, "Z") == 0)) {
// If this is a table-slice, slice into the offsets buffer.
uint64_t* offset = (uint64_t*)array->buffers[1] + array->offset;
setup_write_column(
schema->name, array->length, (const void*)data, offset, validity);
setup_write_column(schema->name, array->length, data, offset, validity);

} else {
// If this is a table-slice, slice into the offsets buffer.
uint32_t* offset = (uint32_t*)array->buffers[1] + array->offset;
setup_write_column(
schema->name, array->length, (const void*)data, offset, validity);
setup_write_column(schema->name, array->length, data, offset, validity);
}
return false;
}
Expand All @@ -1013,18 +1007,14 @@ bool ManagedQuery::_cast_column_aux<bool>(
ArrowSchema* schema, ArrowArray* array, ArraySchemaEvolution se) {
(void)se; // se is unused in bool specialization

auto casted = util::cast_bit_to_uint8(schema, array);
uint8_t* validity = (uint8_t*)array->buffers[0];
if (validity != nullptr) {
validity += array->offset;
}
auto casted = _cast_bool_data(schema, array);

setup_write_column(
schema->name,
array->length,
(const void*)casted.data(),
(uint64_t*)nullptr,
(uint8_t*)validity);
_cast_validity_buffer(array));
return false;
}

Expand Down Expand Up @@ -1101,7 +1091,7 @@ bool ManagedQuery::_extend_and_evolve_schema(
if (strcmp(value_schema->format, "b") == 0) {
// Specially handle Boolean types as their representation in Arrow (bit)
// is different from what is in TileDB (uint8_t)
auto casted = util::cast_bit_to_uint8(value_schema, value_array);
auto casted = _cast_bool_data(value_schema, value_array);
enums_in_write.assign(casted.data(), casted.data() + num_elems);
} else {
// General case
Expand Down Expand Up @@ -1256,4 +1246,24 @@ bool ManagedQuery::_extend_and_evolve_schema<std::string>(
}
return false;
}

std::vector<uint8_t> ManagedQuery::_cast_bool_data(
    ArrowSchema* schema, ArrowArray* array) {
    // Arrow stores booleans as a packed bitmap (format "b"); TileDB expects
    // one uint8_t per cell, so expand the bitmap before writing.
    if (strcmp(schema->format, "b") != 0) {
        // Fixed: the message previously referenced the helper's old name
        // (_cast_bit_to_uint8); keep it in sync with the current name.
        throw TileDBSOMAError(fmt::format(
            "[ManagedQuery::_cast_bool_data] expected column format to be "
            "'b' but saw {}",
            schema->format));
    }

    const uint8_t* data = reinterpret_cast<const uint8_t*>(array->buffers[1]);
    // bitmap_to_uint8 returns an optional; guard the dereference so a null
    // data buffer produces a clear error instead of undefined behavior.
    auto expanded = util::bitmap_to_uint8(data, array->length, array->offset);
    if (!expanded.has_value()) {
        throw TileDBSOMAError(
            "[ManagedQuery::_cast_bool_data] boolean data buffer is null");
    }
    return *expanded;
}

std::optional<std::vector<uint8_t>> ManagedQuery::_cast_validity_buffer(
    ArrowArray* array) {
    // buffers[0] is the Arrow validity bitmap; expand it to one byte per
    // cell, honoring the array's slice offset. When the bitmap pointer is
    // null (all cells valid) the helper yields std::nullopt.
    auto* bitmap = static_cast<const uint8_t*>(array->buffers[0]);
    return util::bitmap_to_uint8(bitmap, array->length, array->offset);
}
}; // namespace tiledbsoma
Loading

0 comments on commit 100a660

Please sign in to comment.