Skip to content

Commit

Permalink
Fix checks and dict for empty strings
Browse files Browse the repository at this point in the history
  • Loading branch information
ypatia committed Mar 2, 2023
1 parent fcceef3 commit c80b8dc
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 12 deletions.
83 changes: 83 additions & 0 deletions test/src/unit-cppapi-filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -731,3 +731,86 @@ TEST_CASE(
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}

TEST_CASE(
"C++ API: Filter empty strings with RLE or Dictionary encoding",
"[cppapi][filter][rle-strings][dict-strings][empty-strings]") {
using namespace tiledb;
Context ctx;
VFS vfs(ctx);
std::string array_name = "cpp_unit_array";

if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);

auto f = GENERATE(TILEDB_FILTER_RLE, TILEDB_FILTER_DICTIONARY);

// Create array with string dimension and one attribute
ArraySchema schema(ctx, TILEDB_SPARSE);

FilterList filters(ctx);
filters.add_filter({ctx, f});

auto d0 = Dimension::create(ctx, "d0", TILEDB_STRING_ASCII, nullptr, nullptr);
d0.set_filter_list(filters);

Domain domain(ctx);
domain.add_dimensions(d0);
schema.set_domain(domain);

auto a0 = Attribute::create<int32_t>(ctx, "a0");
schema.add_attributes(a0);
schema.set_allows_dups(true);

Array::create(array_name, schema);

// Write empty strings to the array in 2 different ways
int elements = 10;
auto full_buffer_of_empty_strings = GENERATE(true, false);
std::vector<char> d0_buf(0);
if (full_buffer_of_empty_strings) {
d0_buf.assign(elements, 0);
}

std::vector<uint64_t> d0_offsets_buf(elements, 0);
std::vector<int> a0_buf(elements, 42);

Array array_w(ctx, array_name, TILEDB_WRITE);
Query query_w(ctx, array_w);
query_w.set_layout(TILEDB_UNORDERED)
.set_data_buffer("d0", d0_buf)
.set_offsets_buffer("d0", d0_offsets_buf)
.set_data_buffer("a0", a0_buf);
query_w.submit();
array_w.close();

// Read all data and check no error and data correct
std::vector<std::byte> d0_read_buf(1 << 20);
std::vector<uint64_t> d0_offsets_read_buf(1 << 20);
std::vector<int32_t> a0_read_buf(1 << 20);

Array array_r(ctx, array_name, TILEDB_READ);
Query query_r(ctx, array_r);
query_r.set_layout(TILEDB_UNORDERED);
query_r.set_data_buffer("d0", d0_read_buf)
.set_offsets_buffer("d0", d0_offsets_read_buf)
.set_data_buffer("a0", a0_read_buf);

auto st = query_r.submit();
REQUIRE(st == Query::Status::COMPLETE);

auto results = query_r.result_buffer_elements();
auto num_offsets = results["d0"].first;
CHECK(num_offsets == 10);
// auto num_elements = results["d0"].second;

for (uint64_t i = 0; i < num_offsets; i++) {
CHECK(a0_read_buf[i] == 42);
}

array_r.close();

// Clean up
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}
2 changes: 1 addition & 1 deletion tiledb/sm/compressors/dict_compressor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void DictEncoding::decompress(
const uint8_t word_id_size,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || word_id_size == 0) {
if (input.empty() || word_id_size == 0) {
throw std::logic_error(
"Failed decompressing dictionary-encoded strings; empty input "
"arguments.");
Expand Down
15 changes: 11 additions & 4 deletions tiledb/sm/compressors/dict_compressor.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,15 @@ class DictEncoding {
const span<const std::string> dict,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || dict.size() == 0) {
if (input.empty()) {
throw std::logic_error(
"Empty arguments when decompressing dictionary encoded strings.");
}

if (dict.empty() || output.size() == 0) {
return;
}

T word_id = 0;
size_t in_index = 0, out_index = 0, offset_index = 0;

Expand All @@ -214,11 +218,14 @@ class DictEncoding {
std::vector<std::byte> serialized_dict(dict_size);
size_t out_index = 0;
for (const auto& dict_entry : dict) {
// extra care for empty strings
auto entry_size = dict_entry.empty() ? 1 : dict_entry.size();
auto entry_data = dict_entry.empty() ? "" : dict_entry.data();
utils::endianness::encode_be<T>(
static_cast<T>(dict_entry.size()), &serialized_dict[out_index]);
static_cast<T>(entry_size), &serialized_dict[out_index]);
out_index += sizeof(T);
memcpy(&serialized_dict[out_index], dict_entry.data(), dict_entry.size());
out_index += dict_entry.size();
memcpy(&serialized_dict[out_index], entry_data, entry_size);
out_index += entry_size;
}

serialized_dict.resize(out_index);
Expand Down
8 changes: 1 addition & 7 deletions tiledb/sm/compressors/rle_compressor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,6 @@ uint64_t RLE::overhead(uint64_t nbytes, uint64_t value_size) {
}

uint8_t RLE::compute_bytesize(uint64_t param_length) {
if (param_length == 0) {
throw std::logic_error(
"Cannot compute RLE parameter bytesize for zero length");
}

if (param_length <= std::numeric_limits<uint8_t>::max()) {
return 1;
} else if (param_length <= std::numeric_limits<uint16_t>::max()) {
Expand Down Expand Up @@ -260,8 +255,7 @@ Status RLE::decompress(
const uint8_t string_len_size,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || rle_len_size == 0 ||
string_len_size == 0) {
if (input.empty() || rle_len_size == 0 || string_len_size == 0) {
return LOG_STATUS(Status_CompressionError(
"Failed decompressing strings with RLE; empty input arguments"));
}
Expand Down

0 comments on commit c80b8dc

Please sign in to comment.