Skip to content

Commit

Permalink
Fix support for empty strings for Dictionary and RLE encodings (#3938)
Browse files Browse the repository at this point in the history
This is a problem found in VCF where if using a string dimension compressed with a dictionary filter, we get an error if the value of the string dimension is always an empty string ("") :

"terminate called after throwing an instance of 'tiledb::TileDBError'
  what():  [TileDB::Task] Error: Caught std::exception: Failed decompressing dictionary-encoded strings; empty input arguments."

The failure would not occur when reading an array that was written using a string dimension data buffer that contained N entries with value 0, but only when writing using an empty string dimension data buffer .

I also added support for encoding empty strings with RLE by removing a check (this was previously done for Dictionary in https://github.com/TileDB-Inc/TileDB/pull/3493/files .

Co-authored-by: Paul J. Davis <[email protected]>
  • Loading branch information
ypatia and davisp authored Mar 8, 2023
1 parent f567c01 commit 1307810
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 15 deletions.
2 changes: 1 addition & 1 deletion test/src/unit-compression-rle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ TEST_CASE(
TEST_CASE(
"Compression-RLE: Test bytesize computation",
"[compression][rle][rle-strings]") {
REQUIRE_THROWS_AS(RLE::compute_bytesize(0), std::logic_error);
CHECK(RLE::compute_bytesize(0) == 1);
CHECK(RLE::compute_bytesize(1) == 1);
CHECK(RLE::compute_bytesize(0xff) == 1);
CHECK(RLE::compute_bytesize(0x100) == 2);
Expand Down
101 changes: 101 additions & 0 deletions test/src/unit-cppapi-filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -731,3 +731,104 @@ TEST_CASE(
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}

TEST_CASE(
"C++ API: Filter buffer with some empty strings with RLE or Dictionary "
"encoding",
"[cppapi][filter][rle-strings][dict-strings][empty-strings]") {
using namespace tiledb;
Context ctx;
VFS vfs(ctx);
std::string array_name = "cpp_unit_array";

if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);

auto f = GENERATE(TILEDB_FILTER_RLE, TILEDB_FILTER_DICTIONARY);

// Create array with string dimension and one attribute
ArraySchema schema(ctx, TILEDB_SPARSE);

FilterList filters(ctx);
filters.add_filter({ctx, f});

auto d0 = Dimension::create(ctx, "d0", TILEDB_STRING_ASCII, nullptr, nullptr);
d0.set_filter_list(filters);

Domain domain(ctx);
domain.add_dimensions(d0);
schema.set_domain(domain);

auto a0 = Attribute::create<int32_t>(ctx, "a0");
schema.add_attributes(a0);
schema.set_allows_dups(true);

Array::create(array_name, schema);

std::vector<char> d0_buf;
std::vector<uint64_t> d0_offsets_buf;
std::vector<int> a0_buf;

SECTION("Only empty strings in data buffer") {
// Write this data buffer to the array:
// ["\0","\0","\0","\0","\0","\0","\0","\0","\0","\0"]
// As data we use the d0 array empty, as already defined.
d0_offsets_buf.assign(10, 0);
a0_buf.assign(10, 42);
}

SECTION("Combine empty and strings of nulls in data buffer") {
// Write this data buffer to the array:
// ["\0","\0","\0","\0","\0","\0","\0","\0","\0","\0\0\0\0\0\0\0\0\0\0"]
d0_buf.assign(10, 0);
d0_offsets_buf.assign(10, 0);
a0_buf.assign(10, 42);
}

SECTION("Combine empty and non-empty strings in data buffer") {
// Write this data buffer to the array of strings: ["a", "bb", "", "c", ""]
d0_buf.assign({'a', 'b', 'b', 'c'});
d0_offsets_buf.assign({0, 1, 3, 3, 4});
a0_buf.assign({42, 42, 42, 42, 42});
}

Array array_w(ctx, array_name, TILEDB_WRITE);
Query query_w(ctx, array_w);
query_w.set_layout(TILEDB_UNORDERED)
.set_data_buffer("d0", d0_buf)
.set_offsets_buffer("d0", d0_offsets_buf)
.set_data_buffer("a0", a0_buf);
query_w.submit();
array_w.close();

// Read all data and check no error and data correct
std::vector<std::byte> d0_read_buf(1 << 20);
std::vector<uint64_t> d0_offsets_read_buf(1 << 20);
std::vector<int32_t> a0_read_buf(1 << 20);

Array array_r(ctx, array_name, TILEDB_READ);
Query query_r(ctx, array_r);
query_r.set_layout(TILEDB_UNORDERED);
query_r.set_data_buffer("d0", d0_read_buf)
.set_offsets_buffer("d0", d0_offsets_read_buf)
.set_data_buffer("a0", a0_read_buf);

auto st = query_r.submit();
REQUIRE(st == Query::Status::COMPLETE);

auto results = query_r.result_buffer_elements();
auto num_offsets = results["d0"].first;
CHECK(num_offsets == d0_offsets_buf.size());
auto str_len = results["d0"].second;
CHECK(str_len == d0_buf.size());

for (uint64_t i = 0; i < num_offsets; i++) {
CHECK(a0_read_buf[i] == 42);
}

array_r.close();

// Clean up
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}
2 changes: 1 addition & 1 deletion tiledb/sm/compressors/dict_compressor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void DictEncoding::decompress(
const uint8_t word_id_size,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || word_id_size == 0) {
if (input.empty() || word_id_size == 0) {
throw std::logic_error(
"Failed decompressing dictionary-encoded strings; empty input "
"arguments.");
Expand Down
14 changes: 10 additions & 4 deletions tiledb/sm/compressors/dict_compressor.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ class DictEncoding {
const span<const std::string> dict,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || dict.size() == 0) {
if (input.empty() || dict.size() == 0) {
throw std::logic_error(
"Empty arguments when decompressing dictionary encoded strings.");
}
Expand All @@ -196,7 +196,9 @@ class DictEncoding {
in_index += sizeof(T);
assert(word_id < dict.size());
const auto& word = dict[word_id];
memcpy(&output[out_index], word.data(), word.size());
if (word.size() > 0) {
memcpy(&output[out_index], word.data(), word.size());
}
output_offsets[offset_index++] = out_index;
out_index += word.size();
}
Expand Down Expand Up @@ -246,8 +248,12 @@ class DictEncoding {
// increment past the size element to the per-word data block
in_index += sizeof(T);
// construct string in place
dict.emplace_back(
reinterpret_cast<const char*>(&serialized_dict[in_index]), str_len);
if (str_len > 0) {
dict.emplace_back(
reinterpret_cast<const char*>(&serialized_dict[in_index]), str_len);
} else {
dict.emplace_back();
}
in_index += str_len;
}

Expand Down
8 changes: 1 addition & 7 deletions tiledb/sm/compressors/rle_compressor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,6 @@ uint64_t RLE::overhead(uint64_t nbytes, uint64_t value_size) {
}

uint8_t RLE::compute_bytesize(uint64_t param_length) {
if (param_length == 0) {
throw std::logic_error(
"Cannot compute RLE parameter bytesize for zero length");
}

if (param_length <= std::numeric_limits<uint8_t>::max()) {
return 1;
} else if (param_length <= std::numeric_limits<uint16_t>::max()) {
Expand Down Expand Up @@ -260,8 +255,7 @@ Status RLE::decompress(
const uint8_t string_len_size,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || rle_len_size == 0 ||
string_len_size == 0) {
if (input.empty() || rle_len_size == 0 || string_len_size == 0) {
return LOG_STATUS(Status_CompressionError(
"Failed decompressing strings with RLE; empty input arguments"));
}
Expand Down
6 changes: 4 additions & 2 deletions tiledb/sm/compressors/rle_compressor.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ class RLE {
const span<const std::byte> input,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || output_offsets.empty())
if (input.empty() || output_offsets.empty())
return;

const uint64_t run_size = sizeof(T);
Expand All @@ -168,7 +168,9 @@ class RLE {
string_length = utils::endianness::decode_be<P>(&input[in_index]);
in_index += str_len_size;
for (uint64_t j = 0; j < run_length; j++) {
memcpy(&output[out_offset], &input[in_index], string_length);
if (string_length > 0) {
memcpy(&output[out_offset], &input[in_index], string_length);
}
output_offsets[offset_index++] = out_offset;
out_offset += string_length;
}
Expand Down

0 comments on commit 1307810

Please sign in to comment.