Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix support for empty strings for Dictionary and RLE encodings #3938

Merged
merged 7 commits into from
Mar 8, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion test/src/unit-compression-rle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ TEST_CASE(
TEST_CASE(
"Compression-RLE: Test bytesize computation",
"[compression][rle][rle-strings]") {
REQUIRE_THROWS_AS(RLE::compute_bytesize(0), std::logic_error);
CHECK(RLE::compute_bytesize(0) == 1);
CHECK(RLE::compute_bytesize(1) == 1);
CHECK(RLE::compute_bytesize(0xff) == 1);
CHECK(RLE::compute_bytesize(0x100) == 2);
Expand Down
82 changes: 82 additions & 0 deletions test/src/unit-cppapi-filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -731,3 +731,85 @@ TEST_CASE(
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}

TEST_CASE(
"C++ API: Filter empty strings with RLE or Dictionary encoding",
"[cppapi][filter][rle-strings][dict-strings][empty-strings]") {
using namespace tiledb;
Context ctx;
VFS vfs(ctx);
std::string array_name = "cpp_unit_array";

if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);

auto f = GENERATE(TILEDB_FILTER_RLE, TILEDB_FILTER_DICTIONARY);

// Create array with string dimension and one attribute
ArraySchema schema(ctx, TILEDB_SPARSE);

FilterList filters(ctx);
filters.add_filter({ctx, f});

auto d0 = Dimension::create(ctx, "d0", TILEDB_STRING_ASCII, nullptr, nullptr);
d0.set_filter_list(filters);

Domain domain(ctx);
domain.add_dimensions(d0);
schema.set_domain(domain);

auto a0 = Attribute::create<int32_t>(ctx, "a0");
schema.add_attributes(a0);
schema.set_allows_dups(true);

Array::create(array_name, schema);

// Write empty strings to the array in 2 different ways
int elements = 10;
auto full_buffer_of_empty_strings = GENERATE(true, false);
std::vector<char> d0_buf(0);
if (full_buffer_of_empty_strings) {
d0_buf.assign(elements, 0);
}

std::vector<uint64_t> d0_offsets_buf(elements, 0);
std::vector<int> a0_buf(elements, 42);

Array array_w(ctx, array_name, TILEDB_WRITE);
Query query_w(ctx, array_w);
query_w.set_layout(TILEDB_UNORDERED)
.set_data_buffer("d0", d0_buf)
.set_offsets_buffer("d0", d0_offsets_buf)
.set_data_buffer("a0", a0_buf);
query_w.submit();
array_w.close();

// Read all data and check no error and data correct
std::vector<std::byte> d0_read_buf(1 << 20);
std::vector<uint64_t> d0_offsets_read_buf(1 << 20);
std::vector<int32_t> a0_read_buf(1 << 20);

Array array_r(ctx, array_name, TILEDB_READ);
Query query_r(ctx, array_r);
query_r.set_layout(TILEDB_UNORDERED);
query_r.set_data_buffer("d0", d0_read_buf)
.set_offsets_buffer("d0", d0_offsets_read_buf)
.set_data_buffer("a0", a0_read_buf);

auto st = query_r.submit();
REQUIRE(st == Query::Status::COMPLETE);

auto results = query_r.result_buffer_elements();
auto num_offsets = results["d0"].first;
CHECK(num_offsets == 10);

for (uint64_t i = 0; i < num_offsets; i++) {
CHECK(a0_read_buf[i] == 42);
}

array_r.close();

// Clean up
if (vfs.is_dir(array_name))
vfs.remove_dir(array_name);
}
2 changes: 1 addition & 1 deletion tiledb/sm/compressors/dict_compressor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void DictEncoding::decompress(
const uint8_t word_id_size,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || word_id_size == 0) {
if (input.empty() || word_id_size == 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having just read the code, I'm pretty sure the removal of checking output.empty() here and in dict_compressor.h:186 is the entire fix for the issue.

Copy link
Member Author

@ypatia ypatia Mar 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, there are 2 reasons those 2 checks are not enough:

  • In deserialize_dictionary, if I have an empty string as entry, I am not properly incrementing the index to the serialized_dictionary (in_index) in dict_compressor.h:261 and I am accessing out of bounds. I special-cased for str_len = 0 there and reverted the rest of the changes, can you check if you like that solution or you have a better idea on how to handle that?
  • In decompress I still need the special case for output.empty() in dict_compressor.h:193 because otherwise I get a SIGABRT because I am violating a contract of the implementation of span we are using that assumes that when writing to a span at some index idx : TCB_SPAN_EXPECT(idx < size()); , and in case of an empty output we get a failing 0 < 0 .

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My fix is still not correct for the all empty strings case because of the last special case I mentioned where I exit if output.empty(), because I don't reconstruct the offsets_buffer. If I do reconstruct them I get a heap buffer overflow though, I am still fighting it to understand why.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yeah, that span behavior is odd since we're indexing with a zero length array. How does this approach look to you?

https://github.com/TileDB-Inc/TileDB/compare/pd/ch25823/support_for_empty_strings_dict_rle

Locally it passes the new test with --enable-debug --enable-assertions on the bootstrap line. I haven't run a full test suite with it though.

throw std::logic_error(
"Failed decompressing dictionary-encoded strings; empty input "
"arguments.");
Expand Down
17 changes: 13 additions & 4 deletions tiledb/sm/compressors/dict_compressor.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,17 @@ class DictEncoding {
const span<const std::string> dict,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || dict.size() == 0) {
if (input.empty() || dict.size() == 0) {
throw std::logic_error(
"Empty arguments when decompressing dictionary encoded strings.");
}

// this can be the case if the compressed buffer was empty, eg. representing
// empty strings
if (output.size() == 0) {
return;
}

T word_id = 0;
size_t in_index = 0, out_index = 0, offset_index = 0;

Expand All @@ -214,11 +220,14 @@ class DictEncoding {
std::vector<std::byte> serialized_dict(dict_size);
size_t out_index = 0;
for (const auto& dict_entry : dict) {
// extra care for empty strings
auto entry_size = dict_entry.empty() ? 1 : dict_entry.size();
auto entry_data = dict_entry.empty() ? "" : dict_entry.data();
utils::endianness::encode_be<T>(
static_cast<T>(dict_entry.size()), &serialized_dict[out_index]);
static_cast<T>(entry_size), &serialized_dict[out_index]);
out_index += sizeof(T);
memcpy(&serialized_dict[out_index], dict_entry.data(), dict_entry.size());
out_index += dict_entry.size();
memcpy(&serialized_dict[out_index], entry_data, entry_size);
out_index += entry_size;
}

serialized_dict.resize(out_index);
Expand Down
8 changes: 1 addition & 7 deletions tiledb/sm/compressors/rle_compressor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,6 @@ uint64_t RLE::overhead(uint64_t nbytes, uint64_t value_size) {
}

uint8_t RLE::compute_bytesize(uint64_t param_length) {
if (param_length == 0) {
throw std::logic_error(
"Cannot compute RLE parameter bytesize for zero length");
}

if (param_length <= std::numeric_limits<uint8_t>::max()) {
return 1;
} else if (param_length <= std::numeric_limits<uint16_t>::max()) {
Expand Down Expand Up @@ -260,8 +255,7 @@ Status RLE::decompress(
const uint8_t string_len_size,
span<std::byte> output,
span<uint64_t> output_offsets) {
if (input.empty() || output.empty() || rle_len_size == 0 ||
string_len_size == 0) {
if (input.empty() || rle_len_size == 0 || string_len_size == 0) {
return LOG_STATUS(Status_CompressionError(
"Failed decompressing strings with RLE; empty input arguments"));
}
Expand Down