From c80b8dc4542622560578b2df4771078e1a36715c Mon Sep 17 00:00:00 2001 From: Ypatia Tsavliri Date: Thu, 2 Mar 2023 17:14:03 +0200 Subject: [PATCH] Fix checks and dict for empty strings --- test/src/unit-cppapi-filter.cc | 83 ++++++++++++++++++++++++ tiledb/sm/compressors/dict_compressor.cc | 2 +- tiledb/sm/compressors/dict_compressor.h | 15 +++-- tiledb/sm/compressors/rle_compressor.cc | 8 +-- 4 files changed, 96 insertions(+), 12 deletions(-) diff --git a/test/src/unit-cppapi-filter.cc b/test/src/unit-cppapi-filter.cc index 4cb5d4a0bdab..e1b9469472bf 100644 --- a/test/src/unit-cppapi-filter.cc +++ b/test/src/unit-cppapi-filter.cc @@ -731,3 +731,86 @@ TEST_CASE( if (vfs.is_dir(array_name)) vfs.remove_dir(array_name); } + +TEST_CASE( + "C++ API: Filter empty strings with RLE or Dictionary encoding", + "[cppapi][filter][rle-strings][dict-strings][empty-strings]") { + using namespace tiledb; + Context ctx; + VFS vfs(ctx); + std::string array_name = "cpp_unit_array"; + + if (vfs.is_dir(array_name)) + vfs.remove_dir(array_name); + + auto f = GENERATE(TILEDB_FILTER_RLE, TILEDB_FILTER_DICTIONARY); + + // Create array with string dimension and one attribute + ArraySchema schema(ctx, TILEDB_SPARSE); + + FilterList filters(ctx); + filters.add_filter({ctx, f}); + + auto d0 = Dimension::create(ctx, "d0", TILEDB_STRING_ASCII, nullptr, nullptr); + d0.set_filter_list(filters); + + Domain domain(ctx); + domain.add_dimensions(d0); + schema.set_domain(domain); + + auto a0 = Attribute::create(ctx, "a0"); + schema.add_attributes(a0); + schema.set_allows_dups(true); + + Array::create(array_name, schema); + + // Write empty strings to the array in 2 different ways + int elements = 10; + auto full_buffer_of_empty_strings = GENERATE(true, false); + std::vector d0_buf(0); + if (full_buffer_of_empty_strings) { + d0_buf.assign(elements, 0); + } + + std::vector d0_offsets_buf(elements, 0); + std::vector a0_buf(elements, 42); + + Array array_w(ctx, array_name, TILEDB_WRITE); + Query query_w(ctx, array_w); + query_w.set_layout(TILEDB_UNORDERED) + .set_data_buffer("d0", d0_buf) + .set_offsets_buffer("d0", d0_offsets_buf) + .set_data_buffer("a0", a0_buf); + query_w.submit(); + array_w.close(); + + // Read all data and check no error and data correct + std::vector d0_read_buf(1 << 20); + std::vector d0_offsets_read_buf(1 << 20); + std::vector a0_read_buf(1 << 20); + + Array array_r(ctx, array_name, TILEDB_READ); + Query query_r(ctx, array_r); + query_r.set_layout(TILEDB_UNORDERED); + query_r.set_data_buffer("d0", d0_read_buf) + .set_offsets_buffer("d0", d0_offsets_read_buf) + .set_data_buffer("a0", a0_read_buf); + + auto st = query_r.submit(); + REQUIRE(st == Query::Status::COMPLETE); + + auto results = query_r.result_buffer_elements(); + auto num_offsets = results["d0"].first; + CHECK(num_offsets == 10); + // auto num_elements = results["d0"].second; + + for (uint64_t i = 0; i < num_offsets; i++) { + CHECK(a0_read_buf[i] == 42); + } + + array_r.close(); + + // Clean up + if (vfs.is_dir(array_name)) + vfs.remove_dir(array_name); +} diff --git a/tiledb/sm/compressors/dict_compressor.cc b/tiledb/sm/compressors/dict_compressor.cc index 623be2b8320f..df5774dcb51a 100644 --- a/tiledb/sm/compressors/dict_compressor.cc +++ b/tiledb/sm/compressors/dict_compressor.cc @@ -68,7 +68,7 @@ void DictEncoding::decompress( const uint8_t word_id_size, span output, span output_offsets) { - if (input.empty() || output.empty() || word_id_size == 0) { + if (input.empty() || word_id_size == 0) { throw std::logic_error( "Failed decompressing dictionary-encoded strings; empty input " "arguments."); diff --git a/tiledb/sm/compressors/dict_compressor.h b/tiledb/sm/compressors/dict_compressor.h index d265fb56ff39..9bcb652c77bc 100644 --- a/tiledb/sm/compressors/dict_compressor.h +++ b/tiledb/sm/compressors/dict_compressor.h @@ -183,11 +183,15 @@ class DictEncoding { const span dict, span output, span output_offsets) { - if (input.empty() || output.empty() || dict.size() == 0) { + if (input.empty()) { throw std::logic_error( "Empty arguments when decompressing dictionary encoded strings."); } + if (dict.empty() || output.size() == 0) { + return; + } + T word_id = 0; size_t in_index = 0, out_index = 0, offset_index = 0; @@ -214,11 +218,14 @@ class DictEncoding { std::vector serialized_dict(dict_size); size_t out_index = 0; for (const auto& dict_entry : dict) { + // extra care for empty strings + auto entry_size = dict_entry.empty() ? 1 : dict_entry.size(); + auto entry_data = dict_entry.empty() ? "" : dict_entry.data(); utils::endianness::encode_be( - static_cast(dict_entry.size()), &serialized_dict[out_index]); + static_cast(entry_size), &serialized_dict[out_index]); out_index += sizeof(T); - memcpy(&serialized_dict[out_index], dict_entry.data(), dict_entry.size()); - out_index += dict_entry.size(); + memcpy(&serialized_dict[out_index], entry_data, entry_size); + out_index += entry_size; } serialized_dict.resize(out_index); diff --git a/tiledb/sm/compressors/rle_compressor.cc b/tiledb/sm/compressors/rle_compressor.cc index d69e141d636c..549a4f0c82f1 100644 --- a/tiledb/sm/compressors/rle_compressor.cc +++ b/tiledb/sm/compressors/rle_compressor.cc @@ -146,11 +146,6 @@ uint64_t RLE::overhead(uint64_t nbytes, uint64_t value_size) { } uint8_t RLE::compute_bytesize(uint64_t param_length) { - if (param_length == 0) { - throw std::logic_error( - "Cannot compute RLE parameter bytesize for zero length"); - } - if (param_length <= std::numeric_limits::max()) { return 1; } else if (param_length <= std::numeric_limits::max()) { @@ -260,8 +255,7 @@ Status RLE::decompress( const uint8_t string_len_size, span output, span output_offsets) { - if (input.empty() || output.empty() || rle_len_size == 0 || - string_len_size == 0) { + if (input.empty() || rle_len_size == 0 || string_len_size == 0) { return LOG_STATUS(Status_CompressionError( "Failed decompressing strings with RLE; empty input arguments")); }