From 2a29deb3dbcd195342008697df49ee30a5a9e507 Mon Sep 17 00:00:00 2001 From: Seth Shelnutt Date: Sat, 27 Aug 2022 12:47:07 -0400 Subject: [PATCH] Dictionary encoding should handle zero length strings Zero length strings are valid and supported by TileDB. The zero length encoding had some safety checks that got tripped up. This adjusts the checks and adds a unit test to ensure support going forward. --- .../compressors/test/unit_dict_compressor.cc | 99 +++++++++++++++++++ tiledb/sm/filter/compression_filter.cc | 4 - 2 files changed, 99 insertions(+), 4 deletions(-) diff --git a/tiledb/sm/compressors/test/unit_dict_compressor.cc b/tiledb/sm/compressors/test/unit_dict_compressor.cc index 153e84e75a0..59b3da7828e 100644 --- a/tiledb/sm/compressors/test/unit_dict_compressor.cc +++ b/tiledb/sm/compressors/test/unit_dict_compressor.cc @@ -272,4 +272,103 @@ TEMPLATE_LIST_TEST_CASE( tiledb::sm::DictEncoding::deserialize_dictionary(serialized_dict); // Check results CHECK(dict == dictionary_ref); +} + + +TEST_CASE( + "Compression-Dictionary: Test compression of empty strings", + "[compression][dict]") { + std::vector uncompressed_v = {""}; + std::vector uncompressed; + // the reference here is crucial, otherwise a temp string is created and + // therefore string_view will outlive it + for (const std::string& str : uncompressed_v) { + uncompressed.emplace_back(str); + } + + // Allocate the compressed array - we know the size will be equal to input + std::vector compressed(uncompressed.size()); + auto dict = + tiledb::sm::DictEncoding::compress(uncompressed, compressed); + CHECK(dict == uncompressed); + + std::vector exp_compressed{0}; + CHECK( + memcmp( + exp_compressed.data(), + reinterpret_cast(compressed.data()), + exp_compressed.size()) == 0); + + // Decompress the previously compressed array + const char* exp_decompressed = + ""; + + // In this test we allocate atleast 1 byte to avoid empty input buffer + std::vector decompressed(1); + const auto num_strings = 1; + 
std::vector decompressed_offsets(num_strings); + tiledb::sm::DictEncoding::decompress( + compressed, uncompressed_v, decompressed, decompressed_offsets); + + // In decompressed array there are only chars, so compare using memcpy + CHECK( + memcmp( + exp_decompressed, + reinterpret_cast(decompressed.data()), + decompressed.size()) == 0); + + std::vector expected_offsets{0}; + for (uint32_t i = 0; i < expected_offsets.size(); i++) { + CHECK(expected_offsets[i] == decompressed_offsets[i]); + } +} + + +TEST_CASE( + "Compression-Dictionary: Test compression of mixed empty strings", + "[compression][dict]") { + std::vector uncompressed_v = {"", "a"}; + std::vector uncompressed; + // the reference here is crucial, otherwise a temp string is created and + // therefore string_view will outlive it + for (const std::string& str : uncompressed_v) { + uncompressed.emplace_back(str); + } + + // Allocate the compressed array - we know the size will be equal to input + std::vector compressed(uncompressed.size()); + auto dict = + tiledb::sm::DictEncoding::compress(uncompressed, compressed); + CHECK(dict == uncompressed); + + std::vector exp_compressed{0, 1}; + CHECK( + memcmp( + exp_compressed.data(), + reinterpret_cast(compressed.data()), + exp_compressed.size()) == 0); + + // Decompress the previously compressed array + const char* exp_decompressed = + "" + "a"; + + const auto exp_decomp_size = strlen(exp_decompressed); + std::vector decompressed(exp_decomp_size); + const auto num_strings = 2; + std::vector decompressed_offsets(num_strings); + tiledb::sm::DictEncoding::decompress( + compressed, uncompressed_v, decompressed, decompressed_offsets); + + // In decompressed array there are only chars, so compare using memcpy + CHECK( + memcmp( + exp_decompressed, + reinterpret_cast(decompressed.data()), + decompressed.size()) == 0); + + std::vector expected_offsets{0, 0}; + for (uint32_t i = 0; i < expected_offsets.size(); i++) { + CHECK(expected_offsets[i] == 
decompressed_offsets[i]); + } } \ No newline at end of file diff --git a/tiledb/sm/filter/compression_filter.cc b/tiledb/sm/filter/compression_filter.cc index f1418ddbbd3..2c5f4df7027 100644 --- a/tiledb/sm/filter/compression_filter.cc +++ b/tiledb/sm/filter/compression_filter.cc @@ -458,10 +458,6 @@ CompressionFilter::create_input_view( } uint8_t CompressionFilter::compute_bytesize(uint64_t param_length) { - if (param_length == 0) { - throw std::logic_error("Cannot compute bytesize for zero length"); - } - if (param_length <= std::numeric_limits::max()) { return 1; } else if (param_length <= std::numeric_limits::max()) {