diff --git a/tiledb/sm/compressors/test/unit_dict_compressor.cc b/tiledb/sm/compressors/test/unit_dict_compressor.cc index 153e84e75a0..59b3da7828e 100644 --- a/tiledb/sm/compressors/test/unit_dict_compressor.cc +++ b/tiledb/sm/compressors/test/unit_dict_compressor.cc @@ -272,4 +272,103 @@ TEMPLATE_LIST_TEST_CASE( tiledb::sm::DictEncoding::deserialize_dictionary(serialized_dict); // Check results CHECK(dict == dictionary_ref); +} + + +TEST_CASE( + "Compression-Dictionary: Test compression of empty strings", + "[compression][dict]") { + std::vector uncompressed_v = {""}; + std::vector uncompressed; + // the reference here is crucial, otherwise a temp string is created and + // therefore string_view will outlive it + for (const std::string& str : uncompressed_v) { + uncompressed.emplace_back(str); + } + + // Allocate the compressed array - we know the size will be equal to input + std::vector compressed(uncompressed.size()); + auto dict = + tiledb::sm::DictEncoding::compress(uncompressed, compressed); + CHECK(dict == uncompressed); + + std::vector exp_compressed{0}; + CHECK( + memcmp( + exp_compressed.data(), + reinterpret_cast(compressed.data()), + exp_compressed.size()) == 0); + + // Decompress the previously compressed array + const char* exp_decompressed = + ""; + + // In this test we allocate at least 1 byte to avoid an empty input buffer + std::vector decompressed(1); + const auto num_strings = 1; + std::vector decompressed_offsets(num_strings); + tiledb::sm::DictEncoding::decompress( + compressed, uncompressed_v, decompressed, decompressed_offsets); + + // In decompressed array there are only chars, so compare using memcmp + CHECK( + memcmp( + exp_decompressed, + reinterpret_cast(decompressed.data()), + decompressed.size()) == 0); + + std::vector expected_offsets{0}; + for (uint32_t i = 0; i < expected_offsets.size(); i++) { + CHECK(expected_offsets[i] == decompressed_offsets[i]); + } +} + + +TEST_CASE( + "Compression-Dictionary: Test compression of mixed empty 
strings", + "[compression][dict]") { + std::vector uncompressed_v = {"", "a"}; + std::vector uncompressed; + // the reference here is crucial, otherwise a temp string is created and + // therefore string_view will outlive it + for (const std::string& str : uncompressed_v) { + uncompressed.emplace_back(str); + } + + // Allocate the compressed array - we know the size will be equal to input + std::vector compressed(uncompressed.size()); + auto dict = + tiledb::sm::DictEncoding::compress(uncompressed, compressed); + CHECK(dict == uncompressed); + + std::vector exp_compressed{0, 1}; + CHECK( + memcmp( + exp_compressed.data(), + reinterpret_cast(compressed.data()), + exp_compressed.size()) == 0); + + // Decompress the previously compressed array + const char* exp_decompressed = + "" + "a"; + + const auto exp_decomp_size = strlen(exp_decompressed); + std::vector decompressed(exp_decomp_size); + const auto num_strings = 2; + std::vector decompressed_offsets(num_strings); + tiledb::sm::DictEncoding::decompress( + compressed, uncompressed_v, decompressed, decompressed_offsets); + + // In decompressed array there are only chars, so compare using memcmp + CHECK( + memcmp( + exp_decompressed, + reinterpret_cast(decompressed.data()), + decompressed.size()) == 0); + + std::vector expected_offsets{0, 0}; + for (uint32_t i = 0; i < expected_offsets.size(); i++) { + CHECK(expected_offsets[i] == decompressed_offsets[i]); + } } \ No newline at end of file diff --git a/tiledb/sm/filter/compression_filter.cc b/tiledb/sm/filter/compression_filter.cc index f1418ddbbd3..2c5f4df7027 100644 --- a/tiledb/sm/filter/compression_filter.cc +++ b/tiledb/sm/filter/compression_filter.cc @@ -458,10 +458,6 @@ CompressionFilter::create_input_view( } uint8_t CompressionFilter::compute_bytesize(uint64_t param_length) { - if (param_length == 0) { - throw std::logic_error("Cannot compute bytesize for zero length"); - } - if (param_length <= std::numeric_limits::max()) { return 1; } else if (param_length 
<= std::numeric_limits::max()) {