TileDB-Inc · Shelnutt2 · Sep 1, 2022 · Aug 27, 2022
diff --git a/tiledb/sm/compressors/test/unit_dict_compressor.cc b/tiledb/sm/compressors/test/unit_dict_compressor.cc
@@ -272,4 +272,103 @@ TEMPLATE_LIST_TEST_CASE(
       tiledb::sm::DictEncoding::deserialize_dictionary<T>(serialized_dict);
   // Check results
   CHECK(dict == dictionary_ref);
+}
+
+
+TEST_CASE(
+    "Compression-Dictionary: Test compression of empty strings",
+    "[compression][dict]") {
+  std::vector<std::string> uncompressed_v = {""};
+  std::vector<std::string_view> uncompressed;
+  // Iterate by const reference: a by-value copy would be a temporary string
+  // and the string_view stored here would dangle once that copy is destroyed
+  for (const std::string& str : uncompressed_v) {
+    uncompressed.emplace_back(str);
+  }
+
+  // Allocate the compressed buffer with one byte per input string
+  std::vector<std::byte> compressed(uncompressed.size());
+  auto dict =
+      tiledb::sm::DictEncoding::compress<uint8_t>(uncompressed, compressed);
+  CHECK(dict == uncompressed);
+
+  std::vector<uint8_t> exp_compressed{0};
+  CHECK(
+      memcmp(
+          exp_compressed.data(),
+          reinterpret_cast<uint8_t*>(compressed.data()),
+          exp_compressed.size()) == 0);
+
+  // Decompress the previously compressed array; the expected output is empty
+  const char* exp_decompressed =
+      "";
+
+  // Allocate at least 1 byte so that the output buffer is not empty
+  std::vector<std::byte> decompressed(1);
+  const auto num_strings = 1;
+  std::vector<uint64_t> decompressed_offsets(num_strings);
+  tiledb::sm::DictEncoding::decompress<uint8_t>(
+      compressed, uncompressed_v, decompressed, decompressed_offsets);
+
+  // The decompressed array holds only chars, so compare using memcmp
+  CHECK(
+      memcmp(
+          exp_decompressed,
+          reinterpret_cast<const char*>(decompressed.data()),
+          decompressed.size()) == 0);
+
+  std::vector<uint64_t> expected_offsets{0};
+  for (uint32_t i = 0; i < expected_offsets.size(); i++) {
+    CHECK(expected_offsets[i] == decompressed_offsets[i]);
+  }
+}
+
+
+TEST_CASE(
+    "Compression-Dictionary: Test compression of mixed empty strings",
+    "[compression][dict]") {
+  std::vector<std::string> uncompressed_v = {"", "a"};
+  std::vector<std::string_view> uncompressed;
+  // Iterate by const reference: a by-value copy would be a temporary string
+  // and the string_view stored here would dangle once that copy is destroyed
+  for (const std::string& str : uncompressed_v) {
+    uncompressed.emplace_back(str);
+  }
+
+  // Allocate the compressed buffer with one byte per input string
+  std::vector<std::byte> compressed(uncompressed.size());
+  auto dict =
+      tiledb::sm::DictEncoding::compress<uint8_t>(uncompressed, compressed);
+  CHECK(dict == uncompressed);
+
+  std::vector<uint8_t> exp_compressed{0, 1};
+  CHECK(
+      memcmp(
+          exp_compressed.data(),
+          reinterpret_cast<uint8_t*>(compressed.data()),
+          exp_compressed.size()) == 0);
+
+  // Decompress the previously compressed array; expected chars are just "a"
+  const char* exp_decompressed =
+      ""
+      "a";
+
+  const auto exp_decomp_size = strlen(exp_decompressed);
+  std::vector<std::byte> decompressed(exp_decomp_size);
+  const auto num_strings = 2;
+  std::vector<uint64_t> decompressed_offsets(num_strings);
+  tiledb::sm::DictEncoding::decompress<uint8_t>(
+      compressed, uncompressed_v, decompressed, decompressed_offsets);
+
+  // The decompressed array holds only chars, so compare using memcmp
+  CHECK(
+      memcmp(
+          exp_decompressed,
+          reinterpret_cast<const char*>(decompressed.data()),
+          decompressed.size()) == 0);
+
+  std::vector<uint64_t> expected_offsets{0, 0};
+  for (uint32_t i = 0; i < expected_offsets.size(); i++) {
+    CHECK(expected_offsets[i] == decompressed_offsets[i]);
+  }
 }
diff --git a/tiledb/sm/filter/compression_filter.cc b/tiledb/sm/filter/compression_filter.cc
@@ -458,10 +458,6 @@ CompressionFilter::create_input_view(
 }
 
 uint8_t CompressionFilter::compute_bytesize(uint64_t param_length) {
-  if (param_length == 0) {
-    throw std::logic_error("Cannot compute bytesize for zero length");
-  }
-
   if (param_length <= std::numeric_limits<uint8_t>::max()) {
     return 1;
   } else if (param_length <= std::numeric_limits<uint16_t>::max()) {