TileDB-Inc · ypatia · Mar 8, 2023 · Mar 2, 2023 · Mar 3, 2023 · Mar 6, 2023
diff --git a/test/src/unit-compression-rle.cc b/test/src/unit-compression-rle.cc
@@ -298,7 +298,7 @@ TEST_CASE(
 TEST_CASE(
     "Compression-RLE: Test bytesize computation",
     "[compression][rle][rle-strings]") {
-  REQUIRE_THROWS_AS(RLE::compute_bytesize(0), std::logic_error);
+  CHECK(RLE::compute_bytesize(0) == 1);
   CHECK(RLE::compute_bytesize(1) == 1);
   CHECK(RLE::compute_bytesize(0xff) == 1);
   CHECK(RLE::compute_bytesize(0x100) == 2);

diff --git a/test/src/unit-cppapi-filter.cc b/test/src/unit-cppapi-filter.cc
@@ -731,3 +731,85 @@ TEST_CASE(
   if (vfs.is_dir(array_name))
     vfs.remove_dir(array_name);
 }
+
+TEST_CASE(
+    "C++ API: Filter empty strings with RLE or Dictionary encoding",
+    "[cppapi][filter][rle-strings][dict-strings][empty-strings]") {
+  using namespace tiledb;
+  Context ctx;
+  VFS vfs(ctx);
+  std::string array_name = "cpp_unit_array";
+
+  if (vfs.is_dir(array_name))
+    vfs.remove_dir(array_name);
+
+  auto f = GENERATE(TILEDB_FILTER_RLE, TILEDB_FILTER_DICTIONARY);
+
+  // Sparse array, dups allowed: ASCII string dim d0 (filter f), int32 attr a0
+  ArraySchema schema(ctx, TILEDB_SPARSE);
+
+  FilterList filters(ctx);
+  filters.add_filter({ctx, f});
+
+  auto d0 = Dimension::create(ctx, "d0", TILEDB_STRING_ASCII, nullptr, nullptr);
+  d0.set_filter_list(filters);
+
+  Domain domain(ctx);
+  domain.add_dimensions(d0);
+  schema.set_domain(domain);
+
+  auto a0 = Attribute::create<int32_t>(ctx, "a0");
+  schema.add_attributes(a0);
+  schema.set_allows_dups(true);
+
+  Array::create(array_name, schema);
+
+  // Write 10 cells, all offsets 0; d0 data is either empty or 10 zero bytes
+  int elements = 10;
+  auto full_buffer_of_empty_strings = GENERATE(true, false);
+  std::vector<char> d0_buf(0);
+  if (full_buffer_of_empty_strings) {
+    d0_buf.assign(elements, 0);
+  }
+
+  std::vector<uint64_t> d0_offsets_buf(elements, 0);
+  std::vector<int> a0_buf(elements, 42);
+
+  Array array_w(ctx, array_name, TILEDB_WRITE);
+  Query query_w(ctx, array_w);
+  query_w.set_layout(TILEDB_UNORDERED)
+      .set_data_buffer("d0", d0_buf)
+      .set_offsets_buffer("d0", d0_offsets_buf)
+      .set_data_buffer("a0", a0_buf);
+  query_w.submit();
+  array_w.close();
+
+  // Read back; expect COMPLETE, 10 d0 offsets, and a0 == 42 for each of them
+  std::vector<std::byte> d0_read_buf(1 << 20);
+  std::vector<uint64_t> d0_offsets_read_buf(1 << 20);
+  std::vector<int32_t> a0_read_buf(1 << 20);
+
+  Array array_r(ctx, array_name, TILEDB_READ);
+  Query query_r(ctx, array_r);
+  query_r.set_layout(TILEDB_UNORDERED);
+  query_r.set_data_buffer("d0", d0_read_buf)
+      .set_offsets_buffer("d0", d0_offsets_read_buf)
+      .set_data_buffer("a0", a0_read_buf);
+
+  auto st = query_r.submit();
+  REQUIRE(st == Query::Status::COMPLETE);
+
+  auto results = query_r.result_buffer_elements();
+  auto num_offsets = results["d0"].first;
+  CHECK(num_offsets == 10);
+
+  for (uint64_t i = 0; i < num_offsets; i++) {
+    CHECK(a0_read_buf[i] == 42);
+  }
+
+  array_r.close();
+
+  // Clean up: remove the array directory if it still exists
+  if (vfs.is_dir(array_name))
+    vfs.remove_dir(array_name);
+}
diff --git a/tiledb/sm/compressors/dict_compressor.cc b/tiledb/sm/compressors/dict_compressor.cc
@@ -68,7 +68,7 @@ void DictEncoding::decompress(
     const uint8_t word_id_size,
     span<std::byte> output,
     span<uint64_t> output_offsets) {
-  if (input.empty() || output.empty() || word_id_size == 0) {
+  if (input.empty() || word_id_size == 0) {
     throw std::logic_error(
         "Failed decompressing dictionary-encoded strings; empty input "
         "arguments.");

diff --git a/tiledb/sm/compressors/dict_compressor.h b/tiledb/sm/compressors/dict_compressor.h
@@ -183,11 +183,17 @@ class DictEncoding {
       const span<const std::string> dict,
       span<std::byte> output,
       span<uint64_t> output_offsets) {
-    if (input.empty() || output.empty() || dict.size() == 0) {
+    if (input.empty() || dict.size() == 0) {
       throw std::logic_error(
           "Empty arguments when decompressing dictionary encoded strings.");
     }
 
+    // The output can be empty when the compressed buffer was empty, e.g.
+    // when it represents only empty strings; there is nothing to decode.
+    if (output.size() == 0) {
+      return;
+    }
+
     T word_id = 0;
     size_t in_index = 0, out_index = 0, offset_index = 0;
 
@@ -214,11 +220,14 @@ class DictEncoding {
     std::vector<std::byte> serialized_dict(dict_size);
     size_t out_index = 0;
     for (const auto& dict_entry : dict) {
+      // An empty string is serialized as one NUL byte with recorded size 1
+      auto entry_size = dict_entry.empty() ? 1 : dict_entry.size();
+      auto entry_data = dict_entry.empty() ? "" : dict_entry.data();
       utils::endianness::encode_be<T>(
-          static_cast<T>(dict_entry.size()), &serialized_dict[out_index]);
+          static_cast<T>(entry_size), &serialized_dict[out_index]);
       out_index += sizeof(T);
-      memcpy(&serialized_dict[out_index], dict_entry.data(), dict_entry.size());
-      out_index += dict_entry.size();
+      memcpy(&serialized_dict[out_index], entry_data, entry_size);
+      out_index += entry_size;
     }
 
     serialized_dict.resize(out_index);

diff --git a/tiledb/sm/compressors/rle_compressor.cc b/tiledb/sm/compressors/rle_compressor.cc
@@ -146,11 +146,6 @@ uint64_t RLE::overhead(uint64_t nbytes, uint64_t value_size) {
 }
 
 uint8_t RLE::compute_bytesize(uint64_t param_length) {
-  if (param_length == 0) {
-    throw std::logic_error(
-        "Cannot compute RLE parameter bytesize for zero length");
-  }
-
   if (param_length <= std::numeric_limits<uint8_t>::max()) {
     return 1;
   } else if (param_length <= std::numeric_limits<uint16_t>::max()) {
@@ -260,8 +255,7 @@ Status RLE::decompress(
     const uint8_t string_len_size,
     span<std::byte> output,
     span<uint64_t> output_offsets) {
-  if (input.empty() || output.empty() || rle_len_size == 0 ||
-      string_len_size == 0) {
+  if (input.empty() || rle_len_size == 0 || string_len_size == 0) {
     return LOG_STATUS(Status_CompressionError(
         "Failed decompressing strings with RLE; empty input arguments"));
   }