Fix support for empty strings for Dictionary and RLE encodings (#3938)

This is a problem found in VCF where if using a string dimension compressed with a dictionary filter, we get an error if the value of the string dimension is always an empty string ("") : "terminate called after throwing an instance of 'tiledb::TileDBError' what(): [TileDB::Task] Error: Caught std::exception: Failed decompressing dictionary-encoded strings; empty input arguments." The failure would not occur when reading an array that was written using a string dimension data buffer that contained N entries with value 0, but only when writing using an empty string dimension data buffer . I also added support for encoding empty strings with RLE by removing a check (this was previously done for Dictionary in https://github.com/TileDB-Inc/TileDB/pull/3493/files . Co-authored-by: Paul J. Davis <[email protected]>
TileDB-Inc · Mar 8, 2023 · 1307810 · 1307810
1 parent f567c01
commit 1307810
Show file tree

Hide file tree

Showing 6 changed files with 118 additions and 15 deletions.
diff --git a/test/src/unit-compression-rle.cc b/test/src/unit-compression-rle.cc
@@ -298,7 +298,7 @@ TEST_CASE(
 TEST_CASE(
     "Compression-RLE: Test bytesize computation",
     "[compression][rle][rle-strings]") {
-  REQUIRE_THROWS_AS(RLE::compute_bytesize(0), std::logic_error);
+  CHECK(RLE::compute_bytesize(0) == 1);
   CHECK(RLE::compute_bytesize(1) == 1);
   CHECK(RLE::compute_bytesize(0xff) == 1);
   CHECK(RLE::compute_bytesize(0x100) == 2);

diff --git a/test/src/unit-cppapi-filter.cc b/test/src/unit-cppapi-filter.cc
@@ -731,3 +731,104 @@ TEST_CASE(
   if (vfs.is_dir(array_name))
     vfs.remove_dir(array_name);
 }
+
+TEST_CASE(
+    "C++ API: Filter buffer with some empty strings with RLE or Dictionary "
+    "encoding",
+    "[cppapi][filter][rle-strings][dict-strings][empty-strings]") {
+  using namespace tiledb;
+  Context ctx;
+  VFS vfs(ctx);
+  std::string array_name = "cpp_unit_array";
+
+  if (vfs.is_dir(array_name))
+    vfs.remove_dir(array_name);
+
+  auto f = GENERATE(TILEDB_FILTER_RLE, TILEDB_FILTER_DICTIONARY);
+
+  // Create array with string dimension and one attribute
+  ArraySchema schema(ctx, TILEDB_SPARSE);
+
+  FilterList filters(ctx);
+  filters.add_filter({ctx, f});
+
+  auto d0 = Dimension::create(ctx, "d0", TILEDB_STRING_ASCII, nullptr, nullptr);
+  d0.set_filter_list(filters);
+
+  Domain domain(ctx);
+  domain.add_dimensions(d0);
+  schema.set_domain(domain);
+
+  auto a0 = Attribute::create<int32_t>(ctx, "a0");
+  schema.add_attributes(a0);
+  schema.set_allows_dups(true);
+
+  Array::create(array_name, schema);
+
+  std::vector<char> d0_buf;
+  std::vector<uint64_t> d0_offsets_buf;
+  std::vector<int> a0_buf;
+
+  SECTION("Only empty strings in data buffer") {
+    // Write this data buffer to the array:
+    // ["\0","\0","\0","\0","\0","\0","\0","\0","\0","\0"]
+    // As data we use the d0 array empty, as already defined.
+    d0_offsets_buf.assign(10, 0);
+    a0_buf.assign(10, 42);
+  }
+
+  SECTION("Combine empty and strings of nulls in data buffer") {
+    // Write this data buffer to the array:
+    // ["\0","\0","\0","\0","\0","\0","\0","\0","\0","\0\0\0\0\0\0\0\0\0\0"]
+    d0_buf.assign(10, 0);
+    d0_offsets_buf.assign(10, 0);
+    a0_buf.assign(10, 42);
+  }
+
+  SECTION("Combine empty and non-empty strings in data buffer") {
+    // Write this data buffer to the array of strings: ["a", "bb", "", "c", ""]
+    d0_buf.assign({'a', 'b', 'b', 'c'});
+    d0_offsets_buf.assign({0, 1, 3, 3, 4});
+    a0_buf.assign({42, 42, 42, 42, 42});
+  }
+
+  Array array_w(ctx, array_name, TILEDB_WRITE);
+  Query query_w(ctx, array_w);
+  query_w.set_layout(TILEDB_UNORDERED)
+      .set_data_buffer("d0", d0_buf)
+      .set_offsets_buffer("d0", d0_offsets_buf)
+      .set_data_buffer("a0", a0_buf);
+  query_w.submit();
+  array_w.close();
+
+  // Read all data and check no error and data correct
+  std::vector<std::byte> d0_read_buf(1 << 20);
+  std::vector<uint64_t> d0_offsets_read_buf(1 << 20);
+  std::vector<int32_t> a0_read_buf(1 << 20);
+
+  Array array_r(ctx, array_name, TILEDB_READ);
+  Query query_r(ctx, array_r);
+  query_r.set_layout(TILEDB_UNORDERED);
+  query_r.set_data_buffer("d0", d0_read_buf)
+      .set_offsets_buffer("d0", d0_offsets_read_buf)
+      .set_data_buffer("a0", a0_read_buf);
+
+  auto st = query_r.submit();
+  REQUIRE(st == Query::Status::COMPLETE);
+
+  auto results = query_r.result_buffer_elements();
+  auto num_offsets = results["d0"].first;
+  CHECK(num_offsets == d0_offsets_buf.size());
+  auto str_len = results["d0"].second;
+  CHECK(str_len == d0_buf.size());
+
+  for (uint64_t i = 0; i < num_offsets; i++) {
+    CHECK(a0_read_buf[i] == 42);
+  }
+
+  array_r.close();
+
+  // Clean up
+  if (vfs.is_dir(array_name))
+    vfs.remove_dir(array_name);
+}
diff --git a/tiledb/sm/compressors/dict_compressor.cc b/tiledb/sm/compressors/dict_compressor.cc
@@ -68,7 +68,7 @@ void DictEncoding::decompress(
     const uint8_t word_id_size,
     span<std::byte> output,
     span<uint64_t> output_offsets) {
-  if (input.empty() || output.empty() || word_id_size == 0) {
+  if (input.empty() || word_id_size == 0) {
     throw std::logic_error(
         "Failed decompressing dictionary-encoded strings; empty input "
         "arguments.");

diff --git a/tiledb/sm/compressors/dict_compressor.h b/tiledb/sm/compressors/dict_compressor.h
@@ -183,7 +183,7 @@ class DictEncoding {
       const span<const std::string> dict,
       span<std::byte> output,
       span<uint64_t> output_offsets) {
-    if (input.empty() || output.empty() || dict.size() == 0) {
+    if (input.empty() || dict.size() == 0) {
       throw std::logic_error(
           "Empty arguments when decompressing dictionary encoded strings.");
     }
@@ -196,7 +196,9 @@ class DictEncoding {
       in_index += sizeof(T);
       assert(word_id < dict.size());
       const auto& word = dict[word_id];
-      memcpy(&output[out_index], word.data(), word.size());
+      if (word.size() > 0) {
+        memcpy(&output[out_index], word.data(), word.size());
+      }
       output_offsets[offset_index++] = out_index;
       out_index += word.size();
     }
@@ -246,8 +248,12 @@ class DictEncoding {
       // increment past the size element to the per-word data block
       in_index += sizeof(T);
       // construct string in place
-      dict.emplace_back(
-          reinterpret_cast<const char*>(&serialized_dict[in_index]), str_len);
+      if (str_len > 0) {
+        dict.emplace_back(
+            reinterpret_cast<const char*>(&serialized_dict[in_index]), str_len);
+      } else {
+        dict.emplace_back();
+      }
       in_index += str_len;
     }
 

diff --git a/tiledb/sm/compressors/rle_compressor.cc b/tiledb/sm/compressors/rle_compressor.cc
@@ -146,11 +146,6 @@ uint64_t RLE::overhead(uint64_t nbytes, uint64_t value_size) {
 }
 
 uint8_t RLE::compute_bytesize(uint64_t param_length) {
-  if (param_length == 0) {
-    throw std::logic_error(
-        "Cannot compute RLE parameter bytesize for zero length");
-  }
-
   if (param_length <= std::numeric_limits<uint8_t>::max()) {
     return 1;
   } else if (param_length <= std::numeric_limits<uint16_t>::max()) {
@@ -260,8 +255,7 @@ Status RLE::decompress(
     const uint8_t string_len_size,
     span<std::byte> output,
     span<uint64_t> output_offsets) {
-  if (input.empty() || output.empty() || rle_len_size == 0 ||
-      string_len_size == 0) {
+  if (input.empty() || rle_len_size == 0 || string_len_size == 0) {
     return LOG_STATUS(Status_CompressionError(
         "Failed decompressing strings with RLE; empty input arguments"));
   }

diff --git a/tiledb/sm/compressors/rle_compressor.h b/tiledb/sm/compressors/rle_compressor.h
@@ -150,7 +150,7 @@ class RLE {
       const span<const std::byte> input,
       span<std::byte> output,
       span<uint64_t> output_offsets) {
-    if (input.empty() || output.empty() || output_offsets.empty())
+    if (input.empty() || output_offsets.empty())
       return;
 
     const uint64_t run_size = sizeof(T);
@@ -168,7 +168,9 @@ class RLE {
       string_length = utils::endianness::decode_be<P>(&input[in_index]);
       in_index += str_len_size;
       for (uint64_t j = 0; j < run_length; j++) {
-        memcpy(&output[out_offset], &input[in_index], string_length);
+        if (string_length > 0) {
+          memcpy(&output[out_offset], &input[in_index], string_length);
+        }
         output_offsets[offset_index++] = out_offset;
         out_offset += string_length;
       }