From 42d78b406835f6ca717215acb24091ca0880f2ce Mon Sep 17 00:00:00 2001 From: Joe Maley Date: Thu, 17 Dec 2020 09:58:10 -0500 Subject: [PATCH] Optimize ResultTile::compute_results_sparse This makes three optimizations to `ResultTile::compute_results_sparse`: 1. When executing on a dim_idx > 0, values in `r_bitmap` may be zero. In this scenario, we can skip checking the associated coordinate for an intersection. This saves us one string comparison for each coordinate. 2. When executing on a dim_idx > 0, we may have many contiguous zeroes in `r_bitmap`. In this scenario, we can do a `memcmp` in place of a loop that checks each individual coordinate. This size of this `memcmp` is fixed to 256. 3. When executing on dim_idx == 0, we know that coordinates are sorted. For any arbitrary subrange in the list of coordinate values, we know that all values in the subrange are identical if the first and last values are equal. In this patch, the list of coordinate values are partitioned into 6 ranges. If the first and last values match in one of these ranges, we perform a single computation for `r_bitmap` and apply it to all values in the range. * This patch has two fixed values, 256 (see #2) and 6 (see #3). I intentionally did not put these into our misc/constants.h because these are only applicable in this path. ** For #3, I tried an approach where I start with a single range and split it for each time there is not a match. This could be done either recursively or iteratively. I did not try doing it recursively because recursive routines can not be inlined and would be too expensive. I tried an iterative implementation and it was significantly slower than 6 fixed ranges because of the state it had to track. --- tiledb/sm/query/reader.cc | 9 +- tiledb/sm/query/result_tile.cc | 240 ++++++++++++++++++++++++++------- tiledb/sm/query/result_tile.h | 34 ++++- 3 files changed, 226 insertions(+), 57 deletions(-) diff --git a/tiledb/sm/query/reader.cc b/tiledb/sm/query/reader.cc index 78024fe8840a..43b588c0f82f 100644 --- a/tiledb/sm/query/reader.cc +++ b/tiledb/sm/query/reader.cc @@ -622,6 +622,7 @@ Status Reader::compute_range_result_coords( std::vector* result_coords) { auto coords_num = tile->cell_num(); auto dim_num = array_schema_->dim_num(); + auto cell_order = array_schema_->cell_order(); auto range_coords = subarray->get_range_coords(range_idx); if (array_schema_->dense()) { @@ -650,9 +651,13 @@ Status Reader::compute_range_result_coords( // Compute result and overwritten bitmap per dimension for (unsigned d = 0; d < dim_num; ++d) { - const auto& ranges = subarray->ranges_for_dim(d); + // For col-major cell ordering, iterate the dimensions + // in reverse. + const unsigned dim_idx = + cell_order == Layout::COL_MAJOR ? dim_num - d - 1 : d; + const auto& ranges = subarray->ranges_for_dim(dim_idx); RETURN_NOT_OK(tile->compute_results_sparse( - d, ranges[range_coords[d]], &result_bitmap)); + dim_idx, ranges[range_coords[dim_idx]], &result_bitmap, cell_order)); } // Gather results diff --git a/tiledb/sm/query/result_tile.cc b/tiledb/sm/query/result_tile.cc index afc78cb67210..d2809ce1a110 100644 --- a/tiledb/sm/query/result_tile.cc +++ b/tiledb/sm/query/result_tile.cc @@ -425,19 +425,71 @@ void ResultTile::compute_results_dense( } } +inline uint8_t ResultTile::str_coord_intersects( + const uint64_t c_offset, + const uint64_t c_size, + const char* const buff_str, + const std::string& range_start, + const std::string& range_end) { + // Test against start + bool geq_start = true; + bool all_chars_match = true; + uint64_t min_size = std::min(c_size, range_start.size()); + for (uint64_t i = 0; i < min_size; ++i) { + if (buff_str[c_offset + i] < range_start[i]) { + geq_start = false; + all_chars_match = false; + break; + } else if (buff_str[c_offset + i] > range_start[i]) { + all_chars_match = false; + break; + } // Else characters match + } + if (geq_start && all_chars_match && c_size < range_start.size()) { + geq_start = false; + } + + // Test against end + bool seq_end = false; + if (geq_start) { + seq_end = true; + all_chars_match = true; + min_size = std::min(c_size, range_end.size()); + for (uint64_t i = 0; i < min_size; ++i) { + if (buff_str[c_offset + i] > range_end[i]) { + geq_start = false; + break; + } else if (buff_str[c_offset + i] < range_end[i]) { + all_chars_match = false; + break; + } // Else characters match + } + if (seq_end && all_chars_match && c_size > range_end.size()) { + seq_end = false; + } + } + + return geq_start && seq_end; +} + template <> void ResultTile::compute_results_sparse( const ResultTile* result_tile, unsigned dim_idx, const Range& range, - std::vector* result_bitmap) { + std::vector* result_bitmap, + const Layout& cell_order) { auto coords_num = result_tile->cell_num(); + auto dim_num = result_tile->domain()->dim_num(); auto range_start = range.start_str(); - auto range_start_size = (uint64_t)range_start.size(); auto range_end = range.end_str(); - auto range_end_size = (uint64_t)range_end.size(); auto& r_bitmap = (*result_bitmap); + // Sanity check. + assert(coords_num != 0); + if (coords_num == 0) + return; + // Get coordinate tile const auto& coord_tile = result_tile->coord_tile(dim_idx); @@ -458,53 +510,125 @@ void ResultTile::compute_results_sparse( coord_tile_str.chunked_buffer()->get_contiguous_unsafe()); auto buff_str_size = coord_tile_str.size(); - // Compute results - uint64_t offset = 0, pos = 0, str_size = 0, min_size = 0; - bool geq_start = false, seq_end = false, all_chars_match = false; - for (; pos < coords_num; ++pos) { - offset = buff_off[pos]; - str_size = (pos < coords_num - 1) ? buff_off[pos + 1] - offset : - buff_str_size - offset; - - // Test against start - geq_start = true; - all_chars_match = true; - min_size = std::min(str_size, range_start_size); - for (uint64_t i = 0; i < min_size; ++i) { - if (buff_str[offset + i] < range_start[i]) { - geq_start = false; - all_chars_match = false; - break; - } else if (buff_str[offset + i] > range_start[i]) { - all_chars_match = false; - break; - } // Else characters match - } - if (geq_start && all_chars_match && str_size < range_start_size) { - geq_start = false; - } - - // Test against end - if (geq_start) { - seq_end = true; - all_chars_match = true; - min_size = std::min(str_size, range_end_size); - for (uint64_t i = 0; i < min_size; ++i) { - if (buff_str[offset + i] > range_end[i]) { - geq_start = false; - break; - } else if (buff_str[offset + i] < range_end[i]) { - all_chars_match = false; - break; - } // Else characters match - } - if (seq_end && all_chars_match && str_size > range_end_size) { - seq_end = false; + // For row-major cell orders, the first dimension is sorted. + // For col-major cell orders, the last dimension is sorted. + // If the coordinate value at index 0 is equivalent to the + // coordinate value at index 100, we know that all coordinates + // between them are also equivalent. In this scenario where + // coordinate value i=0 matches the coordinate value at i=100, + // we can calculate `r_bitmap[0]` and apply assign its value to + // `r_bitmap[i]` where i in the range [1, 100]. + // + // Calculating `r_bitmap[i]` is expensive for strings because + // it invokes a string comparison. We partition the coordinates + // into a fixed number of ranges. For each partition, we will + // compare the start and ending coordinate values to see if + // they are equivalent. If so, we save the expense of computing + // `r_bitmap` for each corresponding coordinate. If they do + // not match, we must compare each coordinate in the partition. + static const uint64_t c_partition_num = 6; + const bool is_sorted_dim = + ((cell_order == Layout::ROW_MAJOR && dim_idx == 0) || + (cell_order == Layout::COL_MAJOR && dim_idx == dim_num - 1)); + if (is_sorted_dim && coords_num > c_partition_num) { + // We calculate the size of each partition by dividing the total + // number of coordinates by the number of partitions. If this + // does not evenly divide, we will append the remaining coordinates + // onto the last partition. + const uint64_t c_partition_size_div = coords_num / c_partition_num; + const uint64_t c_partition_size_rem = coords_num % c_partition_num; + + // Loop over each coordinate partition. + for (uint64_t p = 0; p < c_partition_num; ++p) { + // Calculate the size of this partition. + const uint64_t c_partition_size = + c_partition_size_div + + (p == (c_partition_num - 1) ? c_partition_size_rem : 0); + + // Calculate the position of the first and last coordinates + // in this partition. + const uint64_t first_c_pos = p * c_partition_size_div; + const uint64_t last_c_pos = first_c_pos + c_partition_size - 1; + + // The coordinate values are determined by their offset and size + // within `buff_str`. Calculate the offset and size for the + // first and last coordinates in this partition. + const uint64_t first_c_offset = buff_off[first_c_pos]; + const uint64_t last_c_offset = buff_off[last_c_pos]; + const uint64_t first_c_size = buff_off[first_c_pos + 1] - first_c_offset; + const uint64_t last_c_size = (last_c_pos == coords_num - 1) ? + buff_str_size - last_c_offset : + buff_off[last_c_pos + 1] - last_c_offset; + + // Fetch the coordinate values for the first and last coordinates + // in this partition. + const char* const first_c_coord = &buff_str[first_c_offset]; + const char* const second_coord = &buff_str[last_c_offset]; + + // Check if the first and last coordinates have an identical + // string value. + if (first_c_size == last_c_size && + strncmp(first_c_coord, second_coord, first_c_size) == 0) { + assert(r_bitmap[0] == 1); + const uint8_t intersects = str_coord_intersects( + first_c_offset, first_c_size, buff_str, range_start, range_end); + memset(&r_bitmap[first_c_pos], intersects, c_partition_size); + } else { + // Compute results + uint64_t c_offset = 0, c_size = 0; + for (uint64_t pos = first_c_pos; pos <= last_c_pos; ++pos) { + c_offset = buff_off[pos]; + c_size = (pos < last_c_pos) ? buff_off[pos + 1] - c_offset : + buff_str_size - c_offset; + r_bitmap[pos] = str_coord_intersects( + c_offset, c_size, buff_str, range_start, range_end); + } } } - // Set the result - r_bitmap[pos] &= (geq_start && seq_end); + // We've calculated `r_bitmap` for all coordinate values in + // the sorted dimension. + return; + } + + // Here, we're computing on all other dimensions. After the first + // dimension, it is possible that coordinates have already been determined + // to not intersect with the range in an earlier dimension. For example, + // if `dim_idx` is 2 for a row-major cell order, we have already checked + // this coordinate for an intersection in dimension-0 and dimension-1. If + // either did not intersect a coordindate, its associated value in + // `r_bitmap` will be zero. To optimize for the scenario where there are + // many contiguous zeroes, we can `memcmp` a large range of `r_bitmap` + // values to avoid a comparison on each individual element. + // + // Often, a memory comparison for one byte is as quick as comparing + // 4 or 8 bytes. We will only get a benefit if we successfully + // find a `memcmp` on a much larger range. + static const uint64_t zeroed_size = coords_num < 256 ? coords_num : 256; + for (uint64_t i = 0; i < coords_num; i += zeroed_size) { + const uint64_t partition_size = + (i < coords_num - zeroed_size) ? zeroed_size : coords_num - i; + + // Check if all `r_bitmap` values are zero between `i` and + // `partition_size`. + if (r_bitmap[i] == 0 && + memcmp(&r_bitmap[i], &r_bitmap[i + 1], partition_size - 1) == 0) + continue; + + // Here, we know that there is at least one `1` value within + // the `r_bitmap` values within this partition. We must check + // each value for an intersection. + uint64_t c_offset = 0, c_size = 0; + for (uint64_t pos = i; pos < i + partition_size; ++pos) { + if (r_bitmap[pos] == 0) + continue; + + c_offset = buff_off[pos]; + c_size = (pos < coords_num - 1) ? buff_off[pos + 1] - c_offset : + buff_str_size - c_offset; + r_bitmap[pos] = str_coord_intersects( + c_offset, c_size, buff_str, range_start, range_end); + } } } @@ -513,13 +637,22 @@ void ResultTile::compute_results_sparse( const ResultTile* result_tile, unsigned dim_idx, const Range& range, - std::vector* result_bitmap) { + std::vector* result_bitmap, + const Layout& cell_order) { + // We do not use `cell_order` for this template type. + (void)cell_order; + + // For easy reference. auto coords_num = result_tile->cell_num(); auto r = (const T*)range.data(); auto stores_zipped_coords = result_tile->stores_zipped_coords(); auto dim_num = result_tile->domain()->dim_num(); auto& r_bitmap = (*result_bitmap); + + // Variables for the loop over `coords_num`. T c; + const T& r0 = r[0]; + const T& r1 = r[1]; // Handle separate coordinate tiles if (!stores_zipped_coords) { @@ -532,8 +665,9 @@ void ResultTile::compute_results_sparse( static_cast(chunked_buffer->get_contiguous_unsafe()); for (uint64_t pos = 0; pos < coords_num; ++pos) { c = coords[pos]; - r_bitmap[pos] &= (uint8_t)(c >= r[0] && c <= r[1]); + r_bitmap[pos] &= (uint8_t)(c >= r0 && c <= r1); } + return; } @@ -548,7 +682,7 @@ void ResultTile::compute_results_sparse( static_cast(chunked_buffer->get_contiguous_unsafe()); for (uint64_t pos = 0; pos < coords_num; ++pos) { c = coords[pos * dim_num + dim_idx]; - r_bitmap[pos] &= (uint8_t)(c >= r[0] && c <= r[1]); + r_bitmap[pos] &= (uint8_t)(c >= r0 && c <= r1); } } @@ -574,9 +708,11 @@ Status ResultTile::compute_results_dense( Status ResultTile::compute_results_sparse( unsigned dim_idx, const Range& range, - std::vector* result_bitmap) const { + std::vector* result_bitmap, + const Layout& cell_order) const { assert(compute_results_sparse_func_[dim_idx] != nullptr); - compute_results_sparse_func_[dim_idx](this, dim_idx, range, result_bitmap); + compute_results_sparse_func_[dim_idx]( + this, dim_idx, range, result_bitmap, cell_order); return Status::Ok(); } diff --git a/tiledb/sm/query/result_tile.h b/tiledb/sm/query/result_tile.h index 2e470f12d8a2..16b4077c94fd 100644 --- a/tiledb/sm/query/result_tile.h +++ b/tiledb/sm/query/result_tile.h @@ -38,6 +38,7 @@ #include #include +#include "tiledb/sm/enums/layout.h" #include "tiledb/sm/misc/constants.h" #include "tiledb/sm/misc/types.h" #include "tiledb/sm/tile/tile.h" @@ -199,7 +200,8 @@ class ResultTile { const ResultTile* result_tile, unsigned dim_idx, const Range& range, - std::vector* result_bitmap); + std::vector* result_bitmap, + const Layout& cell_order); /** * Applicable only to sparse tiles of dense arrays. @@ -229,7 +231,8 @@ class ResultTile { Status compute_results_sparse( unsigned dim_idx, const Range& range, - std::vector* result_bitmap) const; + std::vector* result_bitmap, + const Layout& cell_order) const; private: /* ********************************* */ @@ -283,7 +286,11 @@ class ResultTile { * for each dimension, based on the dimension datatype. */ std::vector*)>> + const ResultTile*, + unsigned, + const Range&, + std::vector*, + const Layout&)>> compute_results_sparse_func_; /* ********************************* */ @@ -298,6 +305,27 @@ class ResultTile { /** Implements coord() for unzipped coordinates. */ const void* unzipped_coord(uint64_t pos, unsigned dim_idx) const; + + /** + * A helper routine used in `compute_results_sparse` to + * determine if a given string-valued coordinate intersects + * the given start and end range. + * + * @param c_offset The offset of the coordinate value + * within `buff_str`. + * @param c_cize The size of the coordinate value. + * @param buff_str The buffer containing the coordinate value + * at `c_offset`. + * @param range_start The starting range value. + * @param range_end The ending range value. + * @return uint8_t 0 for no intersection, 1 for instersection. + */ + static uint8_t str_coord_intersects( + const uint64_t c_offset, + const uint64_t c_size, + const char* const buff_str, + const std::string& range_start, + const std::string& range_end); }; } // namespace sm