Skip to content

Commit

Permalink
[#20852] docdb: Add mtime and checksum to block cache key prefix
Browse files Browse the repository at this point in the history
Summary:
Problem:
In RocksDB, the file id used for block cache key prefix is generated using file inode information (inode device ID, inode number, and inode generation).
However, file systems like XFS and ext4 reuse inodes(inode number, and inode generation), which means the block cache key for new files may point to stale entries from deleted files.

Fix:
The mtime and the checksum of the meta-block are added to the prefix key. The mtime (4 bytes) is finalized once the data is flushed and
remains unchanged even if a hard link is created. The checksum (4 bytes) is calculated based on the meta-block content of the SST file.
With the combination of inode, mtime and checksum, a conflict will be extremely unlikely.
Jira: DB-9839

Test Plan:
1. existing rocksdb unit test.
2. Added a new unit test: ybd tsan --cxx-test table_test --gtest_filter TableTest.BlockCacheWithHardlink --clang17

Reviewers: timur, arybochkin, rthallam

Reviewed By: timur

Subscribers: ybase

Differential Revision: https://phorge.dev.yugabyte.com/D39729
  • Loading branch information
huapengy committed Nov 26, 2024
1 parent 7f0770d commit 26e2c1f
Show file tree
Hide file tree
Showing 9 changed files with 117 additions and 5 deletions.
2 changes: 2 additions & 0 deletions src/yb/rocksdb/db/db_block_cache_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

DECLARE_double(cache_single_touch_ratio);
DECLARE_bool(cache_overflow_single_touch);
DECLARE_bool(TEST_allow_table_option_compressed_block_cache);

namespace rocksdb {

Expand Down Expand Up @@ -181,6 +182,7 @@ TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) {

#ifdef SNAPPY
TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) {
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_allow_table_option_compressed_block_cache) = true;
ReadOptions read_options;
auto table_options = GetTableOptions();
auto options = GetOptions(table_options);
Expand Down
2 changes: 2 additions & 0 deletions src/yb/rocksdb/db/db_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ using std::shared_ptr;

DECLARE_bool(use_priority_thread_pool_for_compactions);
DECLARE_bool(use_priority_thread_pool_for_flushes);
DECLARE_bool(TEST_allow_table_option_compressed_block_cache);

namespace rocksdb {

Expand Down Expand Up @@ -2025,6 +2026,7 @@ TEST_F(DBTest, CompressedCache) {
if (!Snappy_Supported()) {
return;
}
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_allow_table_option_compressed_block_cache) = true;
int num_iter = 80;

// Run this test three iterations.
Expand Down
3 changes: 3 additions & 0 deletions src/yb/rocksdb/db/db_test_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include "yb/util/random_util.h"
#include "yb/util/status_log.h"

DECLARE_bool(TEST_allow_table_option_compressed_block_cache);

using std::string;
using std::unique_ptr;

Expand Down Expand Up @@ -344,6 +346,7 @@ Options DBHolder::CurrentOptions(
options.num_levels = 8;
break;
case kCompressedBlockCache:
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_allow_table_option_compressed_block_cache) = true;
options.allow_mmap_writes = true;
table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
break;
Expand Down
6 changes: 6 additions & 0 deletions src/yb/rocksdb/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ struct BlockBasedTableOptions {

// If non-NULL use the specified cache for compressed blocks.
// If NULL, rocksdb will not use a compressed block cache.
// 'block_cache_compressed' should be nullptr for now. If you want to configure it,
// the logic for generating the block cache key prefix must be modified.
// With 'block_cache_compressed' enabled, data will be put into the block cache
// during file generation. However, both the mtime and the CRC32 of the
// meta-block required for generating the block cache key prefix are not
// available during this process.
std::shared_ptr<Cache> block_cache_compressed = nullptr;

// Approximate size of user data packed per block, in bytes. Note that the
Expand Down
16 changes: 16 additions & 0 deletions src/yb/rocksdb/table/block_based_table_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@
#include "yb/util/mem_tracker.h"
#include "yb/util/status_log.h"

DEFINE_test_flag(bool, allow_table_option_compressed_block_cache, false, "If true, the table option"
"compressed_block_cache can be enabled. Can only be enabled in unit test for now");

namespace rocksdb {

extern const char kHashIndexPrefixesBlock[];
Expand Down Expand Up @@ -433,12 +436,25 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
rep_->filter_block_builder->StartBlock(0);
}
if (table_options.block_cache_compressed.get() != nullptr) {
// The mtime/checksum is used as a key for the block cache. However, during file creation,
// the mtime/checksum is not finalized, which prevents us from constructing the block cache key.
// Currently, we are not using block_cache_compressed, so this assert is in place
// to prevent misconfiguration.
// To enable the compressed block cache, you must change its key prefix to avoid using
// mtime/checksum. Otherwise, the block populated during the file build process will
// never be accessed, even though the correctness will not be affected.
if (!FLAGS_TEST_allow_table_option_compressed_block_cache) {
LOG(FATAL) << "compressed block cache should not be enabled";
}

GenerateCachePrefix(
table_options.block_cache_compressed.get(), metadata_file->writable_file(),
/* meta_block_checksum = */ 0,
&rep_->metadata_writer->compressed_cache_key_prefix);
if (rep_->is_split_sst()) {
GenerateCachePrefix(
table_options.block_cache_compressed.get(), data_file->writable_file(),
/* meta_block_checksum = */ 0,
&rep_->metadata_writer->compressed_cache_key_prefix);
}
}
Expand Down
31 changes: 29 additions & 2 deletions src/yb/rocksdb/table/block_based_table_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

#include "yb/rocksdb/table/block.h"
#include "yb/rocksdb/table/format.h"
#include "yb/rocksdb/util/crc32c.h"
#include "yb/rocksdb/util/xxhash.h"

#include "yb/ash/wait_state.h"

Expand Down Expand Up @@ -51,8 +53,9 @@ inline Status ReadBlockFromFile(

// The longest prefix of the cache key used to identify blocks.
// We are using the fact that we know the size of the unique ID for Posix files.
// Contains: inode + mtime (encoded with fixed32) + checksum (encoded with fixed32).
static constexpr size_t kMaxCacheKeyPrefixSize =
yb::FileWithUniqueId::kPosixFileUniqueIdMaxSize + 1;
yb::FileWithUniqueId::kPosixFileUniqueIdMaxSize + 2 * sizeof(uint32_t) + 1;
static constexpr size_t kCacheKeyBufferSize =
block_based_table::kMaxCacheKeyPrefixSize + yb::kMaxVarint64Length;

Expand All @@ -73,7 +76,8 @@ inline Slice GetCacheKey(const CacheKeyPrefixBuffer& cache_key_prefix, const Blo

// Generate a cache key prefix from the file. Used for both data and metadata files.
inline void GenerateCachePrefix(
Cache* cc, yb::FileWithUniqueId* file, CacheKeyPrefixBuffer* prefix) {
Cache* cc, yb::FileWithUniqueId* file, uint32_t meta_block_checksum,
CacheKeyPrefixBuffer* prefix) {
// generate an id from the file
prefix->size = file->GetUniqueId(prefix->data);

Expand All @@ -82,9 +86,32 @@ inline void GenerateCachePrefix(
if (prefix->size == 0) {
char* end = EncodeVarint64(prefix->data, cc->NewId());
prefix->size = static_cast<size_t>(end - prefix->data);
} else {
// Add meta-block checksum
EncodeFixed32(prefix->data + prefix->size, meta_block_checksum);
prefix->size += sizeof(uint32_t);
}
}

// The function is used to compute a checksum of meta-block to construct block cache key.
// The checksum may be different from one stored in disk. Only support kCRC32c and kxxHash
// for now. Different checksum algorithms with lower collision rate may be used in future.
inline uint32_t ComputeChecksumForBlockCacheKey(ChecksumType type, const Slice& block_contents) {
switch(type) {
case kNoChecksum:
case kCRC32c: {
return crc32c::Value(block_contents.data(), block_contents.size());
}
case kxxHash: {
void* xxh = XXH32_init(0);
XXH32_update(xxh, block_contents.data(),
static_cast<uint32_t>(block_contents.size()));
return XXH32_digest(xxh);
}
}
FATAL_INVALID_ENUM_VALUE(ChecksumType, type);
}

} // namespace block_based_table

} // namespace rocksdb
20 changes: 17 additions & 3 deletions src/yb/rocksdb/table/block_based_table_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ struct BlockBasedTable::Rep {

DataIndexLoadMode data_index_load_mode = static_cast<DataIndexLoadMode>(0);
yb::MemTrackerPtr mem_tracker;
// The checksum of the meta block used to construct block cache key prefix.
uint32_t meta_block_checksum;
};

struct BlockBasedTable::BlockRetrievalInfo {
Expand Down Expand Up @@ -485,12 +487,12 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep,
reader_with_cache_prefix->compressed_cache_key_prefix.size = 0;
if (rep->table_options.block_cache != nullptr) {
GenerateCachePrefix(rep->table_options.block_cache.get(),
reader_with_cache_prefix->reader->file(),
reader_with_cache_prefix->reader->file(), rep->meta_block_checksum,
&reader_with_cache_prefix->cache_key_prefix);
}
if (rep->table_options.block_cache_compressed != nullptr) {
GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
reader_with_cache_prefix->reader->file(),
reader_with_cache_prefix->reader->file(), rep->meta_block_checksum,
&reader_with_cache_prefix->compressed_cache_key_prefix);
}
}
Expand Down Expand Up @@ -601,14 +603,16 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
rep->footer = footer;
rep->index_type = table_options.index_type;
rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
SetupCacheKeyPrefix(rep, rep->base_reader_with_cache_prefix.get());
unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));

// Read meta index
std::unique_ptr<Block> meta;
std::unique_ptr<InternalIterator> meta_iter;
RETURN_NOT_OK(ReadMetaBlock(rep, &meta, &meta_iter));

// cache key prefix needs the checksum of meta-block populated by ReadMetaBlock
SetupCacheKeyPrefix(rep, rep->base_reader_with_cache_prefix.get());

RETURN_NOT_OK(new_table->ReadPropertiesBlock(meta_iter.get()));

RETURN_NOT_OK(new_table->SetupFilter(meta_iter.get()));
Expand Down Expand Up @@ -903,6 +907,16 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
return s;
}

// Instead of reading the checksum of the meta-block from disk, it is simpler and more
// efficient to compute it based on the block data. Some benefits include:
// (1) Minimizing changes: Reading the checksum from disk would require modifications in many
// functions.
// (2) The function is executed only once when a file is opened, so the overhead is minimal.
// (3) Checksum calculation is limited to the meta-block, ensuring that reading other blocks
// remains unaffected.
rep->meta_block_checksum = block_based_table::ComputeChecksumForBlockCacheKey(
rep->table_options.checksum, Slice(meta->data(), meta->size()));

*meta_block = std::move(meta);
// meta block uses bytewise comparator.
iter->reset(
Expand Down
38 changes: 38 additions & 0 deletions src/yb/rocksdb/table/table_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
#include "yb/rocksdb/util/statistics.h"
#include "yb/rocksdb/util/testharness.h"
#include "yb/rocksdb/util/testutil.h"
#include "yb/rocksdb/utilities/checkpoint.h"

#include "yb/util/enums.h"
#include "yb/util/string_util.h"
Expand Down Expand Up @@ -3028,6 +3029,43 @@ TEST_F(TableTest, MiddleOfMiddleKey) {
delete db;
}

// Open the first DB and generate some keys. Then open the second DB using a checkpoint
// from the first one. Verify that both databases share the same block cache key prefix.
TEST_F(TableTest, YB_LINUX_ONLY_TEST(BlockCacheWithHardlink)) {
BlockBasedTableOptions table_options;
table_options.block_size = 1024;
table_options.block_cache = NewLRUCache(16_MB / FLAGS_cache_single_touch_ratio);

rocksdb::Options options;
options.compaction_style = rocksdb::kCompactionStyleNone;
options.num_levels = 1;
options.create_if_missing = true;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));

const std::string kDBPath = test::TmpDir() + "/hardlink";
ASSERT_OK(DestroyDB(kDBPath, options));
rocksdb::DB* db;
ASSERT_OK(rocksdb::DB::Open(options, kDBPath, &db));

GenerateSSTFile(db, 0, 10);

const auto mkey = ASSERT_RESULT(db->GetMiddleKey());
const auto tw_1 = ASSERT_RESULT(db->TEST_GetLargestSstTableReader());
auto* table_reader_1 = dynamic_cast<BlockBasedTable*>(tw_1);
ASSERT_TRUE(table_reader_1->TEST_KeyInCache(ReadOptions(), mkey));

LOG(INFO) << "Opening checkpoint db";
auto checkpoint_dir = kDBPath + "/checkpoints";
ASSERT_OK(checkpoint::CreateCheckpoint(db, checkpoint_dir));
rocksdb::DB* checkpoint_db;
ASSERT_OK(rocksdb::DB::Open(options, checkpoint_dir, &checkpoint_db));
const auto tw_2 = ASSERT_RESULT(checkpoint_db->TEST_GetLargestSstTableReader());
auto* table_reader_2 = dynamic_cast<BlockBasedTable*>(tw_2);
ASSERT_TRUE(table_reader_2->TEST_KeyInCache(ReadOptions(), mkey));

delete db;
delete checkpoint_db;
}
} // namespace rocksdb

int main(int argc, char** argv) {
Expand Down
4 changes: 4 additions & 0 deletions src/yb/util/file_system_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include <sys/syscall.h>
#endif // __linux__

#include "yb/rocksdb/util/coding.h"
#include "yb/util/coding-inl.h"
#include "yb/util/coding.h"
#include "yb/util/debug/trace_event.h"
#include "yb/util/errno.h"
Expand Down Expand Up @@ -105,6 +107,8 @@ size_t GetUniqueIdFromFile(int fd, uint8_t* id) {
rid = EncodeVarint64(rid, buf.st_dev);
rid = EncodeVarint64(rid, buf.st_ino);
rid = EncodeVarint64(rid, version);
InlineEncodeFixed32(rid, static_cast<uint32_t>(buf.st_mtime));
rid += sizeof(uint32_t);
DCHECK_GE(rid, id);
return rid - id;
}
Expand Down

0 comments on commit 26e2c1f

Please sign in to comment.