Skip to content

Commit

Permalink
Support range deletion tombstones in IngestExternalFile SSTs
Browse files Browse the repository at this point in the history
Upstream PR: facebook#3778.

This change adds a `DeleteRange` method to `SstFileWriter` and adds
support for ingesting SSTs with range deletion tombstones. This is
important for applications that need to atomically ingest SSTs while
clearing out any existing keys in a given key range.
  • Loading branch information
nvanbenschoten committed Jul 10, 2018
1 parent 05b7615 commit d43a96c
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 59 deletions.
2 changes: 1 addition & 1 deletion USERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu
Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights

## CockroachDB
CockroachDB is an open-source geo-replicated transactional database (still in development). They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach
CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach

## DNANexus
DNANexus is using RocksDB to speed up processing of genomics data.
Expand Down
106 changes: 97 additions & 9 deletions db/external_sst_file_basic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class ExternalSSTFileBasicTest : public DBTestBase {

Status GenerateAndAddExternalFile(
const Options options, std::vector<int> keys,
const std::vector<ValueType>& value_types, int file_id,
const std::vector<ValueType>& value_types,
std::vector<std::pair<int, int>> range_deletions, int file_id,
std::map<std::string, std::string>* true_data) {
assert(value_types.size() == 1 || keys.size() == value_types.size());
std::string file_path = sst_files_dir_ + ToString(file_id);
Expand All @@ -49,6 +50,29 @@ class ExternalSSTFileBasicTest : public DBTestBase {
if (!s.ok()) {
return s;
}
for (size_t i = 0; i < range_deletions.size(); i++) {
// Account for the effect of range deletions on true_data before
// all point operators, even though sst_file_writer.DeleteRange
// must be called before other sst_file_writer methods. This is
// because point writes take precedence over range deletions
// in the same ingested sst.
std::string start_key = Key(range_deletions[i].first);
std::string end_key = Key(range_deletions[i].second);
s = sst_file_writer.DeleteRange(start_key, end_key);
if (!s.ok()) {
sst_file_writer.Finish();
return s;
}
auto start_key_it = true_data->find(start_key);
if (start_key_it == true_data->end()) {
start_key_it = true_data->upper_bound(start_key);
}
auto end_key_it = true_data->find(end_key);
if (end_key_it == true_data->end()) {
end_key_it = true_data->upper_bound(end_key);
}
true_data->erase(start_key_it, end_key_it);
}
for (size_t i = 0; i < keys.size(); i++) {
std::string key = Key(keys[i]);
std::string value = Key(keys[i]) + ToString(file_id);
Expand Down Expand Up @@ -86,6 +110,14 @@ class ExternalSSTFileBasicTest : public DBTestBase {
return s;
}

Status GenerateAndAddExternalFile(
const Options options, std::vector<int> keys,
const std::vector<ValueType>& value_types, int file_id,
std::map<std::string, std::string>* true_data) {
return GenerateAndAddExternalFile(options, keys, value_types, {}, file_id,
true_data);
}

Status GenerateAndAddExternalFile(
const Options options, std::vector<int> keys, const ValueType value_type,
int file_id, std::map<std::string, std::string>* true_data) {
Expand Down Expand Up @@ -126,9 +158,14 @@ TEST_F(ExternalSSTFileBasicTest, Basic) {
ASSERT_EQ(file1_info.num_entries, 100);
ASSERT_EQ(file1_info.smallest_key, Key(0));
ASSERT_EQ(file1_info.largest_key, Key(99));
ASSERT_EQ(file1_info.num_range_del_entries, 0);
ASSERT_EQ(file1_info.smallest_range_del_key, "");
ASSERT_EQ(file1_info.largest_range_del_key, "");
// sst_file_writer already finished, cannot add this value
s = sst_file_writer.Put(Key(100), "bad_val");
ASSERT_FALSE(s.ok()) << s.ToString();
s = sst_file_writer.DeleteRange(Key(100), Key(200));
ASSERT_FALSE(s.ok()) << s.ToString();

DestroyAndReopen(options);
// Add file using file path
Expand Down Expand Up @@ -189,6 +226,7 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) {
ASSERT_EQ(file3_info.num_entries, 15);
ASSERT_EQ(file3_info.smallest_key, Key(110));
ASSERT_EQ(file3_info.largest_key, Key(124));

s = DeprecatedAddFile({file1}, true /* move file */);
ASSERT_TRUE(s.ok()) << s.ToString();
ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
Expand All @@ -197,8 +235,8 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) {
ASSERT_TRUE(s.ok()) << s.ToString();
ASSERT_OK(env_->FileExists(file2));

// This file have overlapping values with the existing data
s = DeprecatedAddFile({file2}, true /* move file */);
// This file has overlapping values with the existing data
s = DeprecatedAddFile({file3}, true /* move file */);
ASSERT_FALSE(s.ok()) << s.ToString();
ASSERT_OK(env_->FileExists(file3));

Expand All @@ -223,7 +261,6 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {

ASSERT_OK(GenerateAndAddExternalFile(options, {10, 11, 12, 13},
ValueType::kTypeValue, file_id++,

&true_data));
// File dont overwrite any keys, No seqno needed
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
Expand Down Expand Up @@ -319,7 +356,6 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {

ASSERT_OK(GenerateAndAddExternalFile(options, {10, 11, 12, 13},
ValueType::kTypeValue, file_id++,

&true_data));
// File dont overwrite any keys, No seqno needed
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
Expand All @@ -345,6 +381,24 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
// File overwrite some keys, a seqno will be assigned
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);

ASSERT_OK(GenerateAndAddExternalFile(options, {120},
{ValueType::kTypeValue}, {{120, 135}},
file_id++, &true_data));
// File overwrite some keys, a seqno will be assigned
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);

ASSERT_OK(GenerateAndAddExternalFile(options, {}, {}, {{110, 120}},
file_id++, &true_data));
// The range deletion ends on a key, but it doesn't actually delete
// this key because the largest key in the range is exclusive. Still,
// if counts as an overlap so a new seqno will be assigned.
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);

ASSERT_OK(GenerateAndAddExternalFile(options, {}, {}, {{100, 109}},
file_id++, &true_data));
// File dont overwrite any keys, No seqno needed
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);

// Write some keys through normal write path
for (int i = 0; i < 50; i++) {
ASSERT_OK(Put(Key(i), "memtable"));
Expand Down Expand Up @@ -451,6 +505,29 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
// File overwrite some keys, a seqno will be assigned
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);

ASSERT_OK(GenerateAndAddExternalFile(
options, {150, 151, 152},
{ValueType::kTypeValue, ValueType::kTypeMerge,
ValueType::kTypeDeletion},
{{150, 160}, {180, 190}}, file_id++, &true_data));
// File dont overwrite any keys, No seqno needed
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);

ASSERT_OK(GenerateAndAddExternalFile(
options, {150, 151, 152},
{ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
{{200, 250}}, file_id++, &true_data));
// File overwrite some keys, a seqno will be assigned
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);

ASSERT_OK(GenerateAndAddExternalFile(
options, {300, 301, 302},
{ValueType::kTypeValue, ValueType::kTypeMerge,
ValueType::kTypeDeletion},
{{1, 2}, {152, 154}}, file_id++, &true_data));
// File overwrite some keys, a seqno will be assigned
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);

// Write some keys through normal write path
for (int i = 0; i < 50; i++) {
ASSERT_OK(Put(Key(i), "memtable"));
Expand All @@ -466,8 +543,9 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);

ASSERT_OK(GenerateAndAddExternalFile(
options, {40, 41, 42}, {ValueType::kTypeValue, ValueType::kTypeDeletion,
ValueType::kTypeDeletion},
options, {40, 41, 42},
{ValueType::kTypeValue, ValueType::kTypeDeletion,
ValueType::kTypeDeletion},
file_id++, &true_data));
// File overwrite some keys, a seqno will be assigned
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
Expand Down Expand Up @@ -589,7 +667,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
ASSERT_OK(GenerateAndAddExternalFile(
options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
file_id++, &true_data));
{{65, 70}, {70, 85}}, file_id++, &true_data));
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
ASSERT_EQ(2, NumTableFilesAtLevel(0));
ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
Expand All @@ -605,6 +683,16 @@ TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));

// overlaps with L5 file but not memtable or L0 file, so flush is skipped and
// file is ingested into L4
ASSERT_OK(GenerateAndAddExternalFile(options, {}, {}, {{5, 15}}, file_id++,
&true_data));
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
ASSERT_EQ(2, NumTableFilesAtLevel(0));
ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2));
ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));

// ingested file overlaps with memtable, so flush is triggered before the file
// is ingested such that the ingested data is considered newest. So L0 file
// count increases by two.
Expand All @@ -623,7 +711,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
// seqnum.
ASSERT_OK(GenerateAndAddExternalFile(
options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
file_id++, &true_data));
{{160, 200}}, file_id++, &true_data));
ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
ASSERT_EQ(4, NumTableFilesAtLevel(0));
ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
Expand Down
66 changes: 50 additions & 16 deletions db/external_sst_file_ingestion_job.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ Status ExternalSstFileIngestionJob::Prepare(
}

for (IngestedFileInfo& f : files_to_ingest_) {
if (f.num_entries == 0) {
if (f.num_entries == 0 && f.num_range_deletions == 0) {
return Status::InvalidArgument("File contain no entries");
}

Expand Down Expand Up @@ -341,6 +341,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
}
// Get number of entries in table
file_to_ingest->num_entries = props->num_entries;
file_to_ingest->num_range_deletions = props->num_range_deletions;

ParsedInternalKey key;
ReadOptions ro;
Expand All @@ -351,26 +352,59 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
// updating the block cache.
ro.fill_cache = false;
std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(ro));
std::unique_ptr<InternalIterator> range_del_iter(
table_reader->NewRangeTombstoneIterator(ro));

// Get first (smallest) key from file
// Get first (smallest) and last (largest) key from file.
bool bounds_set = false;
iter->SeekToFirst();
if (!ParseInternalKey(iter->key(), &key)) {
return Status::Corruption("external file have corrupted keys");
}
if (key.sequence != 0) {
return Status::Corruption("external file have non zero sequence number");
}
file_to_ingest->smallest_user_key = key.user_key.ToString();
if (iter->Valid()) {
if (!ParseInternalKey(iter->key(), &key)) {
return Status::Corruption("external file have corrupted keys");
}
if (key.sequence != 0) {
return Status::Corruption("external file have non zero sequence number");
}
file_to_ingest->smallest_user_key = key.user_key.ToString();

// Get last (largest) key from file
iter->SeekToLast();
if (!ParseInternalKey(iter->key(), &key)) {
return Status::Corruption("external file have corrupted keys");
iter->SeekToLast();
if (!ParseInternalKey(iter->key(), &key)) {
return Status::Corruption("external file have corrupted keys");
}
if (key.sequence != 0) {
return Status::Corruption("external file have non zero sequence number");
}
file_to_ingest->largest_user_key = key.user_key.ToString();

bounds_set = true;
}
if (key.sequence != 0) {
return Status::Corruption("external file have non zero sequence number");

// We may need to adjust these key bounds, depending on whether any range
// deletion tombstones extend past them.
const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
if (range_del_iter != nullptr) {
range_del_iter->SeekToFirst();
if (range_del_iter->Valid()) {
for (; range_del_iter->Valid(); range_del_iter->Next()) {
if (!ParseInternalKey(range_del_iter->key(), &key)) {
return Status::Corruption("external file have corrupted keys");
}
RangeTombstone tombstone(key, range_del_iter->value());

if (!bounds_set ||
ucmp->Compare(tombstone.start_key_,
file_to_ingest->smallest_user_key) < 0) {
file_to_ingest->smallest_user_key = tombstone.start_key_.ToString();
}
if (!bounds_set ||
ucmp->Compare(tombstone.end_key_,
file_to_ingest->largest_user_key) > 0) {
file_to_ingest->largest_user_key = tombstone.end_key_.ToString();
}
bounds_set = true;
}
}
}
file_to_ingest->largest_user_key = key.user_key.ToString();

file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);

Expand Down
2 changes: 2 additions & 0 deletions db/external_sst_file_ingestion_job.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ struct IngestedFileInfo {
uint64_t file_size;
// total number of keys in external file
uint64_t num_entries;
// total number of range deletions in external file
uint64_t num_range_deletions;
// Id of column family this file shoule be ingested into
uint32_t cf_id;
// TableProperties read from external file
Expand Down
Loading

0 comments on commit d43a96c

Please sign in to comment.