From d8cf203f1547fc07afc57dad7a564607fe32125c Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Tue, 4 Mar 2025 16:28:49 -0500 Subject: [PATCH] db: add DB.Excise Add an Excise method to the DB allowing an excise without an accompanying ingestion. This will be used in CockroachDB to recover from corruption by excising out corrupted sstables. For now, this implementation uses DB.ingest. I think a future refactor should be able to clean these code paths up and potentially refactor excise to be independent of the ingest codepath. Today, it's necessary to reuse the logic to perform excises as 'flushable ingests.' Closes #4286. --- checkpoint_test.go | 2 +- compaction_picker_test.go | 2 +- data_test.go | 13 ++++- db_test.go | 9 +++- excise.go | 31 +++++++++++ ingest.go | 107 ++++++++++++++++++++------------------ ingest_test.go | 24 +++------ metrics_test.go | 2 +- table_stats_test.go | 2 +- testdata/determinism | 9 ++++ testdata/excise | 30 +++++++---- 11 files changed, 148 insertions(+), 83 deletions(-) diff --git a/checkpoint_test.go b/checkpoint_test.go index 112d37b738..7f1a16c7d1 100644 --- a/checkpoint_test.go +++ b/checkpoint_test.go @@ -108,7 +108,7 @@ func testCheckpointImpl(t *testing.T, ddFile string, createOnShared bool) { // Hacky but the command doesn't expect a db string. Get rid of it. td.CmdArgs = td.CmdArgs[1:] - if err := runIngestAndExciseCmd(td, d, mem); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } return "" diff --git a/compaction_picker_test.go b/compaction_picker_test.go index 9667aa4ba9..9571ae9474 100644 --- a/compaction_picker_test.go +++ b/compaction_picker_test.go @@ -1386,7 +1386,7 @@ func TestCompactionPickerPickFile(t *testing.T) { return "" case "ingest-and-excise": - if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } return "" diff --git a/data_test.go b/data_test.go index 87b6ec70db..58e2e53cbd 100644 --- a/data_test.go +++ b/data_test.go @@ -1252,7 +1252,18 @@ func (d *DB) waitTableStats() { } } -func runIngestAndExciseCmd(td *datadriven.TestData, d *DB, fs vfs.FS) error { +func runExciseCmd(td *datadriven.TestData, d *DB) error { + if len(td.CmdArgs) != 2 { + return errors.New("excise expects two arguments: ") + } + exciseSpan := KeyRange{ + Start: []byte(td.CmdArgs[0].String()), + End: []byte(td.CmdArgs[1].String()), + } + return d.Excise(context.Background(), exciseSpan) +} + +func runIngestAndExciseCmd(td *datadriven.TestData, d *DB) error { var exciseSpan KeyRange paths := make([]string, 0, len(td.CmdArgs)) for i, arg := range td.CmdArgs { diff --git a/db_test.go b/db_test.go index f3f3bbce0d..6a13f1dfca 100644 --- a/db_test.go +++ b/db_test.go @@ -2198,6 +2198,13 @@ func TestDeterminism(t *testing.T) { require.NoError(t, runBuildCmd(td, d, fs)) return "" }) + case "excise": + return addStep(td, func(td *datadriven.TestData) string { + if err := runExciseCmd(td, d); err != nil { + return err.Error() + } + return "" + }) case "flush": return addStep(td, func(td *datadriven.TestData) string { _, err := d.AsyncFlush() @@ -2208,7 +2215,7 @@ func TestDeterminism(t *testing.T) { }) case "ingest-and-excise": return addStep(td, func(td *datadriven.TestData) string { - if err := runIngestAndExciseCmd(td, d, fs); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } return "" diff --git a/excise.go b/excise.go index 02c58f3f6b..6c8332907c 100644 --- a/excise.go +++ b/excise.go @@ -8,11 +8,42 @@ import ( "context" "slices" + "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/invariants" "github.com/cockroachdb/pebble/internal/manifest" ) +// Excise atomically deletes all data overlapping with the provided span. All +// data overlapping with the span is removed, including from open snapshots. +// Only currently-open iterators will still observe the removed data. Excise may +// initiate a flush if there exists unflushed data overlapping the excise span. +func (d *DB) Excise(ctx context.Context, span KeyRange) error { + if err := d.closed.Load(); err != nil { + panic(err) + } + if d.opts.ReadOnly { + return ErrReadOnly + } + if invariants.Enabled { + // Excise is only supported on prefix keys. + if d.opts.Comparer.Split(span.Start) != len(span.Start) { + panic("IngestAndExcise called with suffixed start key") + } + if d.opts.Comparer.Split(span.End) != len(span.End) { + panic("IngestAndExcise called with suffixed end key") + } + } + if v := d.FormatMajorVersion(); v < FormatVirtualSSTables { + return errors.Errorf( + "store has format major version %d; Excise requires at least %d", + v, FormatVirtualSSTables, + ) + } + _, err := d.ingest(ctx, nil, nil, span, nil) + return err +} + // excise updates ve to include a replacement of the file m with new virtual // sstables that exclude exciseSpan, returning a slice of newly-created files if // any. If the entirety of m is deleted by exciseSpan, no new sstables are added diff --git a/ingest.go b/ingest.go index e86b76e20a..7da88934f1 100644 --- a/ingest.go +++ b/ingest.go @@ -1466,7 +1466,7 @@ func (d *DB) ingest( return IngestOperationStats{}, err } - if loadResult.fileCount() == 0 { + if loadResult.fileCount() == 0 && !exciseSpan.Valid() { // All of the sstables to be ingested were empty. Nothing to do. return IngestOperationStats{}, nil } @@ -1713,61 +1713,66 @@ func (d *DB) ingest( } } - info := TableIngestInfo{ - JobID: int(jobID), - Err: err, - flushable: asFlushable, - } - if len(loadResult.local) > 0 { - info.GlobalSeqNum = loadResult.local[0].SmallestSeqNum - } else if len(loadResult.shared) > 0 { - info.GlobalSeqNum = loadResult.shared[0].SmallestSeqNum - } else { - info.GlobalSeqNum = loadResult.external[0].SmallestSeqNum - } + // TODO(jackson): Refactor this so that the case where there are no files + // but a valid excise span is not so exceptional. + var stats IngestOperationStats - if ve != nil { - info.Tables = make([]struct { - TableInfo - Level int - }, len(ve.NewTables)) - for i := range ve.NewTables { - e := &ve.NewTables[i] - info.Tables[i].Level = e.Level - info.Tables[i].TableInfo = e.Meta.TableInfo() - stats.Bytes += e.Meta.Size - if e.Level == 0 { - stats.ApproxIngestedIntoL0Bytes += e.Meta.Size - } - if metaFlushableOverlaps[e.Meta.FileNum] { - stats.MemtableOverlappingFiles++ + if loadResult.fileCount() > 0 { + info := TableIngestInfo{ + JobID: int(jobID), + Err: err, + flushable: asFlushable, + } + if len(loadResult.local) > 0 { + info.GlobalSeqNum = loadResult.local[0].SmallestSeqNum + } else if len(loadResult.shared) > 0 { + info.GlobalSeqNum = loadResult.shared[0].SmallestSeqNum + } else { + info.GlobalSeqNum = loadResult.external[0].SmallestSeqNum + } + if ve != nil { + info.Tables = make([]struct { + TableInfo + Level int + }, len(ve.NewTables)) + for i := range ve.NewTables { + e := &ve.NewTables[i] + info.Tables[i].Level = e.Level + info.Tables[i].TableInfo = e.Meta.TableInfo() + stats.Bytes += e.Meta.Size + if e.Level == 0 { + stats.ApproxIngestedIntoL0Bytes += e.Meta.Size + } + if metaFlushableOverlaps[e.Meta.FileNum] { + stats.MemtableOverlappingFiles++ + } } - } - } else if asFlushable { - // NB: If asFlushable == true, there are no shared sstables. - info.Tables = make([]struct { - TableInfo - Level int - }, len(loadResult.local)) - for i, f := range loadResult.local { - info.Tables[i].Level = -1 - info.Tables[i].TableInfo = f.TableInfo() - stats.Bytes += f.Size - // We don't have exact stats on which files will be ingested into - // L0, because actual ingestion into the LSM has been deferred until - // flush time. Instead, we infer based on memtable overlap. - // - // TODO(jackson): If we optimistically compute data overlap (#2112) - // before entering the commit pipeline, we can use that overlap to - // improve our approximation by incorporating overlap with L0, not - // just memtables. - if metaFlushableOverlaps[f.FileNum] { - stats.ApproxIngestedIntoL0Bytes += f.Size - stats.MemtableOverlappingFiles++ + } else if asFlushable { + // NB: If asFlushable == true, there are no shared sstables. + info.Tables = make([]struct { + TableInfo + Level int + }, len(loadResult.local)) + for i, f := range loadResult.local { + info.Tables[i].Level = -1 + info.Tables[i].TableInfo = f.TableInfo() + stats.Bytes += f.Size + // We don't have exact stats on which files will be ingested into + // L0, because actual ingestion into the LSM has been deferred until + // flush time. Instead, we infer based on memtable overlap. + // + // TODO(jackson): If we optimistically compute data overlap (#2112) + // before entering the commit pipeline, we can use that overlap to + // improve our approximation by incorporating overlap with L0, not + // just memtables. + if metaFlushableOverlaps[f.FileNum] { + stats.ApproxIngestedIntoL0Bytes += f.Size + stats.MemtableOverlappingFiles++ + } } } + d.opts.EventListener.TableIngested(info) } - d.opts.EventListener.TableIngested(info) return stats, err } diff --git a/ingest_test.go b/ingest_test.go index c12e1797fe..a977a8c827 100644 --- a/ingest_test.go +++ b/ingest_test.go @@ -721,25 +721,21 @@ func TestExcise(t *testing.T) { return err.Error() } return "" - case "flush": if err := d.Flush(); err != nil { return err.Error() } return "" - case "block-flush": d.mu.Lock() d.mu.compact.flushing = true d.mu.Unlock() return "" - case "allow-flush": d.mu.Lock() d.mu.compact.flushing = false d.mu.Unlock() return "" - case "ingest": noWait := td.HasArg("no-wait") if !noWait { @@ -761,7 +757,6 @@ func TestExcise(t *testing.T) { return "memtable flushed" } return "" - case "ingest-and-excise": var prevFlushableIngests uint64 noWait := td.HasArg("no-wait") @@ -772,7 +767,7 @@ func TestExcise(t *testing.T) { d.mu.Unlock() } - if err := runIngestAndExciseCmd(td, d, mem); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } if noWait { @@ -792,7 +787,6 @@ func TestExcise(t *testing.T) { return "memtable flushed" } return "" - case "file-only-snapshot": if len(td.CmdArgs) != 1 { panic("insufficient args for file-only-snapshot command") @@ -811,10 +805,8 @@ func TestExcise(t *testing.T) { s := d.NewEventuallyFileOnlySnapshot(keyRanges) efos[name] = s return "ok" - case "get": return runGetCmd(t, td, d) - case "iter": opts := &IterOptions{ KeyTypes: IterKeyTypePointsAndRanges, @@ -849,10 +841,8 @@ func TestExcise(t *testing.T) { } iter, _ := reader.NewIter(opts) return runIterCmd(td, iter, true) - case "lsm": return runLSMCmd(td, d) - case "metrics": // The asynchronous loading of table stats can change metrics, so // wait for all the tables' stats to be loaded. @@ -861,11 +851,14 @@ func TestExcise(t *testing.T) { d.mu.Unlock() return d.Metrics().StringForTests() - case "wait-pending-table-stats": return runTableStatsCmd(td, d) - case "excise": + if err := runExciseCmd(td, d); err != nil { + return err.Error() + } + return "" + case "excise-dryrun": ve := &versionEdit{ DeletedTables: map[deletedFileEntry]*tableMetadata{}, } @@ -894,7 +887,6 @@ func TestExcise(t *testing.T) { d.mu.versions.logUnlock() d.mu.Unlock() return fmt.Sprintf("would excise %d files, use ingest-and-excise to excise.\n%s", len(ve.DeletedTables), ve.DebugString(base.DefaultFormatter)) - case "confirm-backing": // Confirms that the files have the same FileBacking. fileNums := make(map[base.FileNum]struct{}) @@ -1097,7 +1089,7 @@ func testIngestSharedImpl( return "" case "ingest-and-excise": - if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } // Wait for a possible flush. @@ -1595,7 +1587,7 @@ func TestConcurrentExcise(t *testing.T) { d.mu.Lock() prevFlushableIngests := d.mu.versions.metrics.Flush.AsIngestCount d.mu.Unlock() - if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } // Wait for a possible flush. diff --git a/metrics_test.go b/metrics_test.go index de7cc4eeaa..c3b776101d 100644 --- a/metrics_test.go +++ b/metrics_test.go @@ -279,7 +279,7 @@ func TestMetrics(t *testing.T) { return s case "ingest-and-excise": - if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } return "" diff --git a/table_stats_test.go b/table_stats_test.go index 8409a884ec..7a320b6e85 100644 --- a/table_stats_test.go +++ b/table_stats_test.go @@ -142,7 +142,7 @@ func TestTableStats(t *testing.T) { return "" case "ingest-and-excise": - if err := runIngestAndExciseCmd(td, d, d.opts.FS); err != nil { + if err := runIngestAndExciseCmd(td, d); err != nil { return err.Error() } // Wait for a possible flush. diff --git a/testdata/determinism b/testdata/determinism index e1026ba04b..e998d00f0d 100644 --- a/testdata/determinism +++ b/testdata/determinism @@ -134,3 +134,12 @@ run count=100 sequential( 0:define 1:memtable-info 2:batch 3:batch 4:memtable-info parallel( 5:batch 6:batch ) ) ---- ok + +excise a z +---- +7:excise + +run count=100 +sequential( 0:define 1:memtable-info 2:batch 3:batch 4:memtable-info parallel( 5:batch 6:batch ) 7:excise ) +---- +ok diff --git a/testdata/excise b/testdata/excise index fa614a91d8..3d7db208a5 100644 --- a/testdata/excise +++ b/testdata/excise @@ -28,7 +28,7 @@ L0.0: L6: 000004:[a#10,SET-l#10,SET] -excise c k +excise-dryrun c k ---- would excise 2 files, use ingest-and-excise to excise. del-table: L0 000006 @@ -38,7 +38,7 @@ would excise 2 files, use ingest-and-excise to excise. add-backing: 000004 -excise a e +excise-dryrun a e ---- would excise 2 files, use ingest-and-excise to excise. del-table: L0 000006 @@ -48,7 +48,7 @@ would excise 2 files, use ingest-and-excise to excise. add-backing: 000006 add-backing: 000004 -excise e z +excise-dryrun e z ---- would excise 2 files, use ingest-and-excise to excise. del-table: L0 000006 @@ -58,7 +58,7 @@ would excise 2 files, use ingest-and-excise to excise. add-backing: 000006 add-backing: 000004 -excise f l +excise-dryrun f l ---- would excise 2 files, use ingest-and-excise to excise. del-table: L0 000006 @@ -69,7 +69,7 @@ would excise 2 files, use ingest-and-excise to excise. add-backing: 000006 add-backing: 000004 -excise f ll +excise-dryrun f ll ---- would excise 2 files, use ingest-and-excise to excise. del-table: L0 000006 @@ -79,7 +79,7 @@ would excise 2 files, use ingest-and-excise to excise. add-backing: 000006 add-backing: 000004 -excise p q +excise-dryrun p q ---- would excise 0 files, use ingest-and-excise to excise. @@ -159,7 +159,7 @@ L0.0: L6: 000004:[c#10,RANGEKEYSET-f#inf,RANGEKEYSET] -excise f g +excise-dryrun f g ---- would excise 1 files, use ingest-and-excise to excise. del-table: L0 000005 @@ -167,21 +167,21 @@ would excise 1 files, use ingest-and-excise to excise. add-table: L0 000007(000005):[g#11,RANGEDEL-i#inf,RANGEDEL] seqnums:[11-11] points:[g#11,RANGEDEL-i#inf,RANGEDEL] size:1 add-backing: 000005 -excise b c +excise-dryrun b c ---- would excise 1 files, use ingest-and-excise to excise. del-table: L0 000005 add-table: L0 000008(000005):[g#11,RANGEDEL-i#inf,RANGEDEL] seqnums:[11-11] points:[g#11,RANGEDEL-i#inf,RANGEDEL] size:1 add-backing: 000005 -excise i j +excise-dryrun i j ---- would excise 0 files, use ingest-and-excise to excise. # Excise mid range key. This will not happen in practice, but excise() # supports it. -excise c d +excise-dryrun c d ---- would excise 2 files, use ingest-and-excise to excise. del-table: L0 000005 @@ -709,6 +709,16 @@ c: (somethingElse, .) . . +excise a b +---- + +lsm +---- +L0.0: + 000020:[b#23,SET-f#23,SET] +L6: + 000022(000018):[f#0,SET-x#0,SET] + # Two overlapping ingestions wait on one another even if # the overlap is only on the excise span.