diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range.go b/pkg/kv/kvserver/batcheval/cmd_delete_range.go index f0c0c52cd82e..eb4a0c0eb1f9 100644 --- a/pkg/kv/kvserver/batcheval/cmd_delete_range.go +++ b/pkg/kv/kvserver/batcheval/cmd_delete_range.go @@ -58,6 +58,8 @@ func declareKeysDeleteRange( } } +const maxDeleteRangeBatchBytes = 32 << 20 + // DeleteRange deletes the range of key/value pairs specified by // start and end keys. func DeleteRange( @@ -67,7 +69,14 @@ func DeleteRange( h := cArgs.Header reply := resp.(*roachpb.DeleteRangeResponse) - // Use experimental MVCC range tombstone if requested. + if args.Predicates != (roachpb.DeleteRangePredicates{}) && !args.UseRangeTombstone { + // This ensures predicate based DeleteRange piggybacks on the version gate, + // roachpb api flags, and latch declarations used by the UseRangeTombstone. + return result.Result{}, errors.AssertionFailedf( + "UseRangeTombstones must be passed with predicate based Delete Range") + } + + // Use MVCC range tombstone if requested. if args.UseRangeTombstone { if cArgs.Header.Txn != nil { return result.Result{}, ErrTransactionUnsupported @@ -79,14 +88,55 @@ func DeleteRange( return result.Result{}, errors.AssertionFailedf( "ReturnKeys can't be used with range tombstones") } - desc := cArgs.EvalCtx.Desc() leftPeekBound, rightPeekBound := rangeTombstonePeekBounds( args.Key, args.EndKey, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey()) maxIntents := storage.MaxIntentsPerWriteIntentError.Get(&cArgs.EvalCtx.ClusterSettings().SV) - err := storage.MVCCDeleteRangeUsingTombstone(ctx, readWriter, cArgs.Stats, - args.Key, args.EndKey, h.Timestamp, cArgs.Now, leftPeekBound, rightPeekBound, maxIntents) + if args.Predicates == (roachpb.DeleteRangePredicates{}) { + // If no predicate parameters are passed, use the fast path. + err := storage.MVCCDeleteRangeUsingTombstone(ctx, readWriter, cArgs.Stats, + args.Key, args.EndKey, h.Timestamp, cArgs.Now, leftPeekBound, rightPeekBound, maxIntents) + return result.Result{}, err + } + + if h.MaxSpanRequestKeys == 0 { + // In production, MaxSpanRequestKeys must be greater than zero to ensure + // the DistSender serializes predicate based DeleteRange requests across + // ranges. This ensures that only one resumeSpan gets returned to the + // client. + // + // Also, note that DeleteRangeUsingTombstone requests pass the isAlone + // flag in roachpb.api.proto, ensuring the requests do not intermingle with + // other types of requests, preventing further resume span muddling. + return result.Result{}, errors.AssertionFailedf( + "MaxSpanRequestKeys must be greater than zero when using predicated based DeleteRange") + } + // TODO (msbutler): Tune the threshold once DeleteRange and DeleteRangeUsingTombstone have + // been further optimized. + defaultRangeTombstoneThreshold := int64(64) + resumeSpan, err := storage.MVCCPredicateDeleteRange(ctx, readWriter, cArgs.Stats, + args.Key, args.EndKey, h.Timestamp, cArgs.Now, leftPeekBound, rightPeekBound, + args.Predicates, h.MaxSpanRequestKeys, maxDeleteRangeBatchBytes, + defaultRangeTombstoneThreshold, maxIntents) + + if resumeSpan != nil { + reply.ResumeSpan = resumeSpan + reply.ResumeReason = roachpb.RESUME_KEY_LIMIT + + // Note: While MVCCPredicateDeleteRange _could_ return reply.NumKeys, as + // the number of keys iterated through, doing so could lead to a + // significant performance drawback. 
The DistSender would have used + // NumKeys to subtract the number of keys processed by one range from the + // MaxSpanRequestKeys limit given to the next range. In this case, that's + // specifically not what we want, because this request does not use the + // normal meaning of MaxSpanRequestKeys (i.e. number of keys to return). + // Here, MaxSpanRequest keys is used to limit the number of tombstones + // written. Thus, setting NumKeys would just reduce the limit available to + // the next range for no good reason. + } + // Return result is always empty, since the reply is populated into the + // passed in resp pointer. return result.Result{}, err } diff --git a/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go b/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go index 9692e243dc69..4e454a1001ea 100644 --- a/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_delete_range_test.go @@ -12,6 +12,8 @@ package batcheval import ( "context" + "fmt" + "math" "testing" "github.com/cockroachdb/cockroach/pkg/keys" @@ -27,8 +29,8 @@ import ( "github.com/stretchr/testify/require" ) -// TestDeleteRangeTombstone tests DeleteRange range tombstones directly, using -// only a Pebble engine. +// TestDeleteRangeTombstone tests DeleteRange range tombstones and predicated based DeleteRange +// directly, using only a Pebble engine. // // MVCC range tombstone logic is tested exhaustively in the MVCC history tests, // this just tests the RPC plumbing. @@ -76,6 +78,11 @@ func TestDeleteRangeTombstone(t *testing.T) { inline bool returnKeys bool expectErr interface{} // error type, substring, or true (any) + + // The fields below test predicate based delete range rpc plumbing. + predicateStartTime int64 // if set, the test will only run with predicate based delete range + onlyPointKeys bool // if set UsingRangeTombstone arg is set to false + maxBatchSize int64 // if predicateStartTime is set, then MaxBatchSize must be set }{ "above points succeed": { start: "a", @@ -142,130 +149,216 @@ func TestDeleteRangeTombstone(t *testing.T) { ts: 1e9, expectErr: &roachpb.WriteTooOldError{}, }, + "predicate without UsingRangeTombstone error": { + start: "a", + end: "f", + ts: 10e9, + predicateStartTime: 1, + maxBatchSize: maxDeleteRangeBatchBytes, + onlyPointKeys: true, + expectErr: "UseRangeTombstones must be passed with predicate based Delete Range", + }, + "predicate maxBatchSize error": { + start: "a", + end: "f", + ts: 10e9, + predicateStartTime: 1, + maxBatchSize: 0, + expectErr: "MaxSpanRequestKeys must be greater than zero when using predicated based DeleteRange", + }, } for name, tc := range testcases { t.Run(name, func(t *testing.T) { - ctx := context.Background() - st := cluster.MakeTestingClusterSettings() - engine := storage.NewDefaultInMemForTesting() - defer engine.Close() - - writeInitialData(t, ctx, engine) + for _, runWithPredicates := range []bool{false, true} { + if tc.predicateStartTime > 0 && !runWithPredicates { + continue + } + t.Run(fmt.Sprintf("Predicates=%v", runWithPredicates), func(t *testing.T) { + ctx := context.Background() + st := cluster.MakeTestingClusterSettings() + engine := storage.NewDefaultInMemForTesting() + defer engine.Close() - rangeKey := storage.MVCCRangeKey{ - StartKey: roachpb.Key(tc.start), - EndKey: roachpb.Key(tc.end), - Timestamp: hlc.Timestamp{WallTime: tc.ts}, - } + writeInitialData(t, ctx, engine) - // Prepare the request and environment. 
- evalCtx := &MockEvalCtx{ - ClusterSettings: st, - Desc: &roachpb.RangeDescriptor{ - StartKey: roachpb.RKey(rangeStart), - EndKey: roachpb.RKey(rangeEnd), - }, - } - - h := roachpb.Header{ - Timestamp: rangeKey.Timestamp, - } - if tc.txn { - txn := roachpb.MakeTransaction("txn", nil /* baseKey */, roachpb.NormalUserPriority, rangeKey.Timestamp, 0, 0) - h.Txn = &txn - } + rangeKey := storage.MVCCRangeKey{ + StartKey: roachpb.Key(tc.start), + EndKey: roachpb.Key(tc.end), + Timestamp: hlc.Timestamp{WallTime: tc.ts}, + } - req := &roachpb.DeleteRangeRequest{ - RequestHeader: roachpb.RequestHeader{ - Key: rangeKey.StartKey, - EndKey: rangeKey.EndKey, - }, - UseRangeTombstone: true, - Inline: tc.inline, - ReturnKeys: tc.returnKeys, - } + // Prepare the request and environment. + evalCtx := &MockEvalCtx{ + ClusterSettings: st, + Desc: &roachpb.RangeDescriptor{ + StartKey: roachpb.RKey(rangeStart), + EndKey: roachpb.RKey(rangeEnd), + }, + } - ms := computeStats(t, engine, rangeStart, rangeEnd, rangeKey.Timestamp.WallTime) + h := roachpb.Header{ + Timestamp: rangeKey.Timestamp, + } + if tc.txn { + txn := roachpb.MakeTransaction("txn", nil /* baseKey */, roachpb.NormalUserPriority, rangeKey.Timestamp, 0, 0) + h.Txn = &txn + } + var predicates roachpb.DeleteRangePredicates + if runWithPredicates { + predicates = roachpb.DeleteRangePredicates{ + StartTime: hlc.Timestamp{WallTime: 1}, + } + h.MaxSpanRequestKeys = math.MaxInt64 + } + if tc.predicateStartTime > 0 { + predicates = roachpb.DeleteRangePredicates{ + StartTime: hlc.Timestamp{WallTime: tc.predicateStartTime}, + } + h.MaxSpanRequestKeys = tc.maxBatchSize + } - // Use a spanset batch to assert latching of all accesses. In particular, - // the additional seeks necessary to check for adjacent range keys that we - // may merge with (for stats purposes) which should not cross the range - // bounds. - var latchSpans, lockSpans spanset.SpanSet - declareKeysDeleteRange(evalCtx.Desc, &h, req, &latchSpans, &lockSpans, 0) - batch := spanset.NewBatchAt(engine.NewBatch(), &latchSpans, h.Timestamp) - defer batch.Close() + req := &roachpb.DeleteRangeRequest{ + RequestHeader: roachpb.RequestHeader{ + Key: rangeKey.StartKey, + EndKey: rangeKey.EndKey, + }, + UseRangeTombstone: !tc.onlyPointKeys, + Inline: tc.inline, + ReturnKeys: tc.returnKeys, + Predicates: predicates, + } - // Run the request. - resp := &roachpb.DeleteRangeResponse{} - _, err := DeleteRange(ctx, batch, CommandArgs{ - EvalCtx: evalCtx.EvalContext(), - Stats: &ms, - Now: now, - Header: h, - Args: req, - }, resp) + ms := computeStats(t, engine, rangeStart, rangeEnd, rangeKey.Timestamp.WallTime) - // Check the error. - if tc.expectErr != nil { - require.Error(t, err) - if b, ok := tc.expectErr.(bool); ok && b { - // any error is fine - } else if expectMsg, ok := tc.expectErr.(string); ok { - require.Contains(t, err.Error(), expectMsg) - } else if e, ok := tc.expectErr.(error); ok { - require.True(t, errors.HasType(err, e), "expected %T, got %v", e, err) - } else { - require.Fail(t, "invalid expectErr", "expectErr=%v", tc.expectErr) - } - return - } - require.NoError(t, err) - require.NoError(t, batch.Commit(true)) + // Use a spanset batch to assert latching of all accesses. In particular, + // the additional seeks necessary to check for adjacent range keys that we + // may merge with (for stats purposes) which should not cross the range + // bounds. 
+ var latchSpans, lockSpans spanset.SpanSet + declareKeysDeleteRange(evalCtx.Desc, &h, req, &latchSpans, &lockSpans, 0) + batch := spanset.NewBatchAt(engine.NewBatch(), &latchSpans, h.Timestamp) + defer batch.Close() - // Check that the range tombstone was written successfully. - iter := engine.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ - KeyTypes: storage.IterKeyTypeRangesOnly, - LowerBound: rangeKey.StartKey, - UpperBound: rangeKey.EndKey, - }) - defer iter.Close() - iter.SeekGE(storage.MVCCKey{Key: rangeKey.StartKey}) + // Run the request. + resp := &roachpb.DeleteRangeResponse{} + _, err := DeleteRange(ctx, batch, CommandArgs{ + EvalCtx: evalCtx.EvalContext(), + Stats: &ms, + Now: now, + Header: h, + Args: req, + }, resp) - var seen storage.MVCCRangeKeyValue - for { - ok, err := iter.Valid() - require.NoError(t, err) - if !ok { - break - } - require.True(t, ok) - for _, rkv := range iter.RangeKeys() { - if rkv.RangeKey.Timestamp.Equal(rangeKey.Timestamp) { - if len(seen.RangeKey.StartKey) == 0 { - seen = rkv.Clone() + // Check the error. + if tc.expectErr != nil { + require.Error(t, err) + if b, ok := tc.expectErr.(bool); ok && b { + // any error is fine + } else if expectMsg, ok := tc.expectErr.(string); ok { + require.Contains(t, err.Error(), expectMsg) + } else if e, ok := tc.expectErr.(error); ok { + require.True(t, errors.HasType(err, e), "expected %T, got %v", e, err) } else { - seen.RangeKey.EndKey = rkv.RangeKey.EndKey.Clone() - require.Equal(t, seen.Value, rkv.Value) + require.Fail(t, "invalid expectErr", "expectErr=%v", tc.expectErr) } - break + return } - } - iter.Next() - } - require.Equal(t, rangeKey, seen.RangeKey) + require.NoError(t, err) + require.NoError(t, batch.Commit(true)) - value, err := storage.DecodeMVCCValue(seen.Value) - require.NoError(t, err) - require.True(t, value.IsTombstone()) - require.Equal(t, now, value.LocalTimestamp) + if runWithPredicates { + checkPredicateDeleteRange(t, engine, rangeKey) + } else { + checkDeleteRangeTombstone(t, engine, rangeKey, now) + } - // Check that range tombstone stats were updated correctly. - require.Equal(t, computeStats(t, engine, rangeStart, rangeEnd, rangeKey.Timestamp.WallTime), ms) + // Check that range tombstone stats were updated correctly. + require.Equal(t, computeStats(t, engine, rangeStart, rangeEnd, rangeKey.Timestamp.WallTime), ms) + }) + } }) } } +// checkDeleteRangeTombstone checks that the span targeted by the predicate +// based delete range operation only has point tombstones, as the size of the +// spans in this test are below rangeTombstoneThreshold +// +// the passed in rangekey contains info on the span PredicateDeleteRange +// operated on. The command should not have written an actual rangekey! +func checkPredicateDeleteRange(t *testing.T, engine storage.Reader, rKeyInfo storage.MVCCRangeKey) { + + iter := engine.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ + KeyTypes: storage.IterKeyTypePointsAndRanges, + LowerBound: rKeyInfo.StartKey, + UpperBound: rKeyInfo.EndKey, + }) + defer iter.Close() + + for iter.SeekGE(storage.MVCCKey{Key: rKeyInfo.StartKey}); ; iter.NextKey() { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + hasPoint, hashRange := iter.HasPointAndRange() + if !hasPoint && hashRange { + // PredicateDeleteRange should not have written any delete tombstones; + // therefore, any range key tombstones in the span should have been + // written before the request was issued. 
+ for _, rKey := range iter.RangeKeys() { + require.Equal(t, true, rKey.RangeKey.Timestamp.Less(rKeyInfo.Timestamp)) + } + continue + } + value, err := storage.DecodeMVCCValue(iter.UnsafeValue()) + require.NoError(t, err) + require.True(t, value.IsTombstone()) + } +} + +// checkDeleteRangeTombstone checks that the range tombstone was written successfully. +func checkDeleteRangeTombstone( + t *testing.T, engine storage.Reader, rangeKey storage.MVCCRangeKey, now hlc.ClockTimestamp, +) { + iter := engine.NewMVCCIterator(storage.MVCCKeyAndIntentsIterKind, storage.IterOptions{ + KeyTypes: storage.IterKeyTypeRangesOnly, + LowerBound: rangeKey.StartKey, + UpperBound: rangeKey.EndKey, + }) + defer iter.Close() + iter.SeekGE(storage.MVCCKey{Key: rangeKey.StartKey}) + + var seen storage.MVCCRangeKeyValue + for { + ok, err := iter.Valid() + require.NoError(t, err) + if !ok { + break + } + require.True(t, ok) + for _, rkv := range iter.RangeKeys() { + if rkv.RangeKey.Timestamp.Equal(rangeKey.Timestamp) { + if len(seen.RangeKey.StartKey) == 0 { + seen = rkv.Clone() + } else { + seen.RangeKey.EndKey = rkv.RangeKey.EndKey.Clone() + require.Equal(t, seen.Value, rkv.Value) + } + break + } + } + iter.Next() + } + rangeKey.StartKey.Equal(seen.RangeKey.StartKey) + require.Equal(t, rangeKey, seen.RangeKey) + + value, err := storage.DecodeMVCCValue(seen.Value) + require.NoError(t, err) + require.True(t, value.IsTombstone()) + require.Equal(t, now, value.LocalTimestamp) +} + // computeStats computes MVCC stats for the given range. // // TODO(erikgrinaker): This, storage.computeStats(), and engineStats() should be diff --git a/pkg/roachpb/api.proto b/pkg/roachpb/api.proto index cb94741f68a3..85afc74f43c3 100644 --- a/pkg/roachpb/api.proto +++ b/pkg/roachpb/api.proto @@ -356,6 +356,34 @@ message DeleteRangeRequest { // The caller must check the MVCCRangeTombstones version gate before using // this parameter, as it is new in 22.2. bool use_range_tombstone = 5; + + DeleteRangePredicates predicates = 6 [(gogoproto.nullable) = false]; +} + +// DeleteRangePredicates if specified, will conduct predicate based DeleteRange. +// Predicate based delete range will issue tombstones on live keys that match the +// filters provided by the caller. In particular, long runs of matched keys will +// get deleted with a range tombstone, while smaller runs will get deleted with +// point tombstones. Note that the keyspace across runs does not overlap. +// +// To pass DeleteRangePredicates, the client must also pass UseRangeTombstone. +message DeleteRangePredicates { + // StartTime specifies an exclusive lower bound to surface keys + // for deletion. If specified, DeleteRange will only issue tombstones to keys + // within the span [startKey, endKey) that also have MVCC versions with + // timestamps between (startTime, endTime), where endTime is the request timestamp. + // + // The main application for this is a rollback of IMPORT INTO on a non-empty + // table. Here, DeleteRange with startTime = ImportStartTime, must only delete + // keys written by the import. In other words, older, pre-import, data cannot + // be touched. Because IMPORT INTO takes a table offline and does not allow + // masking an existing key, this operation will not issue tombstones to + // pre-import data that were written at or below StartTime. + // + // In other words, this operation assumes that for a k@t in the importing table: + // - t must be < endTime + // - if t in (startTime, endTime), then there is no other k@t' where t' <= startTime. 
+ util.hlc.Timestamp start_time = 6 [(gogoproto.nullable) = false]; } // A DeleteRangeResponse is the return value from the DeleteRange() diff --git a/pkg/storage/mvcc.go b/pkg/storage/mvcc.go index 41c3e00f7f84..705ddc85bde9 100644 --- a/pkg/storage/mvcc.go +++ b/pkg/storage/mvcc.go @@ -2321,8 +2321,16 @@ func MVCCClearTimeRange( }) defer iter.Close() + // clearedMetaKey is the latest surfaced key that will get cleared var clearedMetaKey MVCCKey - var clearedMeta, restoredMeta enginepb.MVCCMetadata + + // clearedMeta contains metadata on the clearedMetaKey + var clearedMeta enginepb.MVCCMetadata + + // restoredMeta contains metadata on the previous version the clearedMetaKey. + // Once the key in clearedMetaKey is cleared, the key represented in + // restoredMeta becomes the latest version of this MVCC key. + var restoredMeta enginepb.MVCCMetadata iter.SeekGE(MVCCKey{Key: key}) for { if ok, err := iter.Valid(); err != nil { @@ -2466,6 +2474,297 @@ func MVCCDeleteRange( return keys, res.ResumeSpan, res.NumKeys, nil } +// MVCCPredicateDeleteRange issues MVCC tombstones at endTime to live keys +// within the span [startKey, endKey) that also have MVCC versions that match +// the predicate filters. Long runs of matched keys will get deleted with a +// range Tombstone, while smaller runs will get deleted with point tombstones. +// The keyspaces of each run do not overlap. +// +// This operation is non-transactional, but will check for existing intents in +// the target key span, regardless of timestamp, and return a WriteIntentError +// containing up to maxIntents intents. +// +// MVCCPredicateDeleteRange will return with a resumeSpan if the number of tombstones +// written exceeds maxBatchSize or the size of the written tombstones exceeds maxByteSize. +// These constraints prevent overwhelming raft. +// +// If an MVCC key surfaced has a timestamp at or above endTime, +// MVCCPredicateDeleteRange returns a WriteTooOldError without a resumeSpan, +// even if tombstones were already written to disk. To resolve, the caller +// should retry the call at a higher timestamp, assuming they have the +// appropriate level of isolation (e.g. the span covers an offline table, in the +// case of IMPORT rollbacks). +// +// An example of how this works: Issuing DeleteRange[a,e)@3 with +// Predicate{StartTime=1} on the following keys would issue tombstones at a@3, +// b@3, and d@3. +// +// t3 +// t2 a2 b2 d2 e2 +// t1 b1 c1 +// a b c d e +func MVCCPredicateDeleteRange( + ctx context.Context, + rw ReadWriter, + ms *enginepb.MVCCStats, + startKey, endKey roachpb.Key, + endTime hlc.Timestamp, + localTimestamp hlc.ClockTimestamp, + leftPeekBound, rightPeekBound roachpb.Key, + predicates roachpb.DeleteRangePredicates, + maxBatchSize, maxBatchByteSize int64, + rangeTombstoneThreshold int64, + maxIntents int64, +) (*roachpb.Span, error) { + + if maxBatchSize == 0 { + // Set maxBatchSize to a large number to ensure MVCCPredicateDeleteRange + // doesn't return early due to batch size. Note that maxBatchSize is only + // set to 0 during testing. + maxBatchSize = math.MaxInt64 + } + + // batchSize is the number tombstones (point and range) that have been flushed. + var batchSize int64 + var batchByteSize int64 + + // runSize is the number tombstones (point and range) that will get flushed in + // the current run. 
+ var runSize int64 + var runByteSize int64 + + var runStart, runEnd roachpb.Key + + buf := make([]roachpb.Key, rangeTombstoneThreshold) + + if ms == nil { + return nil, errors.AssertionFailedf( + "MVCCStats passed in to MVCCPredicateDeleteRange must be non-nil to ensure proper stats" + + " computation during Delete operations") + } + + // Check for any overlapping intents, and return them to be resolved. + if intents, err := ScanIntents(ctx, rw, startKey, endKey, maxIntents, 0); err != nil { + return nil, err + } else if len(intents) > 0 { + return nil, &roachpb.WriteIntentError{Intents: intents} + } + + // continueRun returns three bools: the first is true if the current run + // should continue; the second is true if the latest key is a point tombstone; + // the third is true if the latest key is a range tombstone. If a non-nil + // error is returned, the booleans are invalid. The run should continue if: + // + // 1) The latest version of the key is a point or range tombstone, with a + // timestamp below the client provided EndTime. Since the goal is to create + // long runs, any tombstoned key should continue the run. + // + // 2) The latest key is live, matches the predicates, and has a + // timestamp below EndTime. + continueRun := func(k MVCCKey, iter SimpleMVCCIterator, + ) (toContinue bool, isPointTombstone bool, isRangeTombstone bool, err error) { + hasPointKey, hasRangeKey := iter.HasPointAndRange() + if hasRangeKey { + // TODO (msbutler): cache the range keys while the range bounds remain + // constant, since iter.RangeKeys() is expensive. Manual caching may not be necessary if + // https://github.com/cockroachdb/cockroach/issues/84379 lands. + rangeKeys := iter.RangeKeys() + if endTime.LessEq(rangeKeys[0].RangeKey.Timestamp) { + return false, false, false, roachpb.NewWriteTooOldError(endTime, + rangeKeys[0].RangeKey.Timestamp.Next(), k.Key.Clone()) + } + if !hasPointKey { + // landed on bare range key. + return true, false, true, nil + } + if k.Timestamp.Less(rangeKeys[0].RangeKey.Timestamp) { + // The latest range tombstone shadows the point key; ok to continue run. + return true, false, true, nil + } + } + + // At this point, there exists a point key that shadows all range keys, + // if they exist. + vRaw := iter.UnsafeValue() + + if endTime.LessEq(k.Timestamp) { + return false, false, false, roachpb.NewWriteTooOldError(endTime, k.Timestamp.Next(), + k.Key.Clone()) + } + if len(vRaw) == 0 { + // The latest version of the key is a point tombstone. + return true, true, false, nil + } + + // The latest key is a live point key. Conduct predicate filtering. + if k.Timestamp.LessEq(predicates.StartTime) { + return false, false, false, nil + } + + // TODO (msbutler): use MVCCValueHeader to match on job ID predicate + _, err = DecodeMVCCValue(vRaw) + if err != nil { + return false, false, false, err + } + return true, false, false, nil + } + + // Create some reusable machinery for flushing a run with point tombstones + // that is typically used in a single MVCCPut call. 
+ pointTombstoneIter := newMVCCIterator(rw, endTime, false /* rangeKeyMasking */, IterOptions{ + KeyTypes: IterKeyTypePointsAndRanges, + Prefix: true, + }) + defer pointTombstoneIter.Close() + pointTombstoneBuf := newPutBuffer() + defer pointTombstoneBuf.release() + + flushDeleteKeys := func() error { + if runSize == 0 { + return nil + } + if runSize >= rangeTombstoneThreshold || + // Even if we didn't get a large enough number of keys to switch to + // using range tombstones, the byte size of the keys we did get is now too large to + // encode them all within the byte size limit, so use a range tombstone anyway. + batchByteSize+runByteSize >= maxBatchByteSize { + if err := MVCCDeleteRangeUsingTombstone(ctx, rw, ms, + runStart, runEnd.Next(), endTime, localTimestamp, leftPeekBound, rightPeekBound, + maxIntents); err != nil { + return err + } + batchByteSize += int64(MVCCRangeKey{StartKey: runStart, EndKey: runEnd, Timestamp: endTime}.EncodedSize()) + batchSize++ + } else { + // Use Point tombstones + for i := int64(0); i < runSize; i++ { + if err := mvccPutInternal(ctx, rw, pointTombstoneIter, ms, buf[i], endTime, localTimestamp, noValue, + nil, pointTombstoneBuf, nil); err != nil { + return err + } + } + batchByteSize += runByteSize + batchSize += runSize + } + runSize = 0 + runStart = roachpb.Key{} + return nil + } + + // Using the IncrementalIterator with the time-bound iter optimization could + // potentially be a big win here -- the expected use-case for this is to run + // over an entire table's span with a very recent timestamp, issuing tombstones to + // writes of some failed IMPORT and that could very likely only have hit + // some small subset of the table's keyspace. + // + // The MVCCIncrementalIterator uses a non-time-bound iter as its source + // of truth, and only uses the TBI iterator as an optimization when finding + // the next KV to iterate over. This pattern allows us to quickly skip over + // swaths of uninteresting keys, but then iterates over the latest key of each MVCC key. + // + // Notice that the iterator's EndTime is set to hlc.MaxTimestamp, in order to + // detect and fail on any keys written at or after the client provided + // endTime. We don't _expect_ to hit intents or newer keys in the client + // provided span since the MVCCPredicateDeleteRange is only intended for + // non-live key spans, but there could be an intent leftover. + iter := NewMVCCIncrementalIterator(rw, MVCCIncrementalIterOptions{ + EndKey: endKey, + StartTime: predicates.StartTime, + EndTime: hlc.MaxTimestamp, + RangeKeyMaskingBelow: endTime, + KeyTypes: IterKeyTypePointsAndRanges, + }) + defer iter.Close() + + iter.SeekGE(MVCCKey{Key: startKey}) + for { + if ok, err := iter.Valid(); err != nil { + return nil, err + } else if !ok { + break + } + k := iter.UnsafeKey() + toContinue, isPointTombstone, isRangeTombstone, err := continueRun(k, iter) + if err != nil { + return nil, err + } + + // If the latest version of the key is a tombstone at a timestamp < endtime, + // the timestamp could be less than predicates.startTime. In this case, the + // run can continue and Since there's no need to issue another tombstone, + // don't update runSize or buf. + if isRangeTombstone { + // Because range key information can be inferred at point keys, + // skip over the surfaced range key, and reason about shadowed keys at + // the surfaced point key. + // + // E.g. 
Scanning the keys below: + // 2 a2 + // 1 o---o + // a b + // + // would result in two surfaced keys: + // {a-b}@1; + // a2, {a-b}@1 + // + // Note that the range key gets surfaced before the point key, + // even though the point key shadows it. + iter.NextIgnoringTime() + } else if isPointTombstone { + // Since the latest version of this key is a point tombstone, skip over + // older versions of this key, and move the iterator to the next key + // even if it lies outside (startTime, endTime), to see if there's a + // need to flush. + iter.NextKeyIgnoringTime() + } else if toContinue { + // The latest version of the key is live, matches the predicate filters + // -- e.g. has a timestamp between (predicates.startTime, Endtime); + // therefore, plan to delete it. + if batchSize+runSize >= maxBatchSize || batchByteSize+runByteSize >= maxBatchByteSize { + // The matched key will be the start the resume span. + if err := flushDeleteKeys(); err != nil { + return nil, err + } + return &roachpb.Span{Key: k.Key.Clone(), EndKey: endKey}, nil + } + if runSize == 0 { + runStart = append(runStart[:0], k.Key...) + } + runEnd = append(runEnd[:0], k.Key...) + + if runSize < rangeTombstoneThreshold { + // Only buffer keys if there's a possibility of issuing point tombstones. + // + // To avoid unecessary memory allocation, overwrite the previous key at + // buffer's current position. No data corruption occurs because the + // buffer is flushed up to runSize. + buf[runSize] = append(buf[runSize][:0], runEnd...) + } + + runSize++ + runByteSize += int64(k.EncodedSize()) + + // Move the iterator to the next key in linear iteration even if it lies + // outside (startTime, endTime), to see if there's a need to flush. We can + // skip to the next key, as we don't care about older versions of the + // current key we're about to delete. + iter.NextKeyIgnoringTime() + } else { + // This key does not match. Flush the run of matching keys, + // to prevent issuing tombstones on keys that do not match the predicates. + if err := flushDeleteKeys(); err != nil { + return nil, err + } + // Move the incremental iterator to the next valid MVCC key that can be + // deleted. If TBI was enabled when initializing the incremental iterator, + // this step could jump over large swaths of keys that do not qualify for + // clearing. + iter.NextKey() + } + } + return nil, flushDeleteKeys() +} + // MVCCDeleteRangeUsingTombstone deletes the given MVCC keyspan at the given // timestamp using an MVCC range tombstone (rather than MVCC point tombstones). 
// This operation is non-transactional, but will check for existing intents and diff --git a/pkg/storage/mvcc_history_test.go b/pkg/storage/mvcc_history_test.go index ec8e44e2fe0e..bd753e7a3494 100644 --- a/pkg/storage/mvcc_history_test.go +++ b/pkg/storage/mvcc_history_test.go @@ -13,6 +13,7 @@ package storage import ( "context" "fmt" + "math" "path/filepath" "regexp" "sort" @@ -73,6 +74,7 @@ var sstIterVerify = util.ConstantWithMetamorphicTestBool("mvcc-histories-sst-ite // del [t=] [ts=[,]] [localTs=[,]] [resolve [status=]] k= // del_range [t=] [ts=[,]] [localTs=[,]] [resolve [status=]] k= [end=] [max=] [returnKeys] // del_range_ts [ts=[,]] [localTs=[,]] k= end= +// del_range_pred [ts=[,]] [localTs=[,]] k= end= [startTime=,max=,maxBytes=,rangeThreshold=] // increment [t=] [ts=[,]] [localTs=[,]] [resolve [status=]] k= [inc=] // initput [t=] [ts=[,]] [resolve [status=]] k= v= [raw] [failOnTombstones] // merge [t=] [ts=[,]] [resolve [status=]] k= v= [raw] @@ -659,6 +661,7 @@ var commands = map[string]cmd{ "del": {typDataUpdate, cmdDelete}, "del_range": {typDataUpdate, cmdDeleteRange}, "del_range_ts": {typDataUpdate, cmdDeleteRangeTombstone}, + "del_range_pred": {typDataUpdate, cmdDeleteRangePredicate}, "export": {typReadOnly, cmdExport}, "get": {typReadOnly, cmdGet}, "increment": {typDataUpdate, cmdIncrement}, @@ -1019,6 +1022,40 @@ func cmdDeleteRangeTombstone(e *evalCtx) error { }) } +func cmdDeleteRangePredicate(e *evalCtx) error { + key, endKey := e.getKeyRange() + ts := e.getTs(nil) + localTs := hlc.ClockTimestamp(e.getTsWithName("localTs")) + + max := math.MaxInt64 + if e.hasArg("max") { + e.scanArg("max", &max) + } + + maxBytes := math.MaxInt64 + if e.hasArg("maxBytes") { + e.scanArg("maxBytes", &maxBytes) + } + predicates := roachpb.DeleteRangePredicates{ + StartTime: e.getTsWithName("startTime"), + } + rangeThreshold := 64 + if e.hasArg("rangeThreshold") { + e.scanArg("rangeThreshold", &rangeThreshold) + } + return e.withWriter("del_range_pred", func(rw ReadWriter) error { + resumeSpan, err := MVCCPredicateDeleteRange(e.ctx, rw, e.ms, key, endKey, ts, + localTs, nil, nil, predicates, int64(max), int64(maxBytes), int64(rangeThreshold), 0) + + if resumeSpan != nil { + e.results.buf.Printf("del_range_pred: resume span [%s,%s)\n", resumeSpan.Key, + resumeSpan.EndKey) + } + return err + }, + ) +} + func cmdGet(e *evalCtx) error { txn := e.getTxn(optional) key := e.getKey() diff --git a/pkg/storage/testdata/mvcc_histories/delete_range_predicate b/pkg/storage/testdata/mvcc_histories/delete_range_predicate new file mode 100644 index 000000000000..b3c8e31b4b0d --- /dev/null +++ b/pkg/storage/testdata/mvcc_histories/delete_range_predicate @@ -0,0 +1,382 @@ +# Tests MVCC Del Range with timestamp predicate. +# +# Set up some point keys, point tombstones x, range tombstones o--o, +# and intents []. 
+# +# 7 [i7] +# 6 +# 5 +# 4 x d4 f4 x h4 o-------------------o +# 3 b3 +# 2 a2 e2 g2 +# 1 d1 +# 0 +# a b c d e f g h i j k l m n o p +run ok +put k=a ts=2 v=a2 +del k=a ts=4 +put k=b ts=3 v=b3 +put k=d ts=1 v=d1 +put k=d ts=4 v=d4 +put k=e ts=2 v=e2 +put k=f ts=4 v=f4 +put k=g ts=2 v=g2 +del k=g ts=4 +put k=h ts=4 v=h4 +del_range_ts k=k end=p ts=4 +with t=A + txn_begin ts=7 + put k=i v=i7 +---- +>> at end: +txn: "A" meta={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} lock=true stat=PENDING rts=7.000000000,0 wto=false gul=0,0 +rangekey: {k-p}/[4.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 + +# Writing next to or above point keys and tombstones should work. +run stats ok +del_range_pred k=a end=i ts=5 startTime=3 rangeThreshold=2 +---- +>> del_range_pred k=a end=i ts=5 startTime=3 rangeThreshold=2 +stats: key_bytes=+12 val_count=+1 range_key_count=+1 range_key_bytes=+14 range_val_count=+1 live_count=-3 live_bytes=-63 gc_bytes_age=+8455 +>> at end: +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {k-p}/[4.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +stats: key_count=8 key_bytes=160 val_count=12 val_bytes=111 range_key_count=2 range_key_bytes=27 range_val_count=2 live_count=3 live_bytes=111 gc_bytes_age=17863 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 + +# error on intent, no tombstones should be written +run stats error +del_range_pred k=a end=p ts=6 startTime=1 +---- +>> del_range_pred k=a end=p ts=6 startTime=1 +stats: no change +>> at end: +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {k-p}/[4.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +stats: key_count=8 key_bytes=160 val_count=12 val_bytes=111 range_key_count=2 range_key_bytes=27 range_val_count=2 live_count=3 live_bytes=111 gc_bytes_age=17863 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 
+error: (*roachpb.WriteIntentError:) conflicting intents on "i" + +# error encountering point key at d5. +# a tombstone should not get written at c5 or e5, since +# DeleteRange didn't flush before reaching d5. +run stats error +put k=c ts=2 v=c2 +del_range_pred k=c end=f ts=5 startTime=1 +---- +>> put k=c ts=2 v=c2 +stats: key_count=+1 key_bytes=+14 val_count=+1 val_bytes=+7 live_count=+1 live_bytes=+21 +>> del_range_pred k=c end=f ts=5 startTime=1 +stats: no change +>> at end: +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {k-p}/[4.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +stats: key_count=9 key_bytes=174 val_count=13 val_bytes=118 range_key_count=2 range_key_bytes=27 range_val_count=2 live_count=4 live_bytes=132 gc_bytes_age=17863 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 +error: (*roachpb.WriteTooOldError:) WriteTooOldError: write for key "d" at timestamp 5.000000000,0 too old; wrote at 5.000000000,1 + +# error encountering range key at k4. +# a tombstones should not get written to j4 or q4 since +# DeleteRange did not flush before reaching rangekey {k-p}4. +run stats error +put k=j ts=2 v=j2 +put k=q ts=2 v=q2 +del_range_pred k=j end=r ts=4 startTime=1 rangeThreshold=2 +---- +>> put k=j ts=2 v=j2 +stats: key_count=+1 key_bytes=+14 val_count=+1 val_bytes=+7 live_count=+1 live_bytes=+21 +>> put k=q ts=2 v=q2 +stats: key_count=+1 key_bytes=+14 val_count=+1 val_bytes=+7 live_count=+1 live_bytes=+21 +>> del_range_pred k=j end=r ts=4 startTime=1 rangeThreshold=2 +stats: no change +>> at end: +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {k-p}/[4.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +data: "j"/2.000000000,0 -> /BYTES/j2 +data: "q"/2.000000000,0 -> /BYTES/q2 +stats: key_count=11 key_bytes=202 val_count=15 val_bytes=132 range_key_count=2 range_key_bytes=27 range_val_count=2 live_count=6 live_bytes=174 gc_bytes_age=17863 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 +error: (*roachpb.WriteTooOldError:) WriteTooOldError: write for key "k" at timestamp 4.000000000,0 too old; wrote at 4.000000000,1 + +# At this point the keyspace looks like this: +# 7 [i7] +# 6 +# 5 x o-----------o +# 4 x d4 f4 x h4 o-------------------o +# 3 b3 +# 2 a2 c2 e2 g2 j2 q2 +# 1 d1 +# 0 +# a b c d e f g h i j k l m n o p q + +# check that del_range 
will not write anything if no live keys are in its span +# and predicate ts. Note that the range keys bounds are [firstMatchingKey,LastMatchingKey.Next()]. +run stats ok +del_range_pred k=j end=r ts=5 startTime=2 rangeThreshold=2 +---- +>> del_range_pred k=j end=r ts=5 startTime=2 rangeThreshold=2 +stats: no change +>> at end: +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {k-p}/[4.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +data: "j"/2.000000000,0 -> /BYTES/j2 +data: "q"/2.000000000,0 -> /BYTES/q2 +stats: key_count=11 key_bytes=202 val_count=15 val_bytes=132 range_key_count=2 range_key_bytes=27 range_val_count=2 live_count=6 live_bytes=174 gc_bytes_age=17863 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 + +# try the same call as above, except with startTime set to 1 +# check that delrange properly continues the run over a range tombstone +run stats ok +del_range_pred k=j end=r ts=5 startTime=1 rangeThreshold=2 +---- +>> del_range_pred k=j end=r ts=5 startTime=1 rangeThreshold=2 +stats: range_key_count=+2 range_key_bytes=+36 range_val_count=+3 live_count=-2 live_bytes=-42 gc_bytes_age=+7406 +>> at end: +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {j-k}/[5.000000000,0=/] +rangekey: {k-p}/[5.000000000,0=/ 4.000000000,0=/] +rangekey: {p-q\x00}/[5.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +data: "j"/2.000000000,0 -> /BYTES/j2 +data: "q"/2.000000000,0 -> /BYTES/q2 +stats: key_count=11 key_bytes=202 val_count=15 val_bytes=132 range_key_count=4 range_key_bytes=63 range_val_count=5 live_count=4 live_bytes=132 gc_bytes_age=25269 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 + +# check that we flush with a range tombstone, if maxBytes is exceeded +# even though range tombstone threshold has not been met. +# Return a resume span. Note that the run extends past key d, since +# its latest value is a point tombstone, and is therefore not counted +# in runByteSize. 
+run stats ok +del_range_pred k=c end=i ts=6 startTime=1 maxBytes=1 +---- +>> del_range_pred k=c end=i ts=6 startTime=1 maxBytes=1 +del_range_pred: resume span ["e","i") +stats: range_key_count=+1 range_key_bytes=+14 range_val_count=+1 live_count=-1 live_bytes=-21 gc_bytes_age=+3290 +>> at end: +rangekey: c{-\x00}/[6.000000000,0=/] +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {j-k}/[5.000000000,0=/] +rangekey: {k-p}/[5.000000000,0=/ 4.000000000,0=/] +rangekey: {p-q\x00}/[5.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +data: "j"/2.000000000,0 -> /BYTES/j2 +data: "q"/2.000000000,0 -> /BYTES/q2 +stats: key_count=11 key_bytes=202 val_count=15 val_bytes=132 range_key_count=5 range_key_bytes=77 range_val_count=6 live_count=3 live_bytes=111 gc_bytes_age=28559 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 + +# check that we flush properly if maxBatchSize is exceeded. +# Since max is 1, write a tombstone to e, and as soon as it sees the +# next eligible key to delete (f), return a resume span. +# Note that we dont count shadowed tombstones in the batchSize +run stats ok +put k=f ts=6 v=f6 +del_range_pred k=c end=i ts=7 startTime=1 max=1 +---- +>> put k=f ts=6 v=f6 +stats: key_bytes=+12 val_count=+1 val_bytes=+7 live_count=+1 live_bytes=+21 gc_bytes_age=-190 +>> del_range_pred k=c end=i ts=7 startTime=1 max=1 +del_range_pred: resume span ["f","i") +stats: key_bytes=+12 val_count=+1 live_count=-1 live_bytes=-21 gc_bytes_age=+3069 +>> at end: +rangekey: c{-\x00}/[6.000000000,0=/] +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {j-k}/[5.000000000,0=/] +rangekey: {k-p}/[5.000000000,0=/ 4.000000000,0=/] +rangekey: {p-q\x00}/[5.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/7.000000000,0 -> / +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/6.000000000,0 -> /BYTES/f6 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +data: "j"/2.000000000,0 -> /BYTES/j2 +data: "q"/2.000000000,0 -> /BYTES/q2 +stats: key_count=11 key_bytes=226 val_count=17 val_bytes=139 range_key_count=5 range_key_bytes=77 range_val_count=6 live_count=3 live_bytes=111 gc_bytes_age=31438 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 + +# Run the same DeleteRange as above at ts 8 +# No resume span should get returned because the iterator goes through +# the whole span without encountering a second eligible key to delete +run stats ok +del_range_pred k=c end=i ts=8 
startTime=1 max=1 +---- +>> del_range_pred k=c end=i ts=8 startTime=1 max=1 +stats: key_bytes=+12 val_count=+1 live_count=-1 live_bytes=-21 gc_bytes_age=+3036 +>> at end: +rangekey: c{-\x00}/[6.000000000,0=/] +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {j-k}/[5.000000000,0=/] +rangekey: {k-p}/[5.000000000,0=/ 4.000000000,0=/] +rangekey: {p-q\x00}/[5.000000000,0=/] +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/7.000000000,0 -> / +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/8.000000000,0 -> / +data: "f"/6.000000000,0 -> /BYTES/f6 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +data: "j"/2.000000000,0 -> /BYTES/j2 +data: "q"/2.000000000,0 -> /BYTES/q2 +stats: key_count=11 key_bytes=238 val_count=18 val_bytes=139 range_key_count=5 range_key_bytes=77 range_val_count=6 live_count=2 live_bytes=90 gc_bytes_age=34474 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93 + +# Write some new keys on a and b and ensure a run of point tombstones gets properly written +run stats ok +put k=a ts=5 v=a5 +put k=b ts=5 v=a5 +del_range_pred k=a end=c ts=6 startTime=1 +---- +>> put k=a ts=5 v=a5 +stats: key_bytes=+12 val_count=+1 val_bytes=+7 live_count=+1 live_bytes=+21 gc_bytes_age=-192 +>> put k=b ts=5 v=a5 +stats: key_bytes=+12 val_count=+1 val_bytes=+7 gc_bytes_age=+1805 +>> del_range_pred k=a end=c ts=6 startTime=1 +stats: key_bytes=+24 val_count=+2 live_count=-2 live_bytes=-42 gc_bytes_age=+6204 +>> at end: +rangekey: c{-\x00}/[6.000000000,0=/] +rangekey: {f-h\x00}/[5.000000000,0=/] +rangekey: {j-k}/[5.000000000,0=/] +rangekey: {k-p}/[5.000000000,0=/ 4.000000000,0=/] +rangekey: {p-q\x00}/[5.000000000,0=/] +data: "a"/6.000000000,0 -> / +data: "a"/5.000000000,0 -> /BYTES/a5 +data: "a"/4.000000000,0 -> / +data: "a"/2.000000000,0 -> /BYTES/a2 +data: "b"/6.000000000,0 -> / +data: "b"/5.000000000,0 -> /BYTES/a5 +data: "b"/3.000000000,0 -> /BYTES/b3 +data: "c"/2.000000000,0 -> /BYTES/c2 +data: "d"/5.000000000,0 -> / +data: "d"/4.000000000,0 -> /BYTES/d4 +data: "d"/1.000000000,0 -> /BYTES/d1 +data: "e"/7.000000000,0 -> / +data: "e"/2.000000000,0 -> /BYTES/e2 +data: "f"/8.000000000,0 -> / +data: "f"/6.000000000,0 -> /BYTES/f6 +data: "f"/4.000000000,0 -> /BYTES/f4 +data: "g"/4.000000000,0 -> / +data: "g"/2.000000000,0 -> /BYTES/g2 +data: "h"/4.000000000,0 -> /BYTES/h4 +meta: "i"/0,0 -> txn={id=00000000 key=/Min pri=0.00000000 epo=0 ts=7.000000000,0 min=0,0 seq=0} ts=7.000000000,0 del=false klen=12 vlen=7 mergeTs= txnDidNotUpdateMeta=true +data: "i"/7.000000000,0 -> /BYTES/i7 +data: "j"/2.000000000,0 -> /BYTES/j2 +data: "q"/2.000000000,0 -> /BYTES/q2 +stats: key_count=11 key_bytes=286 val_count=22 val_bytes=153 range_key_count=5 range_key_bytes=77 range_val_count=6 live_count=1 live_bytes=69 gc_bytes_age=42291 intent_count=1 intent_bytes=19 separated_intent_count=1 intent_age=93
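
As a client-side usage sketch of the new API: the helper below is illustrative only (the kv.Batch plumbing, helper name, and key-limit constant are assumptions, not part of this patch), but the request fields, the MaxSpanRequestKeys requirement, and the resume-span contract all come from the changes above.

package kvclientsketch // hypothetical package, for illustration only

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/errors"
)

// predicateDeleteRange is a hypothetical helper showing how a caller might
// delete all keys in [startKey, endKey) written after importStartTime,
// following resume spans until the span is exhausted. Callers must also check
// the MVCCRangeTombstones version gate (omitted here) and issue the request
// non-transactionally.
func predicateDeleteRange(
	ctx context.Context, db *kv.DB, startKey, endKey roachpb.Key, importStartTime hlc.Timestamp,
) error {
	const keyLimit = 1024 // max tombstones written per request; illustrative value

	for {
		b := &kv.Batch{}
		// Required: a non-zero limit so the DistSender serializes the request
		// across ranges and at most one resume span is returned.
		b.Header.MaxSpanRequestKeys = keyLimit
		b.AddRawRequest(&roachpb.DeleteRangeRequest{
			RequestHeader: roachpb.RequestHeader{Key: startKey, EndKey: endKey},
			// Required alongside Predicates, so the request piggybacks on the
			// UseRangeTombstone version gate and latch declarations.
			UseRangeTombstone: true,
			Predicates: roachpb.DeleteRangePredicates{
				// Exclusive lower bound: only versions in (StartTime, request
				// timestamp) are eligible for deletion.
				StartTime: importStartTime,
			},
		})
		if err := db.Run(ctx, b); err != nil {
			return err
		}
		resp, ok := b.RawResponse().Responses[0].GetInner().(*roachpb.DeleteRangeResponse)
		if !ok {
			return errors.AssertionFailedf("unexpected response type")
		}
		if resp.ResumeSpan == nil {
			return nil // the whole span has been processed
		}
		// RESUME_KEY_LIMIT: pick up where the previous request stopped.
		startKey = resp.ResumeSpan.Key
	}
}

Because MaxSpanRequestKeys here bounds the number of tombstones written rather than the number of keys returned, NumKeys is deliberately left unset on the response, and the caller relies solely on ResumeSpan to make progress.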
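
The run/flush heuristic inside MVCCPredicateDeleteRange can also be read in isolation: long runs of consecutive matching keys are flushed as a single range tombstone, short runs as individual point tombstones, and the batch byte budget can force the range-tombstone path early. Below is a minimal standalone model of that decision, mirroring flushDeleteKeys; all names are illustrative and this is not the storage code itself.

package main

import "fmt"

// flushAsRangeTombstone mirrors the decision flushDeleteKeys makes above: a
// run of runSize consecutive matching keys is flushed as a single MVCC range
// tombstone if the run is at least rangeTombstoneThreshold keys long, or if
// flushing it as point tombstones would push the batch past its byte budget;
// otherwise each buffered key gets its own point tombstone.
func flushAsRangeTombstone(
	runSize, runByteSize, batchByteSize, rangeTombstoneThreshold, maxBatchByteSize int64,
) bool {
	return runSize >= rangeTombstoneThreshold ||
		batchByteSize+runByteSize >= maxBatchByteSize
}

func main() {
	const threshold = 64      // defaultRangeTombstoneThreshold in cmd_delete_range.go
	const maxBytes = 32 << 20 // maxDeleteRangeBatchBytes in cmd_delete_range.go

	fmt.Println(flushAsRangeTombstone(3, 60, 0, threshold, maxBytes))    // false: 3 point tombstones
	fmt.Println(flushAsRangeTombstone(80, 1600, 0, threshold, maxBytes)) // true: one range tombstone
}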