Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DNM: storage: Test for merge/WaitForApplication bug #35653

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pkg/storage/merge_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ const (
// purgatory make merge attempts. Since merges are relatively untested, the
// reasons that a range may fail to merge are unknown, so the merge queue has
// a large purgatory interval.
mergeQueuePurgatoryCheckInterval = 1 * time.Minute
mergeQueuePurgatoryCheckInterval = 10 * time.Millisecond

// The current implementation of merges requires rewriting the right-hand data
// onto the left-hand range, even when the ranges are collocated. This is
// expensive, so limit to one merge at a time.
mergeQueueConcurrency = 1
mergeQueueConcurrency = 8
)

// MergeQueueInterval is a setting that controls how often the merge queue waits
Expand Down
4 changes: 3 additions & 1 deletion pkg/storage/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,9 @@ func waitForApplication(
return err
})
}
return g.Wait()
err := g.Wait()
time.Sleep(5 * time.Millisecond)
return err
})
}

Expand Down
13 changes: 13 additions & 0 deletions pkg/storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2227,6 +2227,19 @@ func (s *Store) MergeRange(
return err
}

// if crash := rand.Intn(3) == 0; crash && s.StoreID() == 1 {
// go func() {
// time.Sleep(time.Duration(rand.Intn(int(5 * time.Millisecond.Nanoseconds()))))
// p, err := os.FindProcess(os.Getpid())
// if err != nil {
// log.Fatal(ctx, err)
// }
// if err := p.Kill(); err != nil {
// log.Fatal(ctx, err)
// }
// }()
// }

leftRepl.raftMu.AssertHeld()
rightRepl.raftMu.AssertHeld()

Expand Down
31 changes: 31 additions & 0 deletions pkg/storage/stores_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ package storage
import (
"bytes"
"context"
"math/rand"
"os"
"time"

"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage/engine"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/retry"
)
Expand Down Expand Up @@ -102,6 +105,34 @@ func (is Server) WaitForApplication(
leaseAppliedIndex := repl.mu.state.LeaseAppliedIndex
repl.mu.RUnlock()
if leaseAppliedIndex >= req.LeaseIndex {
// For performance reasons, we don't sync to disk when
// applying raft commands. This means that if a node restarts
// after applying but before the next sync, its
// LeaseAppliedIndex could temporarily regress (until it
// reapplies its latest raft log entries).
//
// Merging relies on the monotonicity of the log applied
// index, so before returning ensure that rocksdb has synced
// everything up to this point to disk.
//
// https://github.com/cockroachdb/cockroach/issues/33120
if err := engine.WriteSyncNoop(ctx, s.engine); err != nil {
return err
}

if crash := rand.Intn(3) == 0; crash && s.StoreID() == 1 {
go func() {
time.Sleep(time.Duration(rand.Intn(int(5 * time.Millisecond.Nanoseconds()))))
p, err := os.FindProcess(os.Getpid())
if err != nil {
log.Fatal(ctx, err)
}
if err := p.Kill(); err != nil {
log.Fatal(ctx, err)
}
}()
}

return nil
}
}
Expand Down
33 changes: 33 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Repro harness for the merge/WaitForApplication bug: bootstraps a 4-node
# local CockroachDB cluster, creates ~1000 ranges to give the merge queue
# work, then repeatedly restarts node 0 for as long as it keeps dying from
# SIGKILL (presumably the crash injected by the accompanying test patch —
# confirm against the stores_server.go change in this branch).
set -euo pipefail

# Tolerate long rocksdb sync stalls so the engine doesn't self-fatal while
# nodes are being killed and restarted.
export COCKROACH_ENGINE_MAX_SYNC_DURATION=60s

# Start from a clean slate: kill any leftover processes, rebuild the binary,
# and wipe prior data directories. `|| true` keeps `set -e` from aborting
# when there is nothing to kill/remove.
killall -9 cockroach || true
make build
rm -rf cockroach-data* || true
# Launch four nodes on consecutive ports, all joining node 0 (port 26257).
# The cluster is initialized right after the first node comes up.
for i in 0 1 2 3; do
./cockroach start --max-offset 10ms --insecure --host 127.0.0.1 --port $((26257+i)) --http-port $((8080+i)) --background --store "cockroach-data${i}" --join 127.0.0.1:26257
if [ $i -eq 0 ]; then ./cockroach init --insecure; fi
done
# Seed workload: disable the merge queue while splitting into 1000 ranges,
# then re-enable it with an aggressive 1ns interval so merges fire
# immediately and continuously.
echo "
SET CLUSTER SETTING kv.range_merge.queue_interval = '1ns';
SET CLUSTER SETTING kv.range_merge.queue_enabled = false;
CREATE TABLE IF NOT EXISTS data (id INT PRIMARY KEY);
ALTER TABLE data SPLIT AT SELECT i FROM generate_series(1, 1000) AS g(i);
SET CLUSTER SETTING kv.range_merge.queue_enabled = true;
" | ./cockroach sql --insecure

# Gracefully stop node 0 (the one addressed by the default port) so it can
# be restarted in the foreground below; give it a moment to shut down.
./cockroach quit --insecure || true
sleep 1


# Restart node 0 in the foreground until it exits with anything other than
# 137 (128+9, i.e. killed by SIGKILL — the injected crash). The set +e/-e
# pair lets us capture the nonzero exit code without aborting the script.
retval=137
while [ $retval -eq 137 ];
do
set +e
./cockroach start --max-offset 10ms --insecure --logtostderr --host 127.0.0.1 --store cockroach-data0
retval=$?
set -e
echo "exit code: $retval"
done