Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

roachtest: tpccbench: handle overload vm crash in last search iter #64205

Merged
merged 1 commit
Apr 26, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions pkg/cmd/roachtest/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -793,18 +793,8 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
t.Fatal(errors.Wrap(err, "failed to create temp dir"))
}
defer func() { _ = os.RemoveAll(resultsDir) }()
s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
iteration := 0
if res, err := s.Search(func(warehouses int) (bool, error) {
iteration++
t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)

// NB: for goroutines in this monitor, handle errors via `t.Fatal` to
// *abort* the line search and whole tpccbench run. Return the errors
// to indicate that the specific warehouse count failed, but that the
// line search ought to continue.
m := newMonitor(ctx, c, roachNodes)

restart := func() {
// We overload the clusters in tpccbench, which can lead to transient infra
// failures. These are a) really annoying to debug and b) hide the actual
// passing warehouse count, making the line search sensitive to the choice
Expand Down Expand Up @@ -841,13 +831,29 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
}

c.Start(ctx, t, append(b.startOpts(), roachNodes)...)
}

s := search.NewLineSearcher(1, b.LoadWarehouses, b.EstimatedMax, initStepSize, precision)
iteration := 0
if res, err := s.Search(func(warehouses int) (bool, error) {
iteration++
t.l.Printf("initializing cluster for %d warehouses (search attempt: %d)", warehouses, iteration)

restart()

time.Sleep(restartWait)

// Set up the load generation configuration.
rampDur := 5 * time.Minute
loadDur := 10 * time.Minute
loadDone := make(chan time.Time, numLoadGroups)

// NB: for goroutines in this monitor, handle errors via `t.Fatal` to
// *abort* the line search and whole tpccbench run. Return the errors
// to indicate that the specific warehouse count failed, but that the
// line search ought to continue.
m := newMonitor(ctx, c, roachNodes)

// If we're running chaos in this configuration, modify this config.
if b.Chaos {
// Kill one node at a time.
Expand Down Expand Up @@ -981,6 +987,10 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) {
}); err != nil {
t.Fatal(err)
} else {
// The last iteration may have been a failing run that overloaded
// nodes to the point of them crashing. Make roachtest happy by
// restarting the cluster so that it can run consistency checks.
restart()
ttycolor.Stdout(ttycolor.Green)
t.l.Printf("------\nMAX WAREHOUSES = %d\n------\n\n", res)
ttycolor.Stdout(ttycolor.Reset)
Expand Down