Skip to content

Commit

Permalink
roachtest/failure-injection: add failure injection smoke test
Browse files Browse the repository at this point in the history
This adds an integration test for the failure injection library.
The test spins up a cluster and randomly selects a failure to
inject. It then validates that the failure was correctly injected.
Afterwards, it reverts the failure and validates that the failure
was correctly cleaned up.
  • Loading branch information
DarrylWong committed Feb 6, 2025
1 parent b446053 commit 0029b12
Show file tree
Hide file tree
Showing 4 changed files with 273 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pkg/cmd/roachtest/tests/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ go_library(
"event_log.go",
"export_parquet.go",
"failover.go",
"failure_injection.go",
"fixtures.go",
"follower_reads.go",
"go_helpers.go",
Expand Down Expand Up @@ -248,6 +249,7 @@ go_library(
"//pkg/roachprod",
"//pkg/roachprod/config",
"//pkg/roachprod/errors",
"//pkg/roachprod/failureinjection/failures",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
Expand Down
265 changes: 265 additions & 0 deletions pkg/cmd/roachtest/tests/failure_injection.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
// Copyright 2025 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package tests

import (
"context"
"fmt"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/task"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/errors"
"math/rand"
"strings"
"time"
)

type failureSmokeTest struct {
testName string
failureName string
args failures.FailureArgs

// Time to wait for the failure to propagate after inject/restore.
waitForFailureToPropagate time.Duration
// Validate that the failure was injected correctly, called after Setup() and Inject().
validateFailure func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error
// Validate that the failure was restored correctly, called after Restore().
validateRestore func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error
}

func (t *failureSmokeTest) run(ctx context.Context, l *logger.Logger, c cluster.Cluster, fr *failures.FailureRegistry) error {
// TODO(darryl): In the future, roachtests should interact with the failure injection library
// through helper functions in roachtestutil so they don't have to interface with roachprod
// directly.
failure, err := fr.GetFailure(c.MakeNodes(), t.failureName, l, c.IsSecure())
if err != nil {
return err
}
if err = failure.Setup(ctx, l, t.args); err != nil {
return err
}
if err = failure.Inject(ctx, l, t.args); err != nil {
return err
}

// Allow the failure to take effect.
if t.waitForFailureToPropagate > 0 {
l.Printf("sleeping for %s to allow failure to take effect", t.waitForFailureToPropagate)
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(t.waitForFailureToPropagate):
}
}

if err = t.validateFailure(ctx, l, c); err != nil {
return err
}
if err = failure.Restore(ctx, l, t.args); err != nil {
return err
}

// Allow the cluster to return to normal.
if t.waitForFailureToPropagate > 0 {
l.Printf("sleeping for %s to allow cluster to return to normal", t.waitForFailureToPropagate)
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(t.waitForFailureToPropagate):
}
}

if err = t.validateRestore(ctx, l, c); err != nil {
return err
}
if err = failure.Cleanup(ctx, l, t.args); err != nil {
return err
}
return nil
}

func (t *failureSmokeTest) noopRun(ctx context.Context, l *logger.Logger, c cluster.Cluster, fr *failures.FailureRegistry) error {
if err := t.validateFailure(ctx, l, c); err == nil {
return errors.New("no failure was injected but validation still passed")
}
if err := t.validateRestore(ctx, l, c); err != nil {
return errors.Wrapf(err, "no failure was injected but post restore validation still failed")
}
return nil
}

var bidirectionalNetworkPartitionTest = failureSmokeTest{
testName: "bidirectional network partition",
failureName: failures.IPTablesNetworkPartitionName,
args: failures.NetworkPartitionArgs{
Partitions: []failures.NetworkPartition{{
Source: install.Nodes{1},
Destination: install.Nodes{3},
Type: failures.Bidirectional,
}},
},
waitForFailureToPropagate: 5 * time.Second,
validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
blocked, err := checkPortBlocked(ctx, l, c, c.Nodes(1), c.Nodes(3))
if err != nil {
return err
}
if !blocked {
return fmt.Errorf("expected connections from node 1 to node 3 to be blocked")
}

blocked, err = checkPortBlocked(ctx, l, c, c.Nodes(1), c.Nodes(2))
if err != nil {
return err
}
if blocked {
return fmt.Errorf("expected connections from node 1 to node 2 to not be blocked")
}
return nil
},
validateRestore: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
blocked, err := checkPortBlocked(ctx, l, c, c.Nodes(1), c.Nodes(3))
if err != nil {
return err
}
if blocked {
return fmt.Errorf("expected connections from node 1 to node 3 to not be blocked")
}
return err
},
}

var asymmetricNetworkPartitionTest = failureSmokeTest{
testName: "asymmetric network partition",
failureName: failures.IPTablesNetworkPartitionName,
args: failures.NetworkPartitionArgs{
Partitions: []failures.NetworkPartition{{
Source: install.Nodes{1},
Destination: install.Nodes{3},
Type: failures.Incoming,
}},
},
waitForFailureToPropagate: 5 * time.Second,
validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
blocked, err := checkPortBlocked(ctx, l, c, c.Nodes(1), c.Nodes(3))
if err != nil {
return err
}
if blocked {
return fmt.Errorf("expected connections from node 1 to node 3 to be open")
}

blocked, err = checkPortBlocked(ctx, l, c, c.Nodes(3), c.Nodes(1))
if err != nil {
return err
}
if !blocked {
return fmt.Errorf("expected connections from node 3 to node 1 to be blocked")
}
return nil
},
validateRestore: func(ctx context.Context, l *logger.Logger, c cluster.Cluster) error {
blocked, err := checkPortBlocked(ctx, l, c, c.Nodes(1), c.Nodes(3))
if err != nil {
return err
}
if blocked {
return fmt.Errorf("expected connections from node 1 to node 3 to not be blocked")
}
return err
},
}

// Helper function that uses nmap to check if connections between two nodes is blocked.
func checkPortBlocked(ctx context.Context, l *logger.Logger, c cluster.Cluster, from, to option.NodeListOption) (bool, error) {
// `nmap -oG` example output:
// Host: {IP} {HOST_NAME} Status: Up
// Host: {IP} {HOST_NAME} Ports: 26257/open/tcp//cockroach///
// We care about the port scan result and whether it is filtered or open.
res, err := c.RunWithDetailsSingleNode(ctx, l, option.WithNodes(from), fmt.Sprintf("nmap -p {pgport%[1]s} {ip%[1]s} -oG - | awk '/Ports:/{print $5}'", to))
if err != nil {
return false, err
}
return strings.Contains(res.Stdout, "filtered"), nil
}

func setupFailureSmokeTests(ctx context.Context, t test.Test, c cluster.Cluster) error {
// Download any dependencies needed.
if err := c.Install(ctx, t.L(), c.CRDBNodes(), "nmap"); err != nil {
return err
}
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.CRDBNodes())
// Run a light workload in the background so we have some traffic in the database.
c.Run(ctx, option.WithNodes(c.WorkloadNode()), "./cockroach workload init tpcc --warehouses=100 {pgurl:1}")
t.Go(func(goCtx context.Context, l *logger.Logger) error {
return c.RunE(goCtx, option.WithNodes(c.WorkloadNode()), "./cockroach workload run tpcc --tolerate-errors --warehouses=100 {pgurl:1-3}")
}, task.WithContext(ctx))
return nil
}

func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, noopFailer bool) {
if err := setupFailureSmokeTests(ctx, t, c); err != nil {
t.Fatal(err)
}
fr := failures.NewFailureRegistry()
fr.Register()

var failureSmokeTests = []failureSmokeTest{
bidirectionalNetworkPartitionTest,
asymmetricNetworkPartitionTest,
}

// Randomize the order of the tests in case any of the failures have unexpected side
// effects that may mask failures, e.g. a cgroups disk stall isn't properly restored
// which causes a dmsetup disk stall to appear to work even if it doesn't.
rand.Shuffle(len(failureSmokeTests), func(i, j int) {
failureSmokeTests[i], failureSmokeTests[j] = failureSmokeTests[j], failureSmokeTests[i]
})

for _, test := range failureSmokeTests {
t.L().Printf("running %s test", test.testName)
if noopFailer {
if err := test.noopRun(ctx, t.L(), c, fr); err != nil {
t.Fatal(err)
}
} else {
if err := test.run(ctx, t.L(), c, fr); err != nil {
t.Fatal(err)
}
}
t.L().Printf("%s test complete", test.testName)
}
}

func registerFISmokeTest(r registry.Registry) {
r.Add(registry.TestSpec{
Name: "failure-injection-smoke-test",
Owner: registry.OwnerTestEng,
Cluster: r.MakeClusterSpec(4, spec.WorkloadNode(), spec.CPU(2), spec.WorkloadNodeCPU(2), spec.ReuseNone()),
CompatibleClouds: registry.OnlyGCE,
// TODO(darryl): When the FI library starts seeing more use through roachtests, CLI, etc. switch this to Nightly.
Suites: registry.ManualOnly,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailureSmokeTest(ctx, t, c, false /* noopFailer */)
},
})
r.Add(registry.TestSpec{
Name: "failure-injection-noop-smoke-test",
Owner: registry.OwnerTestEng,
Cluster: r.MakeClusterSpec(4, spec.WorkloadNode(), spec.CPU(2), spec.WorkloadNodeCPU(2), spec.ReuseNone()),
CompatibleClouds: registry.OnlyGCE,
Suites: registry.ManualOnly,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
runFailureSmokeTest(ctx, t, c, true /* noopFailer */)
},
})
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ func RegisterTests(r registry.Registry) {
registerHibernate(r, hibernateSpatialOpts)
registerHotSpotSplits(r)
registerHTTPRestart(r)
registerFISmokeTest(r)
registerImportCancellation(r)
registerImportDecommissioned(r)
registerImportMixedVersions(r)
Expand Down
5 changes: 5 additions & 0 deletions pkg/roachprod/install/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ rm /tmp/otelcol-contrib.deb;
"bzip2": `
sudo apt-get update;
sudo apt-get install -y bzip2;
`,

"nmap": `
sudo apt-get update;
sudo apt-get install -y nmap;
`,
}

Expand Down

0 comments on commit 0029b12

Please sign in to comment.