diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 716435bb7e61..68e6b6ba4ae5 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -134,6 +134,7 @@ ALL_TESTS = [ "//pkg/cmd/reduce/reduce:reduce_test", "//pkg/cmd/release:release_test", "//pkg/cmd/roachtest/clusterstats:clusterstats_test", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:mixedversion_test", "//pkg/cmd/roachtest/tests:tests_test", "//pkg/cmd/roachtest:roachtest_test", "//pkg/cmd/teamcity-trigger:teamcity-trigger_test", @@ -1006,6 +1007,9 @@ GO_TARGETS = [ "//pkg/cmd/roachtest/clusterstats:clusterstats_test", "//pkg/cmd/roachtest/option:option", "//pkg/cmd/roachtest/registry:registry", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade:clusterupgrade", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:mixedversion", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:mixedversion_test", "//pkg/cmd/roachtest/roachtestutil:roachtestutil", "//pkg/cmd/roachtest/spec:spec", "//pkg/cmd/roachtest/test:test", @@ -2467,6 +2471,8 @@ GET_X_DATA_TARGETS = [ "//pkg/cmd/roachtest/option:get_x_data", "//pkg/cmd/roachtest/registry:get_x_data", "//pkg/cmd/roachtest/roachtestutil:get_x_data", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade:get_x_data", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:get_x_data", "//pkg/cmd/roachtest/spec:get_x_data", "//pkg/cmd/roachtest/test:get_x_data", "//pkg/cmd/roachtest/tests:get_x_data", diff --git a/pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel new file mode 100644 index 000000000000..40be598bfa88 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel @@ -0,0 +1,26 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "clusterupgrade", + srcs = [ + "clusterupgrade.go", + "predecessor_version.go", + ], + embedsrcs = ["predecessor_version.json"], + importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade", + visibility = ["//visibility:public"], + deps = [ + "//pkg/cmd/roachtest/cluster", + "//pkg/cmd/roachtest/option", + "//pkg/cmd/roachtest/test", + "//pkg/roachpb", + "//pkg/roachprod/install", + "//pkg/roachprod/logger", + "//pkg/util/retry", + "//pkg/util/version", + "@com_github_cockroachdb_errors//:errors", + ], +) + +get_x_data(name = "get_x_data") diff --git a/pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go new file mode 100644 index 000000000000..aed7b558cd8c --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go @@ -0,0 +1,270 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +package clusterupgrade + +import ( + "context" + gosql "database/sql" + "fmt" + "math/rand" + "path/filepath" + "strconv" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/retry" + "github.com/cockroachdb/cockroach/pkg/util/version" +) + +const ( + // MainVersion is the sentinel used to represent that the binary + // passed to roachtest should be uploaded when `version` is left + // unspecified. + MainVersion = "" +) + +// BinaryVersion returns the binary version running on the node +// associated with the given database connection. +// NB: version means major.minor[-internal]; the patch level isn't +// returned. For example, a binary of version 19.2.4 will return 19.2. +func BinaryVersion(db *gosql.DB) (roachpb.Version, error) { + zero := roachpb.Version{} + var sv string + if err := db.QueryRow(`SELECT crdb_internal.node_executable_version();`).Scan(&sv); err != nil { + return zero, err + } + + if len(sv) == 0 { + return zero, fmt.Errorf("empty version") + } + + return roachpb.ParseVersion(sv) +} + +// ClusterVersion returns the cluster version active on the node +// associated with the given database connection. Note that the +// returned value might become stale due to the cluster auto-upgrading +// in the background plus gossip asynchronicity. +// NB: cluster versions are always major.minor[-internal]; there isn't +// a patch level. +func ClusterVersion(ctx context.Context, db *gosql.DB) (roachpb.Version, error) { + zero := roachpb.Version{} + var sv string + if err := db.QueryRowContext(ctx, `SHOW CLUSTER SETTING version`).Scan(&sv); err != nil { + return zero, err + } + + return roachpb.ParseVersion(sv) +} + +// UploadVersion uploads the specified crdb version to the given +// nodes. It returns the path of the uploaded binaries on the nodes, +// suitable to be used with `roachprod start --binary=`. +func UploadVersion( + ctx context.Context, + t test.Test, + l *logger.Logger, + c cluster.Cluster, + nodes option.NodeListOption, + newVersion string, +) (string, error) { + binaryName := "./cockroach" + if newVersion == MainVersion { + if err := c.PutE(ctx, l, t.Cockroach(), binaryName, nodes); err != nil { + return "", err + } + } else if binary, ok := t.VersionsBinaryOverride()[newVersion]; ok { + // If an override has been specified for newVersion, use that binary. + l.Printf("using binary override for version %s: %s", newVersion, binary) + binaryName = "./cockroach-" + newVersion + if err := c.PutE(ctx, l, binary, binaryName, nodes); err != nil { + return "", err + } + } else { + v := "v" + newVersion + dir := v + binaryName = filepath.Join(dir, "cockroach") + // Check if the cockroach binary already exists. + if err := c.RunE(ctx, nodes, "test", "-e", binaryName); err != nil { + if err := c.RunE(ctx, nodes, "mkdir", "-p", dir); err != nil { + return "", err + } + if err := c.Stage(ctx, l, "release", v, dir, nodes); err != nil { + return "", err + } + } + } + return BinaryPathFromVersion(newVersion), nil +} + +// InstallFixtures copies the previously created fixtures (in +// pkg/cmd/roachtest/fixtures) for the given version to the nodes +// passed.
After this step, the corresponding binary can be started on +// the cluster and it will use that store directory. +func InstallFixtures( + ctx context.Context, l *logger.Logger, c cluster.Cluster, nodes option.NodeListOption, v string, +) error { + c.Run(ctx, nodes, "mkdir -p {store-dir}") + vv := version.MustParse("v" + v) + // The fixtures use cluster version (major.minor) but the input might be + // a patch release. + name := CheckpointName( + roachpb.Version{Major: int32(vv.Major()), Minor: int32(vv.Minor())}.String(), + ) + for _, n := range nodes { + if err := c.PutE(ctx, l, + "pkg/cmd/roachtest/fixtures/"+strconv.Itoa(n)+"/"+name+".tgz", + "{store-dir}/fixture.tgz", c.Node(n), + ); err != nil { + return err + } + } + // Extract fixture. Fail if there's already an LSM in the store dir. + c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz") + return nil +} + +// StartWithBinary starts a cockroach binary, assumed to already be +// present in the nodes in the path given. +func StartWithBinary( + ctx context.Context, + l *logger.Logger, + c cluster.Cluster, + nodes option.NodeListOption, + binaryPath string, + startOpts option.StartOpts, +) { + settings := install.MakeClusterSettings(install.BinaryOption(binaryPath)) + c.Start(ctx, l, startOpts, settings, nodes) +} + +// BinaryPathFromVersion shows where the binary for the given version +// can be found on roachprod nodes. It's either `./cockroach` or the +// path to which a released binary is staged. +func BinaryPathFromVersion(v string) string { + if v == "" { + return "./cockroach" + } + return filepath.Join("v"+v, "cockroach") +} + +// RestartNodesWithNewBinary uploads a given cockroach version to the +// nodes passed, and restarts the cockroach process. +func RestartNodesWithNewBinary( + ctx context.Context, + t test.Test, + l *logger.Logger, + c cluster.Cluster, + nodes option.NodeListOption, + startOpts option.StartOpts, + newVersion string, +) error { + // NB: We could technically stage the binary on all nodes before + // restarting each one, but on Unix it's invalid to write to an + // executable file while it is currently running. So we do the + // simple thing and upload it serially instead. + + // Restart nodes in a random order; otherwise node 1 would be running all + // the migrations and it probably also has all the leases. + rand.Shuffle(len(nodes), func(i, j int) { + nodes[i], nodes[j] = nodes[j], nodes[i] + }) + for _, node := range nodes { + l.Printf("restarting node %d into version %s", node, VersionMsg(newVersion)) + // Stop the cockroach process gracefully in order to drain it properly. + // This makes the upgrade closer to how users do it in production, but + // it's also needed to eliminate flakiness. In particular, this will + // make sure that DistSQL draining information is dissipated through + // gossip so that other nodes running an older version don't consider + // this upgraded node for DistSQL plans (see #87154 for more details). + // TODO(yuzefovich): ideally, we would also check that the drain was + // successful since if it wasn't, then we might see flakes too. 
+ if err := c.StopCockroachGracefullyOnNode(ctx, l, node); err != nil { + return err + } + + binary, err := UploadVersion(ctx, t, l, c, c.Node(node), newVersion) + if err != nil { + return err + } + StartWithBinary(ctx, l, c, c.Node(node), binary, startOpts) + + // We have seen cases where a transient error could occur when this + // newly upgraded node serves as a gateway for a distributed query due + // to remote nodes not being able to dial back to the gateway for some + // reason (investigation of it is tracked in #87634). For now, we're + // papering over these flakes by this sleep. For more context, see + // #87104. + // TODO(yuzefovich): remove this sleep once #87634 is fixed. + time.Sleep(4 * time.Second) + } + + return nil +} + +// WaitForClusterUpgrade waits for the cluster version to reach the +// first node's binary version. This function should only be called if +// every node in the cluster has been restarted to run the same binary +// version. We rely on the cluster's internal self-upgrading +// mechanism to update the underlying cluster version. +func WaitForClusterUpgrade( + ctx context.Context, l *logger.Logger, nodes option.NodeListOption, dbFunc func(int) *gosql.DB, +) error { + newVersion, err := BinaryVersion(dbFunc(nodes[0])) + if err != nil { + return err + } + + l.Printf("waiting for cluster to auto-upgrade to %s", newVersion) + for _, node := range nodes { + err := retry.ForDuration(5*time.Minute, func() error { + currentVersion, err := ClusterVersion(ctx, dbFunc(node)) + if err != nil { + return err + } + if currentVersion != newVersion { + return fmt.Errorf("%d: expected cluster version %s, got %s", node, newVersion, currentVersion) + } + l.Printf("%s: acked by n%d", currentVersion, node) + return nil + }) + if err != nil { + return err + } + } + + l.Printf("all nodes (%v) are upgraded to %s", nodes, newVersion) + return nil +} + +// CheckpointName returns the expected name of the checkpoint file +// under `pkg/cmd/roachtest/fixtures/{nodeID}` for the given binary +// version. +func CheckpointName(binaryVersion string) string { + return "checkpoint-v" + binaryVersion +} + +// VersionMsg returns a version string to be displayed in logs. It's +// either the version given, or the "" string to represent +// the latest cockroach version, typically built off the branch being +// tested. +func VersionMsg(v string) string { + if v == MainVersion { + return "" + } + + return v +} diff --git a/pkg/cmd/roachtest/tests/predecessor_version.go b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.go similarity index 99% rename from pkg/cmd/roachtest/tests/predecessor_version.go rename to pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.go index 5309f7a8cce4..46d507cbcbff 100644 --- a/pkg/cmd/roachtest/tests/predecessor_version.go +++ b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.go @@ -8,7 +8,7 @@ // by the Apache License, Version 2.0, included in the file // licenses/APL.txt. 
-package tests +package clusterupgrade import ( // Import embed for the version map diff --git a/pkg/cmd/roachtest/tests/predecessor_version.json b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.json similarity index 100% rename from pkg/cmd/roachtest/tests/predecessor_version.json rename to pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.json diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel new file mode 100644 index 000000000000..75f083b93aee --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel @@ -0,0 +1,41 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "mixedversion", + srcs = [ + "mixedversion.go", + "planner.go", + "runner.go", + ], + importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion", + visibility = ["//visibility:public"], + deps = [ + "//pkg/cmd/roachtest/cluster", + "//pkg/cmd/roachtest/option", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade", + "//pkg/cmd/roachtest/test", + "//pkg/roachpb", + "//pkg/roachprod/logger", + "//pkg/util/randutil", + "//pkg/util/timeutil", + "//pkg/util/version", + "@org_golang_x_sync//errgroup", + ], +) + +go_test( + name = "mixedversion_test", + srcs = ["planner_test.go"], + args = ["-test.timeout=295s"], + embed = [":mixedversion"], + deps = [ + "//pkg/cmd/roachtest/option", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade", + "//pkg/roachprod/logger", + "//pkg/util/version", + "@com_github_stretchr_testify//require", + ], +) + +get_x_data(name = "get_x_data") diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go new file mode 100644 index 000000000000..f82c40430f69 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go @@ -0,0 +1,626 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +// Package mixedversion implements a framework for testing +// functionality when a cluster is in a mixed-version state. +// +// Mixed-version is defined as points in time when some (potentially +// not all) nodes have been upgraded to a certain binary version, or +// the state when all nodes have been upgraded, but the cluster itself +// is in the process of upgrading to that version (i.e., running +// migrations). This also applies to clusters that have upgraded +// binaries on all nodes and then later decided to downgrade to a +// previous version (which is permitted if the `preserve_downgrade_option` +// cluster setting is set accordingly). +// +// The goal of the framework is to let test writers focus on the +// functionality they want to test in a mixed-version state, rather +// than having to worry about how to set that up, stopping and +// restarting nodes, etc. +// +// In the most common use-case, callers will just call the +// `InMixedVersion` function on the test object defining what they +// want to test in a mixed-version state. 
The framework will take care +// of exercising that functionality in different mixed-version states, +// randomizing the order of events in each run. A detailed test plan +// is logged *before* the test runs; it facilitates debugging a test +// failure by summarizing all executed steps. +// +// Typical usage: +// +// mvt, err := NewMixedVersionTest(...) +// mvt.InMixedVersion("test my feature", func(l *logger.Logger, db *gosql.DB) error { +// l.Printf("testing feature X") +// _, err := db.ExecContext(ctx, "SELECT * FROM test") +// return err +// }) +// mvt.InMixedVersion("test another feature", func(l *logger.Logger, db *gosql.DB) error { +// l.Printf("testing feature Y") +// _, err := db.ExecContext(ctx, "SELECT * FROM test2") +// return err +// }) +// mvt.Run() +// +// Functions passed to `InMixedVersion` will be called at arbitrary +// points during an upgrade/downgrade process. They may also be called +// sequentially or concurrently. This information will be included in +// the test plan. Any messages logged during the execution of the hook +// should use the Logger instance passed as a parameter to it. This +// logger writes to a `mixed-version-test/{ID}.log` file, allowing +// easy cross referencing between a test step and its corresponding +// output. +// +// The random seed used to generate a test plan is logged; reusing the +// same seed will lead to the exact same test plan being generated. To +// set a specific seed when running a mixed-version test, one can set +// the `COCKROACH_RANDOM_SEED` environment variable. +package mixedversion + +import ( + "context" + gosql "database/sql" + "fmt" + "math/rand" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/cockroach/pkg/util/version" +) + +const ( + logPrefix = "mixed-version-test" + startupLabel = "run startup hooks" + mixedVersionLabel = "run mixed-version hooks" + afterTestLabel = "run after test hooks" + + // runWhileMigratingProbability is the probability that a + // mixed-version hook will run after all nodes in the cluster have + // been upgraded to the same binary version, and the cluster version + // is allowed to be upgraded; this typically involves the execution + // of migration steps before the new cluster version can be + // finalized. + runWhileMigratingProbability = 0.5 + + // noDBNeeded is an internal sentinel value expected to be returned + // by steps generated by the test plan; it indicates that the step + // requires no connection to a cockroach node.
+ noDBNeeded = -1 +) + +var ( + // possibleDelaysMs lists the possible delays to be added to + // concurrent steps + possibleDelaysMs = []int{ + 0, 50, 100, 200, 500, + } +) + +type ( + // Context wraps the context passed to predicate functions that + // dictate when a mixed-version hook will run during a test + Context struct { + // FromVersion is the string representation of the version the + // nodes are migrating from + FromVersion string + // FromVersionNodes are the nodes that are currently running + // `FromVersion` + FromVersionNodes option.NodeListOption + // ToVersion is the string representation of the version the nodes + // are migrating to + ToVersion string + // ToVersionNodes are the nodes that are currently running + // `ToVersion` + ToVersionNodes option.NodeListOption + // Finalizing indicates whether the cluster version is in the + // process of upgrading (i.e., all nodes in the cluster have been + // upgraded to a certain version, and the migrations are being + // executed). + Finalizing bool + } + + userFunc func(*logger.Logger, *gosql.DB) error + predicateFunc func(Context) bool + + // versionUpgradeHook is a hook that can be called at any time + // during an upgrade/downgrade process. The `predicate` is used + // during test planning to determine when it is called: currently, + // it is always auto-generated, but in the future, this package + // could expose an API that would allow callers to have control of + // when their hooks should run. + versionUpgradeHook struct { + name string + predicate predicateFunc + fn userFunc + } + + // testStep is an opaque reference to one step of a mixed-version + // test. It can be a singleStep (see below), or a "meta-step", + // meaning that it combines other steps in some way (for instance, a + // series of steps to be run sequentially or concurrently). + testStep interface{} + + // singleStep represents steps that implement the pieces on top of + // which a mixed-version test is built. In other words, they are not + // composed of other steps and hence can be directly executed. + singleStep interface { + // ID returns a unique ID associated with the step, making it easy + // to reference test output with the exact step it relates to + ID() int + // DBNode returns the database node that the step connects to + // during its execution. If the step does not require a database + // connection, this function should return the `noDBNeeded` + // constant + DBNode() int + // Description is a string representation of the step, intended + // for human-consumption. Displayed when pretty-printing the test + // plan. + Description() string + // Run implements the actual functionality of the step. + Run(context.Context, *logger.Logger, cluster.Cluster, func(int) *gosql.DB) error + } + + hooks []versionUpgradeHook + + // testHooks groups hooks associated with a mixed-version test in + // its different stages: startup, mixed-version, and after-test. + testHooks struct { + startup hooks + mixedVersion hooks + afterUpgradeFinalized hooks + + prng *rand.Rand + crdbNodes option.NodeListOption + } + + // Test is the main struct callers of this package interact with.
+ Test struct { + ctx context.Context + cluster cluster.Cluster + logger *logger.Logger + crdbNodes option.NodeListOption + + rt test.Test + prng *rand.Rand + seed int64 + hooks *testHooks + + // test-only field, allowing us to avoid passing a test.Test + // implementation in the tests + _buildVersion version.Version + } +) + +// NewTest creates a Test struct that users can use to create and run +// a mixed-version roachtest. +func NewTest( + ctx context.Context, + t test.Test, + l *logger.Logger, + c cluster.Cluster, + crdbNodes option.NodeListOption, +) *Test { + testLogger, err := prefixedLogger(l, logPrefix) + if err != nil { + t.Fatal(err) + } + + prng, seed := randutil.NewPseudoRand() + testLogger.Printf("random seed: %d", seed) + + return &Test{ + ctx: ctx, + cluster: c, + logger: testLogger, + crdbNodes: crdbNodes, + rt: t, + prng: prng, + seed: seed, + hooks: &testHooks{prng: prng, crdbNodes: crdbNodes}, + } +} + +// InMixedVersion adds a new mixed-version hook to the test. The +// functionality in the function passed as argument to this function +// will be tested in arbitrary mixed-version states. If multiple +// InMixedVersion hooks are passed, they will be executed +// concurrently. +func (t *Test) InMixedVersion(desc string, fn userFunc) { + lastFromVersion := "invalid-version" + var numUpgradedNodes int + predicate := func(testContext Context) bool { + // If the cluster is finalizing an upgrade, run this hook + // according to the probability defined in the package. + if testContext.Finalizing { + return t.prng.Float64() < runWhileMigratingProbability + } + + // This check makes sure we only schedule a mixed-version hook + // once while upgrading from one version to another. The number of + // nodes we wait to be running the new version is determined when + // the version changes for the first time. + if testContext.FromVersion != lastFromVersion { + lastFromVersion = testContext.FromVersion + numUpgradedNodes = t.prng.Intn(len(t.crdbNodes)) + 1 + } + + return len(testContext.ToVersionNodes) == numUpgradedNodes + } + + t.hooks.AddMixedVersion(versionUpgradeHook{desc, predicate, fn}) +} + +// OnStartup registers a callback that is run once the cluster is +// initialized (i.e., all nodes in the cluster start running CRDB at a +// certain previous version, potentially from existing fixtures). If +// multiple OnStartup hooks are passed, they will be executed +// concurrently. +func (t *Test) OnStartup(desc string, fn userFunc) { + // Since the callbacks here are only referenced in the setup steps + // of the planner, there is no need to have a predicate function + // gating them. + t.hooks.AddStartup(versionUpgradeHook{desc, nil, fn}) +} + +// AfterUpgradeFinalized registers a callback that is run once the +// mixed-version test has brought the cluster to the latest version, +// and allowed the upgrade to finalize successfully. If multiple such +// hooks are passed, they will be executed concurrently. +func (t *Test) AfterUpgradeFinalized(desc string, fn userFunc) { + t.hooks.AddAfterUpgradeFinalized(versionUpgradeHook{desc, nil, fn}) +} + +// Run runs the mixed-version test. It should be called once all +// startup, mixed-version, and after-test hooks have been declared. A +// test plan will be generated (and logged), and the test will be +// carried out. 
+func (t *Test) Run() { + plan, err := t.plan() + if err != nil { + t.rt.Fatal(fmt.Errorf("error creating test plan: %w", err)) + } + + t.logger.Printf(plan.PrettyPrint()) + + if err := t.run(plan); err != nil { + t.rt.Fatal(err) + } +} + +func (t *Test) run(plan *TestPlan) error { + return newTestRunner( + t.ctx, plan, t.logger, t.cluster, t.crdbNodes, t.seed, + ).run() +} + +func (t *Test) plan() (*TestPlan, error) { + previousRelease, err := clusterupgrade.PredecessorVersion(t.buildVersion()) + if err != nil { + return nil, err + } + + planner := testPlanner{ + initialVersion: previousRelease, + rt: t.rt, + crdbNodes: t.crdbNodes, + hooks: t.hooks, + prng: t.prng, + } + + return planner.Plan(), nil +} + +func (t *Test) buildVersion() version.Version { + if t._buildVersion != (version.Version{}) { + return t._buildVersion // test-only + } + + return *t.rt.BuildVersion() +} + +// startFromCheckpointStep is the step that starts the cluster from a +// specific `version`, using checked-in fixtures. +type startFromCheckpointStep struct { + id int + rt test.Test + version string + crdbNodes option.NodeListOption +} + +func (s startFromCheckpointStep) ID() int { return s.id } +func (s startFromCheckpointStep) DBNode() int { return noDBNeeded } + +func (s startFromCheckpointStep) Description() string { + return fmt.Sprintf("starting cluster from fixtures for version %q", s.version) +} + +// Run will copy the fixtures to all database nodes in the cluster, +// upload the binary associated with that given version, and finally +// start the cockroach binary on these nodes. +func (s startFromCheckpointStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + if err := clusterupgrade.InstallFixtures(ctx, l, c, s.crdbNodes, s.version); err != nil { + return err + } + + binaryPath, err := clusterupgrade.UploadVersion(ctx, s.rt, l, c, s.crdbNodes, s.version) + if err != nil { + return err + } + + startOpts := option.DefaultStartOpts() + startOpts.RoachprodOpts.Sequential = false + clusterupgrade.StartWithBinary(ctx, l, c, s.crdbNodes, binaryPath, startOpts) + return nil +} + +// waitForStableClusterVersionStep implements the process of waiting +// for the `version` cluster setting being the same on all nodes of +// the cluster and equal to the binary version of the first node in +// the `nodes` field. +type waitForStableClusterVersionStep struct { + id int + nodes option.NodeListOption +} + +func (s waitForStableClusterVersionStep) ID() int { return s.id } +func (s waitForStableClusterVersionStep) DBNode() int { return noDBNeeded } + +func (s waitForStableClusterVersionStep) Description() string { + return fmt.Sprintf( + "wait for nodes %v to all have the same cluster version (same as binary version of node %d)", + s.nodes, s.nodes[0], + ) +} + +func (s waitForStableClusterVersionStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + return clusterupgrade.WaitForClusterUpgrade(ctx, l, s.nodes, dbFunc) +} + +// preserveDowngradeOptionStep sets the `preserve_downgrade_option` +// cluster setting to the binary version running in `node`. 
+type preserveDowngradeOptionStep struct { + id int + node int +} + +func (s preserveDowngradeOptionStep) ID() int { return s.id } +func (s preserveDowngradeOptionStep) DBNode() int { return s.node } + +func (s preserveDowngradeOptionStep) Description() string { + return "preventing auto-upgrades by setting `preserve_downgrade_option`" +} + +func (s preserveDowngradeOptionStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + db := dbFunc(s.node) + + l.Printf("checking binary version on node %d", s.node) + bv, err := clusterupgrade.BinaryVersion(db) + if err != nil { + return err + } + + downgradeOption := bv.String() + l.Printf("setting `preserve_downgrade_option` to %s", downgradeOption) + _, err = db.ExecContext(ctx, "SET CLUSTER SETTING cluster.preserve_downgrade_option = $1", downgradeOption) + return err +} + +// restartWithNewBinaryStep restarts a certain `node` with a new +// cockroach binary. Any existing `cockroach` process will be stopped, +// then the new binary will be uploaded and the `cockroach` process +// will restart using the new binary. +type restartWithNewBinaryStep struct { + id int + version string + rt test.Test + node int +} + +func (s restartWithNewBinaryStep) ID() int { return s.id } +func (s restartWithNewBinaryStep) DBNode() int { return noDBNeeded } + +func (s restartWithNewBinaryStep) Description() string { + return fmt.Sprintf("restart node %d with binary version %s", s.node, versionMsg(s.version)) +} + +func (s restartWithNewBinaryStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + return clusterupgrade.RestartNodesWithNewBinary( + ctx, + s.rt, + l, + c, + c.Node(s.node), + option.DefaultStartOpts(), + s.version, + ) +} + +// finalizeUpgradeStep resets the `preserve_downgrade_option` cluster +// setting, allowing the upgrade migrations to run and the cluster +// version to eventually reach the binary version on the nodes. +type finalizeUpgradeStep struct { + id int + node int +} + +func (s finalizeUpgradeStep) ID() int { return s.id } +func (s finalizeUpgradeStep) DBNode() int { return s.node } + +func (s finalizeUpgradeStep) Description() string { + return "finalize upgrade by resetting `preserve_downgrade_option`" +} + +func (s finalizeUpgradeStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + db := dbFunc(s.node) + l.Printf("resetting preserve_downgrade_option") + _, err := db.ExecContext(ctx, "RESET CLUSTER SETTING cluster.preserve_downgrade_option") + return err +} + +// runHookStep is a step used to run a user-provided hook (i.e., +// callbacks passed to `OnStartup`, `InMixedVersion`, or `AfterTest`). +type runHookStep struct { + id int + node int + hook versionUpgradeHook +} + +func (s runHookStep) ID() int { return s.id } +func (s runHookStep) DBNode() int { return s.node } + +func (s runHookStep) Description() string { + return fmt.Sprintf("run %q", s.hook.name) +} + +func (s runHookStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + return s.hook.fn(l, dbFunc(s.node)) +} + +// sequentialRunStep is a "meta-step" that indicates that a sequence +// of steps are to be executed sequentially. The default test runner +// already runs steps sequentially. This meta-step exists primarily as +// a way to group related steps so that a test plan is easier to +// understand for a human. 
+type sequentialRunStep struct { + label string + steps []testStep +} + +func (s sequentialRunStep) Description() string { + return s.label +} + +// delayedStep is a thin wrapper around a test step that marks steps +// with a random delay that should be honored before the step starts +// executing. This is useful in steps that are to be executed +// concurrently to avoid biases of certain execution orders. While the +// Go scheduler is non-deterministic, in practice schedules are not +// uniformly distributed, and the delay is intended to expose scenarios +// where a specific ordering of events leads to a failure. +type delayedStep struct { + delay time.Duration + step testStep +} + +// concurrentRunStep is a "meta-step" that groups multiple test steps +// that are meant to be executed concurrently. A random delay is added +// before each step starts; see notes on `delayedStep`. +type concurrentRunStep struct { + label string + delayedSteps []testStep +} + +func newConcurrentRunStep(label string, steps []testStep, rng *rand.Rand) concurrentRunStep { + delayedSteps := make([]testStep, 0, len(steps)) + for _, step := range steps { + delayedSteps = append(delayedSteps, delayedStep{delay: randomDelay(rng), step: step}) + } + + return concurrentRunStep{label: label, delayedSteps: delayedSteps} +} + +func (s concurrentRunStep) Description() string { + return fmt.Sprintf("%s concurrently", s.label) +} + +// prefixedLogger returns a logger instance off of the given `l` +// parameter, and adds a prefix to everything logged by the returned +// logger. +func prefixedLogger(l *logger.Logger, prefix string) (*logger.Logger, error) { + fileName := strings.ReplaceAll(prefix, " ", "-") + formattedPrefix := fmt.Sprintf("[%s] ", fileName) + return l.ChildLogger(fileName, logger.LogPrefix(formattedPrefix)) +} + +func randomNode(rng *rand.Rand, nodes option.NodeListOption) int { + idx := rng.Intn(len(nodes)) + return nodes[idx] +} + +func (h hooks) Filter(testContext Context) hooks { + var result hooks + for _, hook := range h { + if hook.predicate(testContext) { + result = append(result, hook) + } + } + + return result +} + +// AsSteps transforms the sequence of hooks into a corresponding test +// step. If there is only one hook, the corresponding `runHookStep` is +// returned. Otherwise, a `concurrentRunStep` is returned, where every +// hook is run concurrently.
+func (h hooks) AsSteps( + label string, idGen func() int, rng *rand.Rand, nodes option.NodeListOption, +) []testStep { + steps := make([]testStep, 0, len(h)) + for _, hook := range h { + rhs := runHookStep{id: idGen(), node: randomNode(rng, nodes), hook: hook} + steps = append(steps, rhs) + } + + if len(steps) <= 1 { + return steps + } + + return []testStep{newConcurrentRunStep(label, steps, rng)} +} + +func (th *testHooks) AddStartup(hook versionUpgradeHook) { + th.startup = append(th.startup, hook) +} + +func (th *testHooks) AddMixedVersion(hook versionUpgradeHook) { + th.mixedVersion = append(th.mixedVersion, hook) +} + +func (th *testHooks) AddAfterUpgradeFinalized(hook versionUpgradeHook) { + th.afterUpgradeFinalized = append(th.afterUpgradeFinalized, hook) +} + +func (th *testHooks) StartupSteps(idGen func() int) []testStep { + return th.startup.AsSteps(startupLabel, idGen, th.prng, th.crdbNodes) +} + +func (th *testHooks) MixedVersionSteps(testContext Context, idGen func() int) []testStep { + return th.mixedVersion.Filter(testContext).AsSteps(mixedVersionLabel, idGen, th.prng, th.crdbNodes) +} + +func (th *testHooks) AfterUpgradeFinalizedSteps(idGen func() int) []testStep { + return th.afterUpgradeFinalized.AsSteps(afterTestLabel, idGen, th.prng, th.crdbNodes) +} + +func randomDelay(rng *rand.Rand) time.Duration { + idx := rng.Intn(len(possibleDelaysMs)) + return time.Duration(possibleDelaysMs[idx]) * time.Millisecond +} + +func versionMsg(version string) string { + return clusterupgrade.VersionMsg(version) +} diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go new file mode 100644 index 000000000000..7a3c58922d62 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go @@ -0,0 +1,246 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package mixedversion + +import ( + "fmt" + "math/rand" + "strings" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" +) + +type ( + // TestPlan is the output of planning a mixed-version test. The + // initialVersion and finalVersion fields are just for presentation + // purposes, as the plan is defined by the sequence of steps that it + // contains. + TestPlan struct { + initialVersion string + finalVersion string + steps []testStep + } + + // testPlanner wraps the state and the logic involved in generating + // a test plan from the given rng and user-provided hooks. + testPlanner struct { + stepCount int + initialVersion string + crdbNodes option.NodeListOption + rt test.Test + hooks *testHooks + prng *rand.Rand + } +) + +const ( + branchString = "├──" + nestedBranchString = "│ " + lastBranchString = "└──" + lastBranchPadding = " " +) + +// Plan returns the TestPlan generated using the `prng` in the +// testPlanner field. Currently, the test will always follow the +// following high level outline: +// +// - start all nodes in the cluster from the predecessor version, +// using fixtures. +// - set `preserve_downgrade_option`. +// - run startup hooks. 
+// - upgrade all nodes to the current cockroach version (running +// mixed-version hooks at times determined by the planner). +// - downgrade all nodes back to the predecessor version (running +// mixed-version hooks again). +// - upgrade all nodes back to the current cockroach version one +// more time (running mixed-version hooks). +// - finally, reset `preserve_downgrade_option`, allowing the +// cluster to upgrade. Mixed-version hooks may be executed while +// this is happening. +// - run after-test hooks. +// +// TODO(renato): further opportunities for random exploration: going +// back multiple releases instead of just one; picking a patch release +// randomly instead of just the latest release. +func (p *testPlanner) Plan() *TestPlan { + var steps []testStep + addSteps := func(ss []testStep) { steps = append(steps, ss...) } + + addSteps(p.initSteps()) + + // previous -> current + addSteps(p.upgradeSteps(p.initialVersion, clusterupgrade.MainVersion)) + // current -> previous (rollback) + addSteps(p.downgradeSteps(clusterupgrade.MainVersion, p.initialVersion)) + // previous -> current + addSteps(p.upgradeSteps(p.initialVersion, clusterupgrade.MainVersion)) + // finalize + addSteps(p.finalizeUpgradeSteps()) + + addSteps(p.finalSteps()) + return &TestPlan{ + initialVersion: p.initialVersion, + finalVersion: versionMsg(clusterupgrade.MainVersion), + steps: steps, + } +} + +// initSteps returns the sequence of steps that should be executed +// before we start changing binaries on nodes in the process of +// upgrading/downgrading. It will also run any startup hooks the user +// may have provided. +func (p *testPlanner) initSteps() []testStep { + preserveDowngradeNode := randomNode(p.prng, p.crdbNodes) + return append([]testStep{ + startFromCheckpointStep{id: p.nextID(), version: p.initialVersion, rt: p.rt, crdbNodes: p.crdbNodes}, + waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes}, + preserveDowngradeOptionStep{id: p.nextID(), node: preserveDowngradeNode}, + }, p.hooks.StartupSteps(p.nextID)...) +} + +// finalSteps are the steps to be run once the nodes have been +// upgraded/downgraded. It will wait for the cluster version on all +// nodes to be the same and then run any after-finalization hooks the +// user may have provided. +func (p *testPlanner) finalSteps() []testStep { + return append([]testStep{ + waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes}, + }, p.hooks.AfterUpgradeFinalizedSteps(p.nextID)...) +} + +func (p *testPlanner) upgradeSteps(from, to string) []testStep { + msg := fmt.Sprintf("upgrade nodes %v from %q to %q", p.crdbNodes, versionMsg(from), versionMsg(to)) + return p.changeVersionSteps(from, to, msg) +} + +func (p *testPlanner) downgradeSteps(from, to string) []testStep { + msg := fmt.Sprintf("downgrade nodes %v from %q to %q", p.crdbNodes, versionMsg(from), versionMsg(to)) + return p.changeVersionSteps(from, to, msg) +} + +// changeVersionSteps returns the sequence of steps to be performed +// when we are changing the version of cockroach in the nodes of a +// cluster (either upgrading to a new binary, or downgrading to a +// predecessor version). +func (p *testPlanner) changeVersionSteps(from, to, label string) []testStep { + // copy `crdbNodes` here so that shuffling won't mutate that array + oldVersionNodes := append(option.NodeListOption{}, p.crdbNodes...) 
+ newVersionNodes := option.NodeListOption{} + + // change the binary version on the nodes in random order + p.prng.Shuffle(len(oldVersionNodes), func(i, j int) { + oldVersionNodes[i], oldVersionNodes[j] = oldVersionNodes[j], oldVersionNodes[i] + }) + + var steps []testStep + nodeOrder := append(option.NodeListOption{}, oldVersionNodes...) + for _, node := range nodeOrder { + steps = append(steps, restartWithNewBinaryStep{id: p.nextID(), version: to, node: node, rt: p.rt}) + oldVersionNodes = oldVersionNodes[1:] + newVersionNodes = append(newVersionNodes, node) + + testContext := Context{ + FromVersion: from, + FromVersionNodes: oldVersionNodes, + ToVersion: to, + ToVersionNodes: newVersionNodes, + } + steps = append(steps, p.hooks.MixedVersionSteps(testContext, p.nextID)...) + } + + return []testStep{sequentialRunStep{label: label, steps: steps}} +} + +// finalizeUpgradeSteps finalizes the upgrade by resetting the +// `preserve_downgrade_option` and potentially running mixed-version +// hooks while the cluster version is changing. +func (p *testPlanner) finalizeUpgradeSteps() []testStep { + testContext := Context{Finalizing: true} + finalizeNode := randomNode(p.prng, p.crdbNodes) + return append([]testStep{ + finalizeUpgradeStep{id: p.nextID(), node: finalizeNode}, + }, p.hooks.MixedVersionSteps(testContext, p.nextID)...) +} + +func (p *testPlanner) nextID() int { + p.stepCount++ + return p.stepCount +} + +// PrettyPrint displays a tree-like view of the mixed-version test +// plan, useful when debugging mixed-version test failures. Each step +// is assigned an ID, making it easy to correlate the step that +// failed with the point in the test where the failure occurred. See +// test file for an example of the output of this function. +func (plan *TestPlan) PrettyPrint() string { + var out strings.Builder + for i, step := range plan.steps { + plan.prettyPrintStep(&out, step, treeBranchString(i, len(plan.steps))) + } + + return fmt.Sprintf( + "mixed-version test plan for upgrading from %s to %s:\n%s", + plan.initialVersion, plan.finalVersion, out.String(), + ) +} + +func (plan *TestPlan) prettyPrintStep(out *strings.Builder, step testStep, prefix string) { + writeNested := func(label string, steps []testStep) { + out.WriteString(fmt.Sprintf("%s %s\n", prefix, label)) + for i, subStep := range steps { + nestedPrefix := strings.ReplaceAll(prefix, branchString, nestedBranchString) + nestedPrefix = strings.ReplaceAll(nestedPrefix, lastBranchString, lastBranchPadding) + subPrefix := fmt.Sprintf("%s%s", nestedPrefix, treeBranchString(i, len(steps))) + plan.prettyPrintStep(out, subStep, subPrefix) + } + } + + // writeSingle is the function that generates the description for + // a singleStep. It can include extra information, such as whether + // there's a delay associated with the step (in the case of + // concurrent execution), and what database node the step is + // connecting to. + writeSingle := func(rs singleStep, extraContext ...string) { + if node := rs.DBNode(); node != noDBNeeded { + dbinfo := fmt.Sprintf("with connection to node %d", node) + extraContext = append([]string{dbinfo}, extraContext...)
+ } + var extras string + if contextStr := strings.Join(extraContext, ", "); contextStr != "" { + extras = ", " + contextStr + } + out.WriteString(fmt.Sprintf( + "%s %s%s (%d)\n", prefix, rs.Description(), extras, rs.ID(), + )) + } + + switch s := step.(type) { + case sequentialRunStep: + writeNested(s.Description(), s.steps) + case concurrentRunStep: + writeNested(s.Description(), s.delayedSteps) + case delayedStep: + delayStr := fmt.Sprintf("after %s delay", s.delay) + writeSingle(s.step.(singleStep), delayStr) + default: + writeSingle(s.(singleStep)) + } +} + +func treeBranchString(idx, sliceLen int) string { + if idx == sliceLen-1 { + return lastBranchString + } + + return branchString +} diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go new file mode 100644 index 000000000000..b7d560b8c2ba --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go @@ -0,0 +1,189 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package mixedversion + +import ( + "context" + gosql "database/sql" + "fmt" + "io" + "math/rand" + "testing" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/version" + "github.com/stretchr/testify/require" +) + +var ( + nilLogger = func() *logger.Logger { + cfg := logger.Config{ + Stdout: io.Discard, + Stderr: io.Discard, + } + l, err := cfg.NewLogger("" /* path */) + if err != nil { + panic(err) + } + + return l + }() + + ctx = context.Background() + nodes = option.NodeListOption{1, 2, 3, 4} + + // Hardcode build and previous versions so that the test won't fail + // when new versions are released. + buildVersion = func() version.Version { + bv, err := version.Parse("v23.1.0") + if err != nil { + panic(err) + } + return *bv + }() + previousVersion = func() string { + pv, err := clusterupgrade.PredecessorVersion(buildVersion) + if err != nil { + panic(err) + } + return pv + }() +) + +const ( + seed = 12345 // expectations are based on this seed + mainVersion = clusterupgrade.MainVersion +) + +func TestTestPlanner(t *testing.T) { + mvt := newTest(t) + mvt.InMixedVersion("mixed-version 1", dummyHook) + mvt.InMixedVersion("mixed-version 2", dummyHook) + + plan, err := mvt.plan() + require.NoError(t, err) + require.Len(t, plan.steps, 9) + + // Assert on the pretty-printed version of the test plan as that + // asserts the ordering of the steps we want to take, and as a bonus + // tests the printing function itself. 
+ expectedPrettyPlan := fmt.Sprintf(` +mixed-version test plan for upgrading from %[1]s to : +├── starting cluster from fixtures for version "%[1]s" (1) +├── wait for nodes :1-4 to all have the same cluster version (same as binary version of node 1) (2) +├── preventing auto-upgrades by setting `+"`preserve_downgrade_option`"+`, with connection to node 4 (3) +├── upgrade nodes :1-4 from "%[1]s" to "" +│ ├── restart node 2 with binary version (4) +│ ├── restart node 1 with binary version (5) +│ ├── run "mixed-version 1", with connection to node 3 (6) +│ ├── restart node 4 with binary version (7) +│ ├── restart node 3 with binary version (8) +│ └── run "mixed-version 2", with connection to node 3 (9) +├── downgrade nodes :1-4 from "" to "%[1]s" +│ ├── restart node 3 with binary version %[1]s (10) +│ ├── restart node 4 with binary version %[1]s (11) +│ ├── run mixed-version hooks concurrently +│ │ ├── run "mixed-version 1", with connection to node 1, after 200ms delay (12) +│ │ └── run "mixed-version 2", with connection to node 1, after 200ms delay (13) +│ ├── restart node 2 with binary version %[1]s (14) +│ └── restart node 1 with binary version %[1]s (15) +├── upgrade nodes :1-4 from "%[1]s" to "" +│ ├── restart node 3 with binary version (16) +│ ├── run "mixed-version 1", with connection to node 1 (17) +│ ├── restart node 4 with binary version (18) +│ ├── restart node 1 with binary version (19) +│ ├── restart node 2 with binary version (20) +│ └── run "mixed-version 2", with connection to node 2 (21) +├── finalize upgrade by resetting `+"`preserve_downgrade_option`"+`, with connection to node 3 (22) +├── run "mixed-version 2", with connection to node 1 (23) +└── wait for nodes :1-4 to all have the same cluster version (same as binary version of node 1) (24) +`, previousVersion, + ) + + expectedPrettyPlan = expectedPrettyPlan[1:] // remove leading newline + require.Equal(t, expectedPrettyPlan, plan.PrettyPrint()) + + // Assert that startup hooks are scheduled to run before any + // upgrades, i.e., after cluster is initialized (step 1), and after + // we wait for the cluster version to match on all nodes (step 2). + mvt = newTest(t) + mvt.OnStartup("startup 1", dummyHook) + mvt.OnStartup("startup 2", dummyHook) + plan, err = mvt.plan() + require.NoError(t, err) + requireConcurrentHooks(t, plan.steps[3], "startup 1", "startup 2") + + // Assert that AfterUpgradeFinalized hooks are scheduled to run in + // the last step of the test. + mvt = newTest(t) + mvt.AfterUpgradeFinalized("finalizer 1", dummyHook) + mvt.AfterUpgradeFinalized("finalizer 2", dummyHook) + mvt.AfterUpgradeFinalized("finalizer 3", dummyHook) + plan, err = mvt.plan() + require.NoError(t, err) + require.Len(t, plan.steps, 9) + requireConcurrentHooks(t, plan.steps[8], "finalizer 1", "finalizer 2", "finalizer 3") +} + +// TestDeterministicTestPlan tests that generating a test plan with +// the same seed multiple times yields the same plan every time. 
+func TestDeterministicTestPlan(t *testing.T) { + makePlan := func() *TestPlan { + mvt := newTest(t) + mvt.InMixedVersion("mixed-version 1", dummyHook) + mvt.InMixedVersion("mixed-version 2", dummyHook) + + plan, err := mvt.plan() + require.NoError(t, err) + return plan + } + + expectedPlan := makePlan() + const numRuns = 50 + for j := 0; j < numRuns; j++ { + require.Equal(t, expectedPlan.PrettyPrint(), makePlan().PrettyPrint(), "j = %d", j) + } +} + +func newTest(t *testing.T) *Test { + prng := rand.New(rand.NewSource(seed)) + return &Test{ + ctx: ctx, + logger: nilLogger, + crdbNodes: nodes, + _buildVersion: buildVersion, + prng: prng, + hooks: &testHooks{prng: prng, crdbNodes: nodes}, + } +} + +// requireConcurrentHooks asserts that the given step is a concurrent +// run of multiple user-provided hooks with the names passed as +// parameter. +func requireConcurrentHooks(t *testing.T, step testStep, names ...string) { + require.IsType(t, concurrentRunStep{}, step) + crs := step.(concurrentRunStep) + require.Len(t, crs.delayedSteps, len(names)) + + for j, concurrentStep := range crs.delayedSteps { + require.IsType(t, delayedStep{}, concurrentStep) + ds := concurrentStep.(delayedStep) + require.IsType(t, runHookStep{}, ds.step) + rhs := ds.step.(runHookStep) + require.Equal(t, names[j], rhs.hook.name, "j = %d", j) + } +} + +func dummyHook(*logger.Logger, *gosql.DB) error { + return nil +} diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go new file mode 100644 index 000000000000..07422d12195c --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go @@ -0,0 +1,282 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package mixedversion + +import ( + "context" + gosql "database/sql" + "fmt" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "golang.org/x/sync/errgroup" +) + +type ( + testRunner struct { + ctx context.Context + plan *TestPlan + cluster cluster.Cluster + crdbNodes option.NodeListOption + seed int64 + logger *logger.Logger + + binaryVersions []roachpb.Version + clusterVersions []roachpb.Version + + connCache []*gosql.DB + } +) + +var ( + // everything that is not an alphanum or a few special characters + invalidChars = regexp.MustCompile(`[^a-zA-Z0-9 \-_\.]`) +) + +func newTestRunner( + ctx context.Context, + plan *TestPlan, + l *logger.Logger, + c cluster.Cluster, + crdbNodes option.NodeListOption, + randomSeed int64, +) *testRunner { + return &testRunner{ + ctx: ctx, + plan: plan, + logger: l, + cluster: c, + crdbNodes: crdbNodes, + seed: randomSeed, + } +} + +// run implements the test running logic, which boils down to running +// each step in sequence. 
+func (tr *testRunner) run() error { + defer tr.closeConnections() + + for _, step := range tr.plan.steps { + if err := tr.runStep(step); err != nil { + return err + } + } + + return nil +} + +// runStep contains the logic of running a single test step, called +// recursively in the case of sequentialRunStep and concurrentRunStep. +func (tr *testRunner) runStep(step testStep) error { + if ss, ok := step.(singleStep); ok && ss.ID() > 1 { + // if we are running a singleStep that is *not* the first step, + // we can initialize the database connections. This represents the + // assumption that the first step in the test plan is the one that + // sets up binaries and makes the `cockroach` process available on + // the nodes. + // TODO(renato): consider a way to make the test runner crash if + // the assumption does not hold + if err := tr.maybeInitConnections(); err != nil { + return err + } + if err := tr.refreshBinaryVersions(); err != nil { + return err + } + if err := tr.refreshClusterVersions(); err != nil { + return err + } + } + + switch s := step.(type) { + case sequentialRunStep: + for _, ss := range s.steps { + if err := tr.runStep(ss); err != nil { + return err + } + } + return nil + + case concurrentRunStep: + group, _ := errgroup.WithContext(tr.ctx) + for _, cs := range s.delayedSteps { + cs := cs + group.Go(func() error { + return tr.runStep(cs) + }) + } + return group.Wait() + + case delayedStep: + time.Sleep(s.delay) + return tr.runStep(s.step) + + default: + ss := s.(singleStep) + stepLogger, err := tr.loggerFor(ss) + if err != nil { + return err + } + + tr.logStep("STARTING", ss, stepLogger) + start := timeutil.Now() + defer func() { + prefix := fmt.Sprintf("FINISHED [%s]", timeutil.Since(start)) + tr.logStep(prefix, ss, stepLogger) + }() + if err := ss.Run(tr.ctx, stepLogger, tr.cluster, tr.conn); err != nil { + return tr.reportError(err, ss, stepLogger) + } + + return nil + } +} + +// reportError augments the error passed with extra +// information. Specifically, the error message will include the ID of +// the step that failed, the random seed used, the binary version on +// each node when the error occurred, and the cluster version before +// and after the step (in case the failure happened *while* the +// cluster version was updating).
+func (tr *testRunner) reportError(err error, step singleStep, l *logger.Logger) error { + errMsg := fmt.Sprintf("mixed-version test failure while running step %d (%s): %s", + step.ID(), step.Description(), err, + ) + debugInfo := func(label, value string) string { + return fmt.Sprintf("%-40s%s", label+":", value) + } + seedInfo := debugInfo("test random seed", strconv.FormatInt(tr.seed, 10)) + binaryVersions := debugInfo("binary versions", formatVersions(tr.binaryVersions)) + clusterVersionsBefore := debugInfo("cluster versions before failure", formatVersions(tr.clusterVersions)) + var clusterVersionsAfter string + if err := tr.refreshClusterVersions(); err == nil { + clusterVersionsBefore += "\n" + clusterVersionsAfter = debugInfo("cluster versions after failure", formatVersions(tr.clusterVersions)) + } else { + tr.logger.Printf("failed to fetch cluster versions after failure: %s", err) + } + + if err := renameFailedLogger(l); err != nil { + tr.logger.Printf("could not rename failed step logger: %v", err) + } + + return fmt.Errorf( + "%s\n%s\n%s\n%s%s", + errMsg, seedInfo, binaryVersions, clusterVersionsBefore, clusterVersionsAfter, + ) +} + +func (tr *testRunner) logStep(prefix string, step singleStep, l *logger.Logger) { + dashes := strings.Repeat("-", 10) + l.Printf("%[1]s %s (%d): %s %[1]s", dashes, prefix, step.ID(), step.Description()) +} + +// loggerFor creates a logger instance to be used by a test step. Logs +// will be available under `mixed-version-test/{ID}.log`, making it +// easy to go from the IDs displayed in the test plan to the +// corresponding output of that step. +func (tr *testRunner) loggerFor(step singleStep) (*logger.Logger, error) { + name := invalidChars.ReplaceAllString(strings.ToLower(step.Description()), "") + name = fmt.Sprintf("%d_%s", step.ID(), name) + + prefix := fmt.Sprintf("%s/%s", logPrefix, name) + return prefixedLogger(tr.logger, prefix) +} + +// refreshBinaryVersions updates the internal `binaryVersions` field +// with the binary version running on each node of the cluster. +func (tr *testRunner) refreshBinaryVersions() error { + tr.binaryVersions = make([]roachpb.Version, 0, len(tr.crdbNodes)) + for _, node := range tr.crdbNodes { + bv, err := clusterupgrade.BinaryVersion(tr.conn(node)) + if err != nil { + return fmt.Errorf("failed to get binary version for node %d: %w", node, err) + } + tr.binaryVersions = append(tr.binaryVersions, bv) + } + + return nil +} + +// refreshClusterVersions updates the internal `clusterVersions` field +// with the current view of the cluster version in each of the nodes +// of the cluster. +func (tr *testRunner) refreshClusterVersions() error { + tr.clusterVersions = make([]roachpb.Version, 0, len(tr.crdbNodes)) + for _, node := range tr.crdbNodes { + cv, err := clusterupgrade.ClusterVersion(tr.ctx, tr.conn(node)) + if err != nil { + return fmt.Errorf("failed to get cluster version for node %d: %w", node, err) + } + tr.clusterVersions = append(tr.clusterVersions, cv) + } + + return nil +} + +// maybeInitConnections initializes connections if the connection cache +// is empty.
+func (tr *testRunner) maybeInitConnections() error { + if tr.connCache != nil { + return nil + } + + tr.connCache = make([]*gosql.DB, len(tr.crdbNodes)) + for _, node := range tr.crdbNodes { + conn, err := tr.cluster.ConnE(tr.ctx, tr.logger, node) + if err != nil { + return fmt.Errorf("failed to connect to node %d: %w", node, err) + } + + tr.connCache[node-1] = conn + } + + return nil +} + +// conn returns a database connection to the given node. Assumes the +// connection cache has been previously initialized. +func (tr *testRunner) conn(node int) *gosql.DB { + return tr.connCache[node-1] +} + +func (tr *testRunner) closeConnections() { + for _, db := range tr.connCache { + if db != nil { + _ = db.Close() + } + } +} + +func renameFailedLogger(l *logger.Logger) error { + currentFileName := l.File.Name() + newLogName := strings.TrimSuffix(currentFileName, filepath.Ext(currentFileName)) + newLogName += "_FAILED.log" + return os.Rename(currentFileName, newLogName) +} + +func formatVersions(versions []roachpb.Version) string { + var pairs []string + for idx, version := range versions { + pairs = append(pairs, fmt.Sprintf("%d: %s", idx+1, version)) + } + + return fmt.Sprintf("[%s]", strings.Join(pairs, ", ")) +} diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 641dd53f6636..63a31c139f65 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -106,7 +106,7 @@ type testImpl struct { // Map from version to path to the cockroach binary to be used when // mixed-version test wants a binary for that binary. If a particular version // is found in this map, it is used instead of the binary coming from - // `roachprod stage release `. See the --version-binary-override flags. + // `roachprod stage release `. See the --versions-binary-override flags. // // Version strings look like "20.1.4". 
versionsBinaryOverride map[string]string diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index 0b872c7a5119..7823ff521582 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -110,7 +110,6 @@ go_library( "pgx.go", "pgx_blocklist.go", "pop.go", - "predecessor_version.go", "psycopg.go", "psycopg_blocklist.go", "query_comparison_util.go", @@ -165,7 +164,6 @@ go_library( "versionupgrade.go", "ycsb.go", ], - embedsrcs = ["predecessor_version.json"], importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests", visibility = ["//visibility:public"], deps = [ @@ -184,6 +182,8 @@ go_library( "//pkg/cmd/roachtest/option", "//pkg/cmd/roachtest/registry", "//pkg/cmd/roachtest/roachtestutil", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade", + "//pkg/cmd/roachtest/roachtestutil/mixedversion", "//pkg/cmd/roachtest/spec", "//pkg/cmd/roachtest/test", "//pkg/gossip", diff --git a/pkg/cmd/roachtest/tests/autoupgrade.go b/pkg/cmd/roachtest/tests/autoupgrade.go index 1e7b191c3a7e..7e5d0aa3f52f 100644 --- a/pkg/cmd/roachtest/tests/autoupgrade.go +++ b/pkg/cmd/roachtest/tests/autoupgrade.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -259,7 +260,7 @@ func registerAutoUpgrade(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - pred, err := PredecessorVersion(*t.BuildVersion()) + pred, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/backup.go b/pkg/cmd/roachtest/tests/backup.go index 787510d2b751..3863a37f7cd8 100644 --- a/pkg/cmd/roachtest/tests/backup.go +++ b/pkg/cmd/roachtest/tests/backup.go @@ -31,6 +31,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/jobs" @@ -448,13 +449,10 @@ func registerBackupMixedVersion(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" roachNodes := c.All() upgradedNodes := c.Nodes(1, 2) oldNodes := c.Nodes(3, 4) - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) c.Put(ctx, t.DeprecatedWorkload(), "./workload") @@ -469,7 +467,7 @@ func registerBackupMixedVersion(r registry.Registry) { setShortJobIntervalsStep(1), loadBackupDataStep(c), // Upgrade some nodes. 
- binaryUpgradeStep(upgradedNodes, mainVersion), + binaryUpgradeStep(upgradedNodes, clusterupgrade.MainVersion), // Let us first test planning and executing a backup on different node // versions. @@ -561,7 +559,7 @@ func registerBackupMixedVersion(r registry.Registry) { enableJobAdoptionStep(c, upgradedNodes), // Allow all the nodes to now finalize their cluster version. - binaryUpgradeStep(oldNodes, mainVersion), + binaryUpgradeStep(oldNodes, clusterupgrade.MainVersion), allowAutoUpgradeStep(1), waitForUpgradeStep(roachNodes), diff --git a/pkg/cmd/roachtest/tests/decommission_self.go b/pkg/cmd/roachtest/tests/decommission_self.go index db3e8c81985d..db39d031a3b6 100644 --- a/pkg/cmd/roachtest/tests/decommission_self.go +++ b/pkg/cmd/roachtest/tests/decommission_self.go @@ -14,6 +14,7 @@ import ( "context" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" ) @@ -21,15 +22,11 @@ import ( // // See https://github.com/cockroachdb/cockroach/issues/56718 func runDecommissionSelf(ctx context.Context, t test.Test, c cluster.Cluster) { - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" - allNodes := c.All() u := newVersionUpgradeTest(c, - uploadVersionStep(allNodes, mainVersion), - startVersion(allNodes, mainVersion), - fullyDecommissionStep(2, 2, mainVersion), + uploadVersionStep(allNodes, clusterupgrade.MainVersion), + startVersion(allNodes, clusterupgrade.MainVersion), + fullyDecommissionStep(2, 2, clusterupgrade.MainVersion), func(ctx context.Context, t test.Test, u *versionUpgradeTest) { // Stop n2 and exclude it from post-test consistency checks, // as this node can't contact cluster any more and operations diff --git a/pkg/cmd/roachtest/tests/follower_reads.go b/pkg/cmd/roachtest/tests/follower_reads.go index 9c714ea8037f..51f58212580f 100644 --- a/pkg/cmd/roachtest/tests/follower_reads.go +++ b/pkg/cmd/roachtest/tests/follower_reads.go @@ -27,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -866,11 +867,8 @@ func parsePrometheusMetric(s string) (*prometheusMetric, bool) { func runFollowerReadsMixedVersionSingleRegionTest( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) require.NoError(t, err) - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const curVersion = "" // Start the cluster at the old version. 
settings := install.MakeClusterSettings() @@ -884,7 +882,7 @@ func runFollowerReadsMixedVersionSingleRegionTest( randNode := 1 + rand.Intn(c.Spec().NodeCount) t.L().Printf("upgrading n%d to current version", randNode) nodeToUpgrade := c.Node(randNode) - upgradeNodes(ctx, nodeToUpgrade, startOpts, curVersion, t, c) + upgradeNodes(ctx, t, c, nodeToUpgrade, startOpts, clusterupgrade.MainVersion) runFollowerReadsTest(ctx, t, c, topologySpec{multiRegion: false}, exactStaleness, data) // Upgrade the remaining nodes to the new version and run the test. @@ -896,6 +894,6 @@ func runFollowerReadsMixedVersionSingleRegionTest( remainingNodes = remainingNodes.Merge(c.Node(i + 1)) } t.L().Printf("upgrading nodes %s to current version", remainingNodes) - upgradeNodes(ctx, remainingNodes, startOpts, curVersion, t, c) + upgradeNodes(ctx, t, c, remainingNodes, startOpts, clusterupgrade.MainVersion) runFollowerReadsTest(ctx, t, c, topologySpec{multiRegion: false}, exactStaleness, data) } diff --git a/pkg/cmd/roachtest/tests/import.go b/pkg/cmd/roachtest/tests/import.go index de84a0f71f52..d5634bdcdccf 100644 --- a/pkg/cmd/roachtest/tests/import.go +++ b/pkg/cmd/roachtest/tests/import.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -325,9 +326,6 @@ func successfulImportStep(warehouses, nodeID int) versionStep { func runImportMixedVersion( ctx context.Context, t test.Test, c cluster.Cluster, warehouses int, predecessorVersion string, ) { - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" roachNodes := c.All() t.Status("starting csv servers") @@ -338,8 +336,8 @@ func runImportMixedVersion( preventAutoUpgradeStep(1), // Upgrade some of the nodes. - binaryUpgradeStep(c.Node(1), mainVersion), - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), successfulImportStep(warehouses, 1 /* nodeID */), ) @@ -356,7 +354,7 @@ func registerImportMixedVersion(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") } - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/mixed_version_cdc.go b/pkg/cmd/roachtest/tests/mixed_version_cdc.go index b00fdc7a6217..2586b447c101 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_cdc.go +++ b/pkg/cmd/roachtest/tests/mixed_version_cdc.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/randutil" @@ -353,7 +354,7 @@ func (cmvt *cdcMixedVersionTester) createChangeFeed(node int) versionStep { func runCDCMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } @@ -372,9 +373,6 @@ func runCDCMixedVersions( return tester.crdbNodes[rng.Intn(len(tester.crdbNodes))] } - // An empty string will lead to the cockroach binary specified by flag - // `cockroach` to be used. - const mainVersion = "" newVersionUpgradeTest(c, uploadAndStartFromCheckpointFixture(tester.crdbNodes, predecessorVersion), tester.setupVerifier(sqlNode()), @@ -388,7 +386,7 @@ func runCDCMixedVersions( tester.waitForResolvedTimestamps(), // Roll the nodes into the new version one by one in random order - tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, mainVersion)), + tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, clusterupgrade.MainVersion)), // let the workload run in the new version for a while tester.waitForResolvedTimestamps(), @@ -402,7 +400,7 @@ func runCDCMixedVersions( tester.assertValid(), // Roll nodes forward and finalize upgrade. - tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, mainVersion)), + tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, clusterupgrade.MainVersion)), // allow cluster version to update allowAutoUpgradeStep(sqlNode()), diff --git a/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go b/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go index 9061cdbc563a..2dd730c32bf2 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go +++ b/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/randutil" @@ -54,9 +55,7 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C return rng.Intn(nodeCount) + 1 } - // An empty string uses the cockroach binary specified by `--cockroach`. 
- const mainVersion = "" - preVersion, err := PredecessorVersion(*t.BuildVersion()) + preVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) // scanTableStep runs a count(*) scan across a table, asserting the row count. @@ -215,7 +214,7 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C u := newVersionUpgradeTest(c, // Start the cluster with preVersion and wait for it to bootstrap, then - // disable auto-upgrades to mainVersion. The checkpoint fixture is not + // disable auto-upgrades to MainVersion. The checkpoint fixture is not // necessary in this test, but we pull it in to get better test coverage of // historical cluster state. uploadAndStartFromCheckpointFixture(c.All(), preVersion), @@ -231,8 +230,8 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C require.NoError(t, WaitFor3XReplication(ctx, t, conn)) }, - // Upgrade n1,n2 to mainVersion, leave n3,n4 at preVersion. - binaryUpgradeStep(c.Nodes(1, 2), mainVersion), + // Upgrade n1,n2 to MainVersion, leave n3,n4 at preVersion. + binaryUpgradeStep(c.Nodes(1, 2), clusterupgrade.MainVersion), // Scatter the table's single range, to randomize replica/lease // placement -- in particular, who's responsible for splits. @@ -280,9 +279,9 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C scatterTableStep("test"), scanTableStep("test", 0), - // Upgrade n3,n4 (the remaining nodes) to mainVersion, verify that we can + // Upgrade n3,n4 (the remaining nodes) to MainVersion, verify that we can // run a table scan, move ranges around, scatter, and scan again. - binaryUpgradeStep(c.Nodes(3, 4), mainVersion), + binaryUpgradeStep(c.Nodes(3, 4), clusterupgrade.MainVersion), scanTableStep("test", 0), changeReplicasRelocateFromNodeStep("test", 1), changeReplicasZoneConfigFromNodeStep("test", 2), @@ -302,10 +301,10 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C scatterTableStep("test"), scanTableStep("test", 0), - // Upgrade all to mainVersion and finalize the upgrade. Verify that we can + // Upgrade all to MainVersion and finalize the upgrade. Verify that we can // run a table scan, shuffle the ranges, scatter them, and scan again. 
allowAutoUpgradeStep(1), - binaryUpgradeStep(c.All(), mainVersion), + binaryUpgradeStep(c.All(), clusterupgrade.MainVersion), waitForUpgradeStep(c.All()), scanTableStep("test", 0), changeReplicasRelocateFromNodeStep("test", 1), diff --git a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go index dab24daa6d2d..983a7f0db8a7 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go +++ b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -112,7 +113,7 @@ func runDeclSchemaChangeCompatMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { versionRegex := regexp.MustCompile(`(\d+\.\d+)`) - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/mixed_version_decommission.go b/pkg/cmd/roachtest/tests/mixed_version_decommission.go index 2a3b2173a5a9..d7c8cf5a2426 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_decommission.go +++ b/pkg/cmd/roachtest/tests/mixed_version_decommission.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -30,7 +31,7 @@ import ( func runDecommissionMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } @@ -42,15 +43,12 @@ func runDecommissionMixedVersions( pinnedUpgrade := h.getRandNode() t.L().Printf("pinned n%d for upgrade", pinnedUpgrade) - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" allNodes := c.All() u := newVersionUpgradeTest(c, // We upload both binaries to each node, to be able to vary the binary // used when issuing `cockroach node` subcommands. uploadVersionStep(allNodes, predecessorVersion), - uploadVersionStep(allNodes, mainVersion), + uploadVersionStep(allNodes, clusterupgrade.MainVersion), startVersion(allNodes, predecessorVersion), waitForUpgradeStep(allNodes), @@ -59,8 +57,8 @@ func runDecommissionMixedVersions( preloadDataStep(pinnedUpgrade), // We upgrade a pinnedUpgrade and one other random node of the cluster to v20.2. - binaryUpgradeStep(c.Node(pinnedUpgrade), mainVersion), - binaryUpgradeStep(c.Node(h.getRandNodeOtherThan(pinnedUpgrade)), mainVersion), + binaryUpgradeStep(c.Node(pinnedUpgrade), clusterupgrade.MainVersion), + binaryUpgradeStep(c.Node(h.getRandNodeOtherThan(pinnedUpgrade)), clusterupgrade.MainVersion), checkAllMembership(pinnedUpgrade, "active"), // Partially decommission a random node from another random node. 
We @@ -80,7 +78,7 @@ func runDecommissionMixedVersions( binaryUpgradeStep(allNodes, predecessorVersion), // Roll all nodes forward, and finalize upgrade. - binaryUpgradeStep(allNodes, mainVersion), + binaryUpgradeStep(allNodes, clusterupgrade.MainVersion), allowAutoUpgradeStep(1), waitForUpgradeStep(allNodes), diff --git a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go index 512bacadd8ab..4cd6165fcbb6 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go +++ b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/stretchr/testify/require" @@ -91,13 +92,10 @@ func registerDeclarativeSchemaChangerJobCompatibilityInMixedVersion(r registry.R if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" allNodes := c.All() upgradedNodes := c.Nodes(1, 2) oldNodes := c.Nodes(3, 4) - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) u := newVersionUpgradeTest(c, @@ -109,7 +107,7 @@ func registerDeclarativeSchemaChangerJobCompatibilityInMixedVersion(r registry.R setShortGCTTLInSystemZoneConfig(c), // Upgrade some nodes. - binaryUpgradeStep(upgradedNodes, mainVersion), + binaryUpgradeStep(upgradedNodes, clusterupgrade.MainVersion), // Job backward compatibility test: // - upgraded nodes: plan schema change and create schema changer jobs diff --git a/pkg/cmd/roachtest/tests/mixed_version_jobs.go b/pkg/cmd/roachtest/tests/mixed_version_jobs.go index 7e5aa0e62d1b..7aaeae58cd6b 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_jobs.go +++ b/pkg/cmd/roachtest/tests/mixed_version_jobs.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" @@ -229,9 +230,6 @@ FROM [SHOW JOBS] WHERE status = $1 OR status = $2`, func runJobsMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, warehouses int, predecessorVersion string, ) { - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" roachNodes := c.All() backgroundTPCC := backgroundJobsTestTPCCImport(t, warehouses) resumeAllJobsAndWaitStep := makeResumeAllJobsAndWaitStep(10 * time.Second) @@ -249,22 +247,22 @@ func runJobsMixedVersions( // Roll the nodes into the new version one by one, while repeatedly pausing // and resuming all jobs. 
- binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), @@ -292,22 +290,22 @@ func runJobsMixedVersions( pauseAllJobsStep(), // Roll nodes forward and finalize upgrade. - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), @@ -337,7 +335,7 @@ func registerJobsMixedVersions(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go index cb7143355147..fd1526a75e09 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go +++ b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -77,14 +78,11 @@ func runSchemaChangeMixedVersions( concurrency int, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } - // An empty string will lead to the cockroach binary specified by flag - // `cockroach` to be used. - const mainVersion = "" schemaChangeStep := runSchemaChangeWorkloadStep(c.All().RandNode()[0], maxOps, concurrency) if buildVersion.Major() < 20 { // Schema change workload is meant to run only on versions 19.2 or higher. @@ -107,13 +105,13 @@ func runSchemaChangeMixedVersions( // Roll the nodes into the new version one by one, while repeatedly running // schema changes. We use an empty string for the version below, which means // use the main ./cockroach binary (i.e. the one being tested in this run). 
- binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), schemaChangeStep, // Roll back again, which ought to be fine because the cluster upgrade was @@ -128,13 +126,13 @@ func runSchemaChangeMixedVersions( schemaChangeStep, // Roll nodes forward and finalize upgrade. - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), schemaChangeStep, allowAutoUpgradeStep(1), diff --git a/pkg/cmd/roachtest/tests/multitenant_upgrade.go b/pkg/cmd/roachtest/tests/multitenant_upgrade.go index 84676befdd2e..c9312ef49d6d 100644 --- a/pkg/cmd/roachtest/tests/multitenant_upgrade.go +++ b/pkg/cmd/roachtest/tests/multitenant_upgrade.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" @@ -68,10 +69,10 @@ func registerMultiTenantUpgrade(r registry.Registry) { // - Tenant14{Binary: Cur, Cluster: Cur}: Create tenant 14 and verify it works. // - Tenant12{Binary: Cur, Cluster: Cur}: Restart tenant 14 and make sure it still works. 
func runMultiTenantUpgrade(ctx context.Context, t test.Test, c cluster.Cluster, v version.Version) { - predecessor, err := PredecessorVersion(v) + predecessor, err := clusterupgrade.PredecessorVersion(v) require.NoError(t, err) - currentBinary := uploadVersion(ctx, t, c, c.All(), "") + currentBinary := uploadVersion(ctx, t, c, c.All(), clusterupgrade.MainVersion) predecessorBinary := uploadVersion(ctx, t, c, c.All(), predecessor) kvNodes := c.Node(1) diff --git a/pkg/cmd/roachtest/tests/rebalance_load.go b/pkg/cmd/roachtest/tests/rebalance_load.go index 4aa59a674455..352e6023e7d0 100644 --- a/pkg/cmd/roachtest/tests/rebalance_load.go +++ b/pkg/cmd/roachtest/tests/rebalance_load.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -63,7 +64,7 @@ func registerRebalanceLoad(r registry.Registry) { "--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5") settings := install.MakeClusterSettings() if mixedVersion { - predecessorVersion, err := PredecessorVersion(*t.BuildVersion()) + predecessorVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) settings.Binary = uploadVersion(ctx, t, c, c.All(), predecessorVersion) // Upgrade some (or all) of the first N-1 CRDB nodes. We ignore the last @@ -72,11 +73,8 @@ func registerRebalanceLoad(r registry.Registry) { lastNodeToUpgrade := rand.Intn(c.Spec().NodeCount-2) + 1 t.L().Printf("upgrading %d nodes to the current cockroach binary", lastNodeToUpgrade) nodesToUpgrade := c.Range(1, lastNodeToUpgrade) - // An empty string means that the cockroach binary specified by the - // `cockroach` flag will be used. - const newVersion = "" c.Start(ctx, t.L(), startOpts, settings, roachNodes) - upgradeNodes(ctx, nodesToUpgrade, startOpts, newVersion, t, c) + upgradeNodes(ctx, t, c, nodesToUpgrade, startOpts, clusterupgrade.MainVersion) } else { c.Put(ctx, t.Cockroach(), "./cockroach", roachNodes) c.Start(ctx, t.L(), startOpts, settings, roachNodes) diff --git a/pkg/cmd/roachtest/tests/secondary_indexes.go b/pkg/cmd/roachtest/tests/secondary_indexes.go index 095d0b21db5d..c5ac67d3db92 100644 --- a/pkg/cmd/roachtest/tests/secondary_indexes.go +++ b/pkg/cmd/roachtest/tests/secondary_indexes.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/stretchr/testify/require" ) @@ -41,9 +42,6 @@ func runIndexUpgrade( } roachNodes := c.All() - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" u := newVersionUpgradeTest(c, uploadAndStart(roachNodes, predecessorVersion), waitForUpgradeStep(roachNodes), @@ -52,7 +50,7 @@ func runIndexUpgrade( createDataStep(), // Upgrade one of the nodes. - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), // Modify index data from that node. 
modifyData(1, @@ -66,8 +64,8 @@ func runIndexUpgrade( verifyTableData(3, firstExpected), // Upgrade the rest of the cluster. - binaryUpgradeStep(c.Node(2), mainVersion), - binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), // Finalize the upgrade. allowAutoUpgradeStep(1), @@ -144,7 +142,7 @@ func registerSecondaryIndexesMultiVersionCluster(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 7c09a28f8731..df0de155be4e 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -405,7 +406,7 @@ func runTPCCMixedHeadroom( bankRows = 1000 } - history, err := PredecessorHistory(*t.BuildVersion(), versionsToUpgrade) + history, err := clusterupgrade.PredecessorHistory(*t.BuildVersion(), versionsToUpgrade) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go index 366d5cfd18fb..7802148035b0 100644 --- a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go +++ b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/pmezard/go-difflib/difflib" @@ -37,8 +38,7 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - const mainVersion = "" - predecessorVersion, err := PredecessorVersion(*t.BuildVersion()) + predecessorVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } @@ -105,7 +105,7 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { u := newVersionUpgradeTest(c, // Start the node with the latest binary version. - uploadAndStart(c.Node(1), mainVersion), + uploadAndStart(c.Node(1), clusterupgrade.MainVersion), // Obtain expected output from the node. obtainSystemSchemaStep(1, &expected), @@ -117,7 +117,7 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { uploadAndStart(c.Node(1), predecessorVersion), // Upgrade the node version. 
- binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), // Wait for the cluster version to also bump up to make sure the migration logic is run. waitForUpgradeStep(c.Node(1)), diff --git a/pkg/cmd/roachtest/tests/version.go b/pkg/cmd/roachtest/tests/version.go index 596bb321df7f..656444b7b78b 100644 --- a/pkg/cmd/roachtest/tests/version.go +++ b/pkg/cmd/roachtest/tests/version.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/util/version" @@ -225,7 +226,7 @@ func registerVersion(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - pred, err := PredecessorVersion(*t.BuildVersion()) + pred, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go index 494a2ec0b7a6..2274bdb971ed 100644 --- a/pkg/cmd/roachtest/tests/versionupgrade.go +++ b/pkg/cmd/roachtest/tests/versionupgrade.go @@ -14,43 +14,39 @@ import ( "context" gosql "database/sql" "fmt" - "math/rand" "path/filepath" "runtime" - "strconv" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" - "github.com/cockroachdb/cockroach/pkg/testutils" - "github.com/cockroachdb/cockroach/pkg/util/retry" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/version" "github.com/stretchr/testify/require" ) -var ( - v201 = roachpb.Version{Major: 20, Minor: 1} - v202 = roachpb.Version{Major: 20, Minor: 2} -) +type versionFeatureTest struct { + name string + statement string +} -// Feature tests that are invoked between each step of the version upgrade test. -// Tests can use u.clusterVersion to determine which version is active at the -// moment. -// -// A gotcha is that these feature tests are also invoked when the cluster is -// in the middle of upgrading -- i.e. a state where the cluster version has -// already been bumped, but not all nodes are aware). This should be considered -// a feature of this test, and feature tests that flake because of it need to -// be fixed. -var versionUpgradeTestFeatures = versionFeatureStep{ - // NB: the next four tests are ancient and supported since v2.0. However, - // in 19.2 -> 20.1 we had a migration that disallowed most DDL in the - // mixed version state, and so for convenience we gate them on v20.1. - stmtFeatureTest("Object Access", v201, ` +// Feature tests that are invoked in mixed-version state during the +// upgrade test. 
A gotcha is that these feature tests are also +// invoked when the cluster is in the middle of upgrading -- i.e. a +// state where the cluster version has already been bumped, but not +// all nodes are aware. This should be considered a feature of this +// test, and feature tests that flake because of it need to be fixed. +var versionUpgradeTestFeatures = []versionFeatureTest{ + // NB: the next four tests are ancient and supported since v2.0. + { + name: "ObjectAccess", + statement: ` -- We should be able to successfully select from objects created in ancient -- versions of CRDB using their FQNs. Prevents bugs such as #43141, where -- databases created before a migration were inaccessible after the @@ -62,60 +58,78 @@ var versionUpgradeTestFeatures = versionFeatureStep{ -- on CRDB v1.0 select * from persistent_db.persistent_table; show tables from persistent_db; -`), - stmtFeatureTest("JSONB", v201, ` +`, + }, + { + name: "JSONB", + statement: ` CREATE DATABASE IF NOT EXISTS test; CREATE TABLE test.t (j JSONB); DROP TABLE test.t; - `), - stmtFeatureTest("Sequences", v201, ` +`, + }, + { + name: "Sequences", + statement: ` CREATE DATABASE IF NOT EXISTS test; CREATE SEQUENCE test.test_sequence; DROP SEQUENCE test.test_sequence; - `), - stmtFeatureTest("Computed Columns", v201, ` +`, + }, + { + name: "Computed Columns", + statement: ` CREATE DATABASE IF NOT EXISTS test; CREATE TABLE test.t (x INT AS (3) STORED); DROP TABLE test.t; - `), - stmtFeatureTest("Split and Merge Ranges", v202, ` -create database if not EXISTS splitmerge; -create table splitmerge.t (k int primary key); -alter table splitmerge.t split at values (1), (2), (3); -alter table splitmerge.t unsplit at values (1), (2), (3); -drop table splitmerge.t; - `), +`, + }, + { + name: "Split and Merge Ranges", + statement: ` +CREATE DATABASE IF NOT EXISTS splitmerge; +CREATE TABLE splitmerge.t (k INT PRIMARY KEY); +ALTER TABLE splitmerge.t SPLIT AT VALUES (1), (2), (3); +ALTER TABLE splitmerge.t UNSPLIT AT VALUES (1), (2), (3); +DROP TABLE splitmerge.t; +`, + }, } func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - predecessorVersion, err := PredecessorVersion(*t.BuildVersion()) - if err != nil { - t.Fatal(err) - } - testFeaturesStep := versionUpgradeTestFeatures.step(c.All()) - schemaChangeStep := runSchemaChangeWorkloadStep(c.All().RandNode()[0], 10 /* maxOps */, 2 /* concurrency */) - // TODO(irfansharif): All schema change instances were commented out while - // of #58489 is being addressed. - _ = schemaChangeStep - backupStep := func(ctx context.Context, t test.Test, u *versionUpgradeTest) { + mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All()) + mvt.InMixedVersion("run backup", func(l *logger.Logger, db *gosql.DB) error { // Verify that backups can be created in various configurations. This is // important to test because changes in system tables might cause backups to // fail in mixed-version clusters.
dest := fmt.Sprintf("nodelocal://0/%d", timeutil.Now().UnixNano()) - _, err := u.conn(ctx, t, 1).ExecContext(ctx, `BACKUP TO $1`, dest) - require.NoError(t, err) - } + l.Printf("writing backup to %s", dest) + _, err := db.ExecContext(ctx, `BACKUP TO $1`, dest) + return err + }) + mvt.InMixedVersion("test features", func(l *logger.Logger, db *gosql.DB) error { + for _, featureTest := range versionUpgradeTestFeatures { + l.Printf("running feature test %q", featureTest.name) + _, err := db.ExecContext(ctx, featureTest.statement) + if err != nil { + l.Printf("%q: ERROR (%s)", featureTest.name, err) + return err + } + l.Printf("%q: OK", featureTest.name) + } - checkPinnedGCTTLStep := func(ctx context.Context, t test.Test, u *versionUpgradeTest) { + return nil + }) + mvt.AfterUpgradeFinalized("check if GC TTL is pinned", func(l *logger.Logger, db *gosql.DB) error { // TODO(irfansharif): This can be removed when the predecessor version // in this test is v23.1, where the default is 4h. This test was only to // make sure that existing clusters that upgrade to 23.1 retained their // existing GC TTL. - t.L().Printf("checking if GC TTL is pinned to 25h") + l.Printf("checking if GC TTL is pinned to 24h") var ttlSeconds int query := ` SELECT @@ -124,86 +138,17 @@ func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { WHERE target = 'RANGE default' LIMIT 1 ` - require.NoError(t, u.conn(ctx, t, 1).QueryRowContext(ctx, query).Scan(&ttlSeconds)) - require.Equal(t, 24*60*60, ttlSeconds) // NB: 24h is what's used in the fixture - } - - // The steps below start a cluster at predecessorVersion (from a fixture), - // then start an upgrade that is rolled back, and finally start and finalize - // the upgrade. Between each step, we run the feature tests defined in - // versionUpgradeTestFeatures. - u := newVersionUpgradeTest(c, - // Start the cluster from a fixture. That fixture's cluster version may - // be at the predecessor version (though in practice it's fully up to - // date, if it was created via the checkpointer above), so add a - // waitForUpgradeStep to make sure we're upgraded all the way before - // moving on. - // - // See the comment on createCheckpoints for details on fixtures. - uploadAndStartFromCheckpointFixture(c.All(), predecessorVersion), - - // lower descriptor lease duration to 1 minute, working around a - // lease leak that can occasionally make this test time out (flake - // rate ~3%). - // - // TODO(renato): remove this call and function definition when - // https://github.com/cockroachdb/cockroach/issues/84382 is - // closed. - lowerLeaseDuration(1), - - uploadAndInitSchemaChangeWorkload(), - waitForUpgradeStep(c.All()), - testFeaturesStep, - - // NB: at this point, cluster and binary version equal predecessorVersion, - // and auto-upgrades are on. + if err := db.QueryRowContext(ctx, query).Scan(&ttlSeconds); err != nil { + return fmt.Errorf("error querying GC TTL: %w", err) + } + expectedTTL := 24 * 60 * 60 // NB: 24h is what's used in the fixture + if ttlSeconds != expectedTTL { + return fmt.Errorf("unexpected GC TTL: actual (%d) != expected (%d)", ttlSeconds, expectedTTL) + } + return nil + }) - // We use an empty string for the version below, which means to use the - // main ./cockroach binary (i.e. the one being tested in this run). 
- // We upgrade into this version more capriciously to ensure better - // coverage by first rolling the cluster into the new version with - // auto-upgrade disabled, then rolling back, and then rolling forward - // and finalizing on the auto-upgrade path. - preventAutoUpgradeStep(1), - // Roll nodes forward. - binaryUpgradeStep(c.Node(1), ""), - testFeaturesStep, - binaryUpgradeStep(c.Range(2, c.Spec().NodeCount), ""), - // Run a quick schemachange workload in between each upgrade. - // The maxOps is 10 to keep the test runtime under 1-2 minutes. - // schemaChangeStep, - backupStep, - // Roll back again. Note that bad things would happen if the cluster had - // ignored our request to not auto-upgrade. The `autoupgrade` roachtest - // exercises this in more detail, so here we just rely on things working - // as they ought to. - binaryUpgradeStep(c.All(), predecessorVersion), - testFeaturesStep, - // schemaChangeStep, - backupStep, - // Roll nodes forward, this time allowing them to upgrade, and waiting - // for it to happen. - binaryUpgradeStep(c.All(), ""), - allowAutoUpgradeStep(1), - testFeaturesStep, - // schemaChangeStep, - backupStep, - waitForUpgradeStep(c.All()), - testFeaturesStep, - // schemaChangeStep, - backupStep, - // Turn tracing on globally to give it a fighting chance at exposing any - // crash-inducing incompatibilities or horrendous memory leaks. (It won't - // catch most memory leaks since this test doesn't run for too long or does - // too much work). Then, run the previous tests again. - enableTracingGloballyStep, - testFeaturesStep, - // schemaChangeStep, - backupStep, - checkPinnedGCTTLStep, - ) - - u.run(ctx, t) + mvt.Run() } func (u *versionUpgradeTest) run(ctx context.Context, t test.Test) { @@ -239,8 +184,6 @@ func newVersionUpgradeTest(c cluster.Cluster, steps ...versionStep) *versionUpgr } } -func checkpointName(binaryVersion string) string { return "checkpoint-v" + binaryVersion } - // Return a cached conn to the given node. Don't call .Close(), the test harness // will do it. func (u *versionUpgradeTest) conn(ctx context.Context, t test.Test, i int) *gosql.DB { @@ -256,104 +199,52 @@ func (u *versionUpgradeTest) conn(ctx context.Context, t test.Test, i int) *gosq return db } -// uploadVersion uploads the specified crdb version to nodes. It returns the -// path of the uploaded binaries on the nodes, suitable to be used with -// `roachdprod start --binary=`. +// uploadVersion is a thin wrapper around +// `clusterupgrade.UploadVersion` that calls t.Fatal if that call +// returns an error func uploadVersion( ctx context.Context, t test.Test, c cluster.Cluster, nodes option.NodeListOption, newVersion string, -) (binaryName string) { - binaryName = "./cockroach" - if newVersion == "" { - if err := c.PutE(ctx, t.L(), t.Cockroach(), binaryName, nodes); err != nil { - t.Fatal(err) - } - } else if binary, ok := t.VersionsBinaryOverride()[newVersion]; ok { - // If an override has been specified for newVersion, use that binary. - t.L().Printf("using binary override for version %s: %s", newVersion, binary) - binaryName = "./cockroach-" + newVersion - if err := c.PutE(ctx, t.L(), binary, binaryName, nodes); err != nil { - t.Fatal(err) - } - } else { - v := "v" + newVersion - dir := v - binaryName = filepath.Join(dir, "cockroach") - // Check if the cockroach binary already exists. 
- if err := c.RunE(ctx, nodes, "test", "-e", binaryName); err != nil { - if err := c.RunE(ctx, nodes, "mkdir", "-p", dir); err != nil { - t.Fatal(err) - } - if err := c.Stage(ctx, t.L(), "release", v, dir, nodes); err != nil { - t.Fatal(err) - } - } - } - return binaryPathFromVersion(newVersion) -} - -// binaryPathFromVersion shows where the binary for the given version -// can be found on roachprod nodes. It's either `./cockroach` or the -// path to which a released binary is staged. -func binaryPathFromVersion(v string) string { - if v == "" { - return "./cockroach" - } - return filepath.Join("v"+v, "cockroach") -} - -func (u *versionUpgradeTest) uploadVersion( - ctx context.Context, t test.Test, nodes option.NodeListOption, newVersion string, ) string { - return uploadVersion(ctx, t, u.c, nodes, newVersion) -} - -// binaryVersion returns the binary running on the (one-indexed) node. -// NB: version means major.minor[-internal]; the patch level isn't returned. For example, a binary -// of version 19.2.4 will return 19.2. -func (u *versionUpgradeTest) binaryVersion( - ctx context.Context, t test.Test, i int, -) roachpb.Version { - db := u.conn(ctx, t, i) - - var sv string - if err := db.QueryRow(`SELECT crdb_internal.node_executable_version();`).Scan(&sv); err != nil { + path, err := clusterupgrade.UploadVersion(ctx, t, t.L(), c, nodes, newVersion) + if err != nil { t.Fatal(err) } - if len(sv) == 0 { - t.Fatal("empty version") - } + return path +} - cv, err := roachpb.ParseVersion(sv) - if err != nil { +// upgradeNodes is a thin wrapper around +// `clusterupgrade.RestartNodesWithNewBinary` that calls t.Fatal if +// that call returns an error. +func upgradeNodes( + ctx context.Context, + t test.Test, + c cluster.Cluster, + nodes option.NodeListOption, + startOpts option.StartOpts, + newVersion string, +) { + if err := clusterupgrade.RestartNodesWithNewBinary( + ctx, t, t.L(), c, nodes, startOpts, newVersion, + ); err != nil { t.Fatal(err) } - return cv } -// binaryVersion returns the cluster version active on the (one-indexed) node. Note that the -// returned value might become stale due to the cluster auto-upgrading in the background plus -// gossip asynchronicity. -// NB: cluster versions are always major.minor[-internal]; there isn't a patch level. -func (u *versionUpgradeTest) clusterVersion( +func (u *versionUpgradeTest) binaryVersion( ctx context.Context, t test.Test, i int, ) roachpb.Version { db := u.conn(ctx, t, i) - - var sv string - if err := db.QueryRowContext(ctx, `SHOW CLUSTER SETTING version`).Scan(&sv); err != nil { - t.Fatal(err) - } - - cv, err := roachpb.ParseVersion(sv) + v, err := clusterupgrade.BinaryVersion(db) if err != nil { t.Fatal(err) } - return cv + + return v } // versionStep is an isolated version migration on a running cluster. @@ -361,41 +252,24 @@ type versionStep func(ctx context.Context, t test.Test, u *versionUpgradeTest) func uploadAndStartFromCheckpointFixture(nodes option.NodeListOption, v string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - u.c.Run(ctx, nodes, "mkdir", "-p", "{store-dir}") - vv := version.MustParse("v" + v) - // The fixtures use cluster version (major.minor) but the input might be - // a patch release.
- name := checkpointName( - roachpb.Version{Major: int32(vv.Major()), Minor: int32(vv.Minor())}.String(), - ) - for _, i := range nodes { - u.c.Put(ctx, - "pkg/cmd/roachtest/fixtures/"+strconv.Itoa(i)+"/"+name+".tgz", - "{store-dir}/fixture.tgz", u.c.Node(i), - ) + if err := clusterupgrade.InstallFixtures(ctx, t.L(), u.c, nodes, v); err != nil { + t.Fatal(err) } - // Extract fixture. Fail if there's already an LSM in the store dir. - u.c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz") - - // Put and start the binary. - binary := u.uploadVersion(ctx, t, nodes, v) - settings := install.MakeClusterSettings(install.BinaryOption(binary)) + binary := uploadVersion(ctx, t, u.c, nodes, v) startOpts := option.DefaultStartOpts() // NB: can't start sequentially since cluster already bootstrapped. startOpts.RoachprodOpts.Sequential = false - u.c.Start(ctx, t.L(), startOpts, settings, nodes) + clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, startOpts) } } func uploadAndStart(nodes option.NodeListOption, v string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - // Put and start the binary. - binary := u.uploadVersion(ctx, t, nodes, v) - // NB: can't start sequentially since cluster already bootstrapped. + binary := uploadVersion(ctx, t, u.c, nodes, v) startOpts := option.DefaultStartOpts() + // NB: can't start sequentially since cluster already bootstrapped. startOpts.RoachprodOpts.Sequential = false - settings := install.MakeClusterSettings(install.BinaryOption(binary)) - u.c.Start(ctx, t.L(), startOpts, settings, nodes) + clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, startOpts) } } @@ -404,84 +278,9 @@ func uploadAndStart(nodes option.NodeListOption, v string) versionStep { // Use a waitForUpgradeStep() for that. func binaryUpgradeStep(nodes option.NodeListOption, newVersion string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - upgradeNodes(ctx, nodes, option.DefaultStartOpts(), newVersion, t, u.c) - // TODO(nvanbenschoten): add upgrade qualification step. What should we - // test? We could run logictests. We could add custom logic here. Maybe - // this should all be pushed to nightly migration tests instead. - } -} - -func upgradeNodes( - ctx context.Context, - nodes option.NodeListOption, - startOpts option.StartOpts, - newVersion string, - t test.Test, - c cluster.Cluster, -) { - // NB: We could technically stage the binary on all nodes before - // restarting each one, but on Unix it's invalid to write to an - // executable file while it is currently running. So we do the - // simple thing and upload it serially instead. - - // Restart nodes in a random order; otherwise node 1 would be running all - // the migrations and it probably also has all the leases. - rand.Shuffle(len(nodes), func(i, j int) { - nodes[i], nodes[j] = nodes[j], nodes[i] - }) - for _, node := range nodes { - v := newVersion - if v == "" { - v = "" - } - newVersionMsg := newVersion - if newVersion == "" { - newVersionMsg = "" - } - t.L().Printf("restarting node %d into version %s", node, newVersionMsg) - // Stop the cockroach process gracefully in order to drain it properly. - // This makes the upgrade closer to how users do it in production, but - // it's also needed to eliminate flakiness. 
In particular, this will - // make sure that DistSQL draining information is dissipated through - // gossip so that other nodes running an older version don't consider - // this upgraded node for DistSQL plans (see #87154 for more details). - // TODO(yuzefovich): ideally, we would also check that the drain was - // successful since if it wasn't, then we might see flakes too. - if err := c.StopCockroachGracefullyOnNode(ctx, t.L(), node); err != nil { - t.Fatal(err) - } - - binary := uploadVersion(ctx, t, c, c.Node(node), newVersion) - settings := install.MakeClusterSettings(install.BinaryOption(binary)) - c.Start(ctx, t.L(), startOpts, settings, c.Node(node)) - - // We have seen cases where a transient error could occur when this - // newly upgraded node serves as a gateway for a distributed query due - // to remote nodes not being able to dial back to the gateway for some - // reason (investigation of it is tracked in #87634). For now, we're - // papering over these flakes by this sleep. For more context, see - // #87104. - // TODO(yuzefovich): remove this sleep once #87634 is fixed. - time.Sleep(4 * time.Second) - } -} - -func enableTracingGloballyStep(ctx context.Context, t test.Test, u *versionUpgradeTest) { - db := u.conn(ctx, t, 1) - // NB: this enables net/trace, and as a side effect creates verbose trace spans everywhere. - _, err := db.ExecContext(ctx, `SET CLUSTER SETTING trace.debug.enable = $1`, true) - if err != nil { - t.Fatal(err) - } -} - -// lowerLeaseDuration sets the `sql.catalog.descriptor_lease_duration` -// setting to 1 minute. -func lowerLeaseDuration(node int) versionStep { - return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - db := u.conn(ctx, t, node) - _, err := db.ExecContext(ctx, `SET CLUSTER SETTING sql.catalog.descriptor_lease_duration = '1m'`) - if err != nil { + if err := clusterupgrade.RestartNodesWithNewBinary( + ctx, t, t.L(), u.c, nodes, option.DefaultStartOpts(), newVersion, + ); err != nil { t.Fatal(err) } } @@ -507,10 +306,6 @@ func allowAutoUpgradeStep(node int) versionStep { } } -// waitForUpgradeStep waits for the cluster version to reach the first node's -// binary version (which is assumed to be every node's binary version). We rely -// on the cluster's internal self-upgrading mechanism. -// // NB: this is intentionally kept separate from binaryUpgradeStep because we run // feature tests between the steps, and we want to expose them (at least // heuristically) to the real-world situation in which some nodes have already @@ -518,76 +313,13 @@ func allowAutoUpgradeStep(node int) versionStep { // situation tends to exhibit unexpected behavior. func waitForUpgradeStep(nodes option.NodeListOption) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - newVersion := u.binaryVersion(ctx, t, nodes[0]).String() - t.L().Printf("%s: waiting for cluster to auto-upgrade\n", newVersion) - - for _, i := range nodes { - err := retry.ForDuration(5*time.Minute, func() error { - currentVersion := u.clusterVersion(ctx, t, i).String() - if currentVersion != newVersion { - return fmt.Errorf("%d: expected version %s, got %s", i, newVersion, currentVersion) - } - t.L().Printf("%s: acked by n%d", currentVersion, i) - return nil - }) - if err != nil { - t.Fatal(err) - } - } - - t.L().Printf("%s: nodes %v are upgraded\n", newVersion, nodes) - - // TODO(nvanbenschoten): add upgrade qualification step. 
- } -} - -type versionFeatureTest struct { - name string - fn func(context.Context, test.Test, *versionUpgradeTest, option.NodeListOption) (skipped bool) -} - -type versionFeatureStep []versionFeatureTest - -func (vs versionFeatureStep) step(nodes option.NodeListOption) versionStep { - return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - for _, feature := range vs { - t.L().Printf("checking %s", feature.name) - tBegin := timeutil.Now() - skipped := feature.fn(ctx, t, u, nodes) - dur := fmt.Sprintf("%.2fs", timeutil.Since(tBegin).Seconds()) - if skipped { - t.L().Printf("^-- skip (%s)", dur) - } else { - t.L().Printf("^-- ok (%s)", dur) - } + dbFunc := func(node int) *gosql.DB { return u.conn(ctx, t, node) } + if err := clusterupgrade.WaitForClusterUpgrade(ctx, t.L(), nodes, dbFunc); err != nil { + t.Fatal(err) } } } -func stmtFeatureTest( - name string, minVersion roachpb.Version, stmt string, args ...interface{}, -) versionFeatureTest { - return versionFeatureTest{ - name: name, - fn: func(ctx context.Context, t test.Test, u *versionUpgradeTest, nodes option.NodeListOption) (skipped bool) { - i := nodes.RandNode()[0] - if u.clusterVersion(ctx, t, i).Less(minVersion) { - return true // skipped - } - db := u.conn(ctx, t, i) - if _, err := db.ExecContext(ctx, stmt, args...); err != nil { - if testutils.IsError(err, "no inbound stream connection") && u.clusterVersion(ctx, t, i).Less(v202) { - // This error has been fixed in 20.2+ but may still occur on earlier - // versions. - return true // skipped - } - t.Fatal(err) - } - return false - }, - } -} - // makeVersionFixtureAndFatal creates fixtures from which we can test // mixed-version clusters (i.e. version X mixing with X-1). The fixtures date // back all the way to v1.0; when development begins on version X, we make a @@ -610,7 +342,7 @@ func makeVersionFixtureAndFatal( useLocalBinary = true } - predecessorVersion, err := PredecessorVersion(*version.MustParse("v" + makeFixtureVersion)) + predecessorVersion, err := clusterupgrade.PredecessorVersion(*version.MustParse("v" + makeFixtureVersion)) if err != nil { t.Fatal(err) } @@ -658,10 +390,11 @@ func makeVersionFixtureAndFatal( // cluster version might be 2.0, so we can only use the 2.0 or // 2.1 binary, but not the 19.1 binary (as 19.1 and 2.0 are not // compatible). - name := checkpointName(u.binaryVersion(ctx, t, 1).String()) + name := clusterupgrade.CheckpointName(u.binaryVersion(ctx, t, 1).String()) u.c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.All()) - c.Run(ctx, c.All(), binaryPathFromVersion(makeFixtureVersion), "debug", "pebble", "db", "checkpoint", + binaryPath := clusterupgrade.BinaryPathFromVersion(makeFixtureVersion) + c.Run(ctx, c.All(), binaryPath, "debug", "pebble", "db", "checkpoint", "{store-dir}", "{store-dir}/"+name) // The `cluster-bootstrapped` marker can already be found within // store-dir, but the rocksdb checkpoint step above does not pick it diff --git a/pkg/roachprod/logger/log.go b/pkg/roachprod/logger/log.go index 509cb82a882a..59352e2b1d40 100644 --- a/pkg/roachprod/logger/log.go +++ b/pkg/roachprod/logger/log.go @@ -38,11 +38,9 @@ type loggerOption interface { apply(*Config) } -type logPrefix string +type LogPrefix string -var _ logPrefix // silence unused lint - -func (p logPrefix) apply(cfg *Config) { +func (p LogPrefix) apply(cfg *Config) { cfg.Prefix = string(p) }