From 42d014a58695dce9227bdff1c64690e5f338a34c Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Wed, 16 Nov 2022 13:03:44 -0500 Subject: [PATCH 1/3] roachtest: introduce `clusterupgrade` package The `clusterupgrade` package is introduced in this commit as a means towards the end goal of reusing upgrade-related functionality in roachtests. Specifically, as we work towards a new API for mixed-version testing in roachtests, we want to continue to support existing upgrade tests (at least for the time being). For that reason, it's useful to be able to reuse certain low level building blocks that both APIs rely on. The `clusterupgrade` package includes functions to query the binary and cluster versions on a node, to replace the binary running on nodes, upload a certain binary, etc. Most of these functions were moved from the `versionupgrade.go` file. Epic: CRDB-19321 Release note: None --- pkg/BUILD.bazel | 2 + .../roachtestutil/clusterupgrade/BUILD.bazel | 26 ++ .../clusterupgrade/clusterupgrade.go | 270 ++++++++++++++++++ .../clusterupgrade}/predecessor_version.go | 2 +- .../clusterupgrade}/predecessor_version.json | 0 pkg/cmd/roachtest/test_impl.go | 2 +- pkg/cmd/roachtest/tests/BUILD.bazel | 3 +- pkg/cmd/roachtest/tests/autoupgrade.go | 3 +- pkg/cmd/roachtest/tests/backup.go | 10 +- pkg/cmd/roachtest/tests/decommission_self.go | 11 +- pkg/cmd/roachtest/tests/follower_reads.go | 10 +- pkg/cmd/roachtest/tests/import.go | 10 +- pkg/cmd/roachtest/tests/mixed_version_cdc.go | 10 +- .../tests/mixed_version_change_replicas.go | 19 +- .../mixed_version_decl_schemachange_compat.go | 3 +- .../tests/mixed_version_decommission.go | 14 +- ...atibility_in_declarative_schema_changer.go | 8 +- pkg/cmd/roachtest/tests/mixed_version_jobs.go | 22 +- .../tests/mixed_version_schemachange.go | 22 +- .../roachtest/tests/multitenant_upgrade.go | 5 +- pkg/cmd/roachtest/tests/rebalance_load.go | 8 +- pkg/cmd/roachtest/tests/secondary_indexes.go | 12 +- pkg/cmd/roachtest/tests/tpcc.go | 3 +- ...ate_system_schema_after_version_upgrade.go | 8 +- pkg/cmd/roachtest/tests/version.go | 3 +- pkg/cmd/roachtest/tests/versionupgrade.go | 236 ++++----------- 26 files changed, 432 insertions(+), 290 deletions(-) create mode 100644 pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel create mode 100644 pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go rename pkg/cmd/roachtest/{tests => roachtestutil/clusterupgrade}/predecessor_version.go (99%) rename pkg/cmd/roachtest/{tests => roachtestutil/clusterupgrade}/predecessor_version.json (100%) diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 716435bb7e61..7f7e3fb39395 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -1006,6 +1006,7 @@ GO_TARGETS = [ "//pkg/cmd/roachtest/clusterstats:clusterstats_test", "//pkg/cmd/roachtest/option:option", "//pkg/cmd/roachtest/registry:registry", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade:clusterupgrade", "//pkg/cmd/roachtest/roachtestutil:roachtestutil", "//pkg/cmd/roachtest/spec:spec", "//pkg/cmd/roachtest/test:test", @@ -2467,6 +2468,7 @@ GET_X_DATA_TARGETS = [ "//pkg/cmd/roachtest/option:get_x_data", "//pkg/cmd/roachtest/registry:get_x_data", "//pkg/cmd/roachtest/roachtestutil:get_x_data", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade:get_x_data", "//pkg/cmd/roachtest/spec:get_x_data", "//pkg/cmd/roachtest/test:get_x_data", "//pkg/cmd/roachtest/tests:get_x_data", diff --git a/pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel new file 
mode 100644 index 000000000000..40be598bfa88 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/BUILD.bazel @@ -0,0 +1,26 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "clusterupgrade", + srcs = [ + "clusterupgrade.go", + "predecessor_version.go", + ], + embedsrcs = ["predecessor_version.json"], + importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade", + visibility = ["//visibility:public"], + deps = [ + "//pkg/cmd/roachtest/cluster", + "//pkg/cmd/roachtest/option", + "//pkg/cmd/roachtest/test", + "//pkg/roachpb", + "//pkg/roachprod/install", + "//pkg/roachprod/logger", + "//pkg/util/retry", + "//pkg/util/version", + "@com_github_cockroachdb_errors//:errors", + ], +) + +get_x_data(name = "get_x_data") diff --git a/pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go new file mode 100644 index 000000000000..aed7b558cd8c --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go @@ -0,0 +1,270 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package clusterupgrade + +import ( + "context" + gosql "database/sql" + "fmt" + "math/rand" + "path/filepath" + "strconv" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/retry" + "github.com/cockroachdb/cockroach/pkg/util/version" +) + +const ( + // MainVersion is the sentinel used to represent that the binary + // passed to roachtest should be uploaded when `version` is left + // unspecified. + MainVersion = "" +) + +// BinaryVersion returns the binary version running on the node +// associated with the given database connection. +// NB: version means major.minor[-internal]; the patch level isn't +// returned. For example, a binary of version 19.2.4 will return 19.2. +func BinaryVersion(db *gosql.DB) (roachpb.Version, error) { + zero := roachpb.Version{} + var sv string + if err := db.QueryRow(`SELECT crdb_internal.node_executable_version();`).Scan(&sv); err != nil { + return zero, err + } + + if len(sv) == 0 { + return zero, fmt.Errorf("empty version") + } + + return roachpb.ParseVersion(sv) +} + +// ClusterVersion returns the cluster version active on the node +// associated with the given database connection. Note that the +// returned value might become stale due to the cluster auto-upgrading +// in the background plus gossip asynchronicity. +// NB: cluster versions are always major.minor[-internal]; there isn't +// a patch level. 
+func ClusterVersion(ctx context.Context, db *gosql.DB) (roachpb.Version, error) { + zero := roachpb.Version{} + var sv string + if err := db.QueryRowContext(ctx, `SHOW CLUSTER SETTING version`).Scan(&sv); err != nil { + return zero, err + } + + return roachpb.ParseVersion(sv) +} + +// UploadVersion uploads the specified crdb version to the given +// nodes. It returns the path of the uploaded binaries on the nodes, +// suitable to be used with `roachdprod start --binary=`. +func UploadVersion( + ctx context.Context, + t test.Test, + l *logger.Logger, + c cluster.Cluster, + nodes option.NodeListOption, + newVersion string, +) (string, error) { + binaryName := "./cockroach" + if newVersion == MainVersion { + if err := c.PutE(ctx, l, t.Cockroach(), binaryName, nodes); err != nil { + return "", err + } + } else if binary, ok := t.VersionsBinaryOverride()[newVersion]; ok { + // If an override has been specified for newVersion, use that binary. + l.Printf("using binary override for version %s: %s", newVersion, binary) + binaryName = "./cockroach-" + newVersion + if err := c.PutE(ctx, l, binary, binaryName, nodes); err != nil { + return "", err + } + } else { + v := "v" + newVersion + dir := v + binaryName = filepath.Join(dir, "cockroach") + // Check if the cockroach binary already exists. + if err := c.RunE(ctx, nodes, "test", "-e", binaryName); err != nil { + if err := c.RunE(ctx, nodes, "mkdir", "-p", dir); err != nil { + return "", err + } + if err := c.Stage(ctx, l, "release", v, dir, nodes); err != nil { + return "", err + } + } + } + return BinaryPathFromVersion(newVersion), nil +} + +// InstallFixtures copies the previously created fixtures (in +// pkg/cmd/roachtest/fixtures) for the given version to the nodes +// passed. After this step, the corresponding binary can be started on +// the cluster and it will use that store directory. +func InstallFixtures( + ctx context.Context, l *logger.Logger, c cluster.Cluster, nodes option.NodeListOption, v string, +) error { + c.Run(ctx, nodes, "mkdir -p {store-dir}") + vv := version.MustParse("v" + v) + // The fixtures use cluster version (major.minor) but the input might be + // a patch release. + name := CheckpointName( + roachpb.Version{Major: int32(vv.Major()), Minor: int32(vv.Minor())}.String(), + ) + for _, n := range nodes { + if err := c.PutE(ctx, l, + "pkg/cmd/roachtest/fixtures/"+strconv.Itoa(n)+"/"+name+".tgz", + "{store-dir}/fixture.tgz", c.Node(n), + ); err != nil { + return err + } + } + // Extract fixture. Fail if there's already an LSM in the store dir. + c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz") + return nil +} + +// StartWithBinary starts a cockroach binary, assumed to already be +// present in the nodes in the path given. +func StartWithBinary( + ctx context.Context, + l *logger.Logger, + c cluster.Cluster, + nodes option.NodeListOption, + binaryPath string, + startOpts option.StartOpts, +) { + settings := install.MakeClusterSettings(install.BinaryOption(binaryPath)) + c.Start(ctx, l, startOpts, settings, nodes) +} + +// BinaryPathFromVersion shows where the binary for the given version +// can be found on roachprod nodes. It's either `./cockroach` or the +// path to which a released binary is staged. +func BinaryPathFromVersion(v string) string { + if v == "" { + return "./cockroach" + } + return filepath.Join("v"+v, "cockroach") +} + +// RestartNodesWithNewBinary uploads a given cockroach version to the +// nodes passed, and restarts the cockroach process. 
+func RestartNodesWithNewBinary( + ctx context.Context, + t test.Test, + l *logger.Logger, + c cluster.Cluster, + nodes option.NodeListOption, + startOpts option.StartOpts, + newVersion string, +) error { + // NB: We could technically stage the binary on all nodes before + // restarting each one, but on Unix it's invalid to write to an + // executable file while it is currently running. So we do the + // simple thing and upload it serially instead. + + // Restart nodes in a random order; otherwise node 1 would be running all + // the migrations and it probably also has all the leases. + rand.Shuffle(len(nodes), func(i, j int) { + nodes[i], nodes[j] = nodes[j], nodes[i] + }) + for _, node := range nodes { + l.Printf("restarting node %d into version %s", node, VersionMsg(newVersion)) + // Stop the cockroach process gracefully in order to drain it properly. + // This makes the upgrade closer to how users do it in production, but + // it's also needed to eliminate flakiness. In particular, this will + // make sure that DistSQL draining information is dissipated through + // gossip so that other nodes running an older version don't consider + // this upgraded node for DistSQL plans (see #87154 for more details). + // TODO(yuzefovich): ideally, we would also check that the drain was + // successful since if it wasn't, then we might see flakes too. + if err := c.StopCockroachGracefullyOnNode(ctx, l, node); err != nil { + return err + } + + binary, err := UploadVersion(ctx, t, l, c, c.Node(node), newVersion) + if err != nil { + return err + } + StartWithBinary(ctx, l, c, c.Node(node), binary, startOpts) + + // We have seen cases where a transient error could occur when this + // newly upgraded node serves as a gateway for a distributed query due + // to remote nodes not being able to dial back to the gateway for some + // reason (investigation of it is tracked in #87634). For now, we're + // papering over these flakes by this sleep. For more context, see + // #87104. + // TODO(yuzefovich): remove this sleep once #87634 is fixed. + time.Sleep(4 * time.Second) + } + + return nil +} + +// WaitForClusterUpgrade waits for the cluster version to reach the +// first node's binary version. This function should only be called if +// every node in the cluster has been restarted to run the same binary +// version. We rely on the cluster's internal self-upgrading +// mechanism to update the underlying cluster version. +func WaitForClusterUpgrade( + ctx context.Context, l *logger.Logger, nodes option.NodeListOption, dbFunc func(int) *gosql.DB, +) error { + newVersion, err := BinaryVersion(dbFunc(nodes[0])) + if err != nil { + return err + } + + l.Printf("waiting for cluster to auto-upgrade to %s", newVersion) + for _, node := range nodes { + err := retry.ForDuration(5*time.Minute, func() error { + currentVersion, err := ClusterVersion(ctx, dbFunc(node)) + if err != nil { + return err + } + if currentVersion != newVersion { + return fmt.Errorf("%d: expected cluster version %s, got %s", node, newVersion, currentVersion) + } + l.Printf("%s: acked by n%d", currentVersion, node) + return nil + }) + if err != nil { + return err + } + } + + l.Printf("all nodes (%v) are upgraded to %s", nodes, newVersion) + return nil +} + +// CheckpointName returns the expected name of the checkpoint file +// under `pkg/cmd/roachtest/fixtures/{nodeID}` for the given binary +// version. 
+func CheckpointName(binaryVersion string) string { + return "checkpoint-v" + binaryVersion +} + +// VersionMsg returns a version string to be displayed in logs. It's +// either the version given, or the "<current>" string to represent +// the latest cockroach version, typically built off the branch being +// tested. +func VersionMsg(v string) string { + if v == MainVersion { + return "<current>" + } + + return v +} diff --git a/pkg/cmd/roachtest/tests/predecessor_version.go b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.go similarity index 99% rename from pkg/cmd/roachtest/tests/predecessor_version.go rename to pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.go index 5309f7a8cce4..46d507cbcbff 100644 --- a/pkg/cmd/roachtest/tests/predecessor_version.go +++ b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.go @@ -8,7 +8,7 @@ // by the Apache License, Version 2.0, included in the file // licenses/APL.txt. -package tests +package clusterupgrade import ( // Import embed for the version map diff --git a/pkg/cmd/roachtest/tests/predecessor_version.json b/pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.json similarity index 100% rename from pkg/cmd/roachtest/tests/predecessor_version.json rename to pkg/cmd/roachtest/roachtestutil/clusterupgrade/predecessor_version.json diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 641dd53f6636..63a31c139f65 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -106,7 +106,7 @@ type testImpl struct { // Map from version to path to the cockroach binary to be used when // mixed-version test wants a binary for that binary. If a particular version // is found in this map, it is used instead of the binary coming from - // `roachprod stage release `. See the --version-binary-override flags. + // `roachprod stage release `. See the --versions-binary-override flags. // // Version strings look like "20.1.4".
versionsBinaryOverride map[string]string diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index 0b872c7a5119..cc1fb8664704 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -110,7 +110,6 @@ go_library( "pgx.go", "pgx_blocklist.go", "pop.go", - "predecessor_version.go", "psycopg.go", "psycopg_blocklist.go", "query_comparison_util.go", @@ -165,7 +164,6 @@ go_library( "versionupgrade.go", "ycsb.go", ], - embedsrcs = ["predecessor_version.json"], importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests", visibility = ["//visibility:public"], deps = [ @@ -184,6 +182,7 @@ go_library( "//pkg/cmd/roachtest/option", "//pkg/cmd/roachtest/registry", "//pkg/cmd/roachtest/roachtestutil", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade", "//pkg/cmd/roachtest/spec", "//pkg/cmd/roachtest/test", "//pkg/gossip", diff --git a/pkg/cmd/roachtest/tests/autoupgrade.go b/pkg/cmd/roachtest/tests/autoupgrade.go index 1e7b191c3a7e..7e5d0aa3f52f 100644 --- a/pkg/cmd/roachtest/tests/autoupgrade.go +++ b/pkg/cmd/roachtest/tests/autoupgrade.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -259,7 +260,7 @@ func registerAutoUpgrade(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - pred, err := PredecessorVersion(*t.BuildVersion()) + pred, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/backup.go b/pkg/cmd/roachtest/tests/backup.go index 787510d2b751..3863a37f7cd8 100644 --- a/pkg/cmd/roachtest/tests/backup.go +++ b/pkg/cmd/roachtest/tests/backup.go @@ -31,6 +31,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/jobs" @@ -448,13 +449,10 @@ func registerBackupMixedVersion(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" roachNodes := c.All() upgradedNodes := c.Nodes(1, 2) oldNodes := c.Nodes(3, 4) - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) c.Put(ctx, t.DeprecatedWorkload(), "./workload") @@ -469,7 +467,7 @@ func registerBackupMixedVersion(r registry.Registry) { setShortJobIntervalsStep(1), loadBackupDataStep(c), // Upgrade some nodes. 
- binaryUpgradeStep(upgradedNodes, mainVersion), + binaryUpgradeStep(upgradedNodes, clusterupgrade.MainVersion), // Let us first test planning and executing a backup on different node // versions. @@ -561,7 +559,7 @@ func registerBackupMixedVersion(r registry.Registry) { enableJobAdoptionStep(c, upgradedNodes), // Allow all the nodes to now finalize their cluster version. - binaryUpgradeStep(oldNodes, mainVersion), + binaryUpgradeStep(oldNodes, clusterupgrade.MainVersion), allowAutoUpgradeStep(1), waitForUpgradeStep(roachNodes), diff --git a/pkg/cmd/roachtest/tests/decommission_self.go b/pkg/cmd/roachtest/tests/decommission_self.go index db3e8c81985d..db39d031a3b6 100644 --- a/pkg/cmd/roachtest/tests/decommission_self.go +++ b/pkg/cmd/roachtest/tests/decommission_self.go @@ -14,6 +14,7 @@ import ( "context" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" ) @@ -21,15 +22,11 @@ import ( // // See https://github.com/cockroachdb/cockroach/issues/56718 func runDecommissionSelf(ctx context.Context, t test.Test, c cluster.Cluster) { - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" - allNodes := c.All() u := newVersionUpgradeTest(c, - uploadVersionStep(allNodes, mainVersion), - startVersion(allNodes, mainVersion), - fullyDecommissionStep(2, 2, mainVersion), + uploadVersionStep(allNodes, clusterupgrade.MainVersion), + startVersion(allNodes, clusterupgrade.MainVersion), + fullyDecommissionStep(2, 2, clusterupgrade.MainVersion), func(ctx context.Context, t test.Test, u *versionUpgradeTest) { // Stop n2 and exclude it from post-test consistency checks, // as this node can't contact cluster any more and operations diff --git a/pkg/cmd/roachtest/tests/follower_reads.go b/pkg/cmd/roachtest/tests/follower_reads.go index 9c714ea8037f..51f58212580f 100644 --- a/pkg/cmd/roachtest/tests/follower_reads.go +++ b/pkg/cmd/roachtest/tests/follower_reads.go @@ -27,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -866,11 +867,8 @@ func parsePrometheusMetric(s string) (*prometheusMetric, bool) { func runFollowerReadsMixedVersionSingleRegionTest( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) require.NoError(t, err) - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const curVersion = "" // Start the cluster at the old version. 
settings := install.MakeClusterSettings() @@ -884,7 +882,7 @@ func runFollowerReadsMixedVersionSingleRegionTest( randNode := 1 + rand.Intn(c.Spec().NodeCount) t.L().Printf("upgrading n%d to current version", randNode) nodeToUpgrade := c.Node(randNode) - upgradeNodes(ctx, nodeToUpgrade, startOpts, curVersion, t, c) + upgradeNodes(ctx, t, c, nodeToUpgrade, startOpts, clusterupgrade.MainVersion) runFollowerReadsTest(ctx, t, c, topologySpec{multiRegion: false}, exactStaleness, data) // Upgrade the remaining nodes to the new version and run the test. @@ -896,6 +894,6 @@ func runFollowerReadsMixedVersionSingleRegionTest( remainingNodes = remainingNodes.Merge(c.Node(i + 1)) } t.L().Printf("upgrading nodes %s to current version", remainingNodes) - upgradeNodes(ctx, remainingNodes, startOpts, curVersion, t, c) + upgradeNodes(ctx, t, c, remainingNodes, startOpts, clusterupgrade.MainVersion) runFollowerReadsTest(ctx, t, c, topologySpec{multiRegion: false}, exactStaleness, data) } diff --git a/pkg/cmd/roachtest/tests/import.go b/pkg/cmd/roachtest/tests/import.go index de84a0f71f52..d5634bdcdccf 100644 --- a/pkg/cmd/roachtest/tests/import.go +++ b/pkg/cmd/roachtest/tests/import.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -325,9 +326,6 @@ func successfulImportStep(warehouses, nodeID int) versionStep { func runImportMixedVersion( ctx context.Context, t test.Test, c cluster.Cluster, warehouses int, predecessorVersion string, ) { - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" roachNodes := c.All() t.Status("starting csv servers") @@ -338,8 +336,8 @@ func runImportMixedVersion( preventAutoUpgradeStep(1), // Upgrade some of the nodes. - binaryUpgradeStep(c.Node(1), mainVersion), - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), successfulImportStep(warehouses, 1 /* nodeID */), ) @@ -356,7 +354,7 @@ func registerImportMixedVersion(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") } - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/mixed_version_cdc.go b/pkg/cmd/roachtest/tests/mixed_version_cdc.go index b00fdc7a6217..2586b447c101 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_cdc.go +++ b/pkg/cmd/roachtest/tests/mixed_version_cdc.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/randutil" @@ -353,7 +354,7 @@ func (cmvt *cdcMixedVersionTester) createChangeFeed(node int) versionStep { func runCDCMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } @@ -372,9 +373,6 @@ func runCDCMixedVersions( return tester.crdbNodes[rng.Intn(len(tester.crdbNodes))] } - // An empty string will lead to the cockroach binary specified by flag - // `cockroach` to be used. - const mainVersion = "" newVersionUpgradeTest(c, uploadAndStartFromCheckpointFixture(tester.crdbNodes, predecessorVersion), tester.setupVerifier(sqlNode()), @@ -388,7 +386,7 @@ func runCDCMixedVersions( tester.waitForResolvedTimestamps(), // Roll the nodes into the new version one by one in random order - tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, mainVersion)), + tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, clusterupgrade.MainVersion)), // let the workload run in the new version for a while tester.waitForResolvedTimestamps(), @@ -402,7 +400,7 @@ func runCDCMixedVersions( tester.assertValid(), // Roll nodes forward and finalize upgrade. - tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, mainVersion)), + tester.crdbUpgradeStep(binaryUpgradeStep(tester.crdbNodes, clusterupgrade.MainVersion)), // allow cluster version to update allowAutoUpgradeStep(sqlNode()), diff --git a/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go b/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go index 9061cdbc563a..2dd730c32bf2 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go +++ b/pkg/cmd/roachtest/tests/mixed_version_change_replicas.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/randutil" @@ -54,9 +55,7 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C return rng.Intn(nodeCount) + 1 } - // An empty string uses the cockroach binary specified by `--cockroach`. 
- const mainVersion = "" - preVersion, err := PredecessorVersion(*t.BuildVersion()) + preVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) // scanTableStep runs a count(*) scan across a table, asserting the row count. @@ -215,7 +214,7 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C u := newVersionUpgradeTest(c, // Start the cluster with preVersion and wait for it to bootstrap, then - // disable auto-upgrades to mainVersion. The checkpoint fixture is not + // disable auto-upgrades to MainVersion. The checkpoint fixture is not // necessary in this test, but we pull it in to get better test coverage of // historical cluster state. uploadAndStartFromCheckpointFixture(c.All(), preVersion), @@ -231,8 +230,8 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C require.NoError(t, WaitFor3XReplication(ctx, t, conn)) }, - // Upgrade n1,n2 to mainVersion, leave n3,n4 at preVersion. - binaryUpgradeStep(c.Nodes(1, 2), mainVersion), + // Upgrade n1,n2 to MainVersion, leave n3,n4 at preVersion. + binaryUpgradeStep(c.Nodes(1, 2), clusterupgrade.MainVersion), // Scatter the table's single range, to randomize replica/lease // placement -- in particular, who's responsible for splits. @@ -280,9 +279,9 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C scatterTableStep("test"), scanTableStep("test", 0), - // Upgrade n3,n4 (the remaining nodes) to mainVersion, verify that we can + // Upgrade n3,n4 (the remaining nodes) to MainVersion, verify that we can // run a table scan, move ranges around, scatter, and scan again. - binaryUpgradeStep(c.Nodes(3, 4), mainVersion), + binaryUpgradeStep(c.Nodes(3, 4), clusterupgrade.MainVersion), scanTableStep("test", 0), changeReplicasRelocateFromNodeStep("test", 1), changeReplicasZoneConfigFromNodeStep("test", 2), @@ -302,10 +301,10 @@ func runChangeReplicasMixedVersion(ctx context.Context, t test.Test, c cluster.C scatterTableStep("test"), scanTableStep("test", 0), - // Upgrade all to mainVersion and finalize the upgrade. Verify that we can + // Upgrade all to MainVersion and finalize the upgrade. Verify that we can // run a table scan, shuffle the ranges, scatter them, and scan again. 
allowAutoUpgradeStep(1), - binaryUpgradeStep(c.All(), mainVersion), + binaryUpgradeStep(c.All(), clusterupgrade.MainVersion), waitForUpgradeStep(c.All()), scanTableStep("test", 0), changeReplicasRelocateFromNodeStep("test", 1), diff --git a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go index dab24daa6d2d..983a7f0db8a7 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go +++ b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -112,7 +113,7 @@ func runDeclSchemaChangeCompatMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { versionRegex := regexp.MustCompile(`(\d+\.\d+)`) - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/mixed_version_decommission.go b/pkg/cmd/roachtest/tests/mixed_version_decommission.go index 2a3b2173a5a9..d7c8cf5a2426 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_decommission.go +++ b/pkg/cmd/roachtest/tests/mixed_version_decommission.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/testutils" @@ -30,7 +31,7 @@ import ( func runDecommissionMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } @@ -42,15 +43,12 @@ func runDecommissionMixedVersions( pinnedUpgrade := h.getRandNode() t.L().Printf("pinned n%d for upgrade", pinnedUpgrade) - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" allNodes := c.All() u := newVersionUpgradeTest(c, // We upload both binaries to each node, to be able to vary the binary // used when issuing `cockroach node` subcommands. uploadVersionStep(allNodes, predecessorVersion), - uploadVersionStep(allNodes, mainVersion), + uploadVersionStep(allNodes, clusterupgrade.MainVersion), startVersion(allNodes, predecessorVersion), waitForUpgradeStep(allNodes), @@ -59,8 +57,8 @@ func runDecommissionMixedVersions( preloadDataStep(pinnedUpgrade), // We upgrade a pinnedUpgrade and one other random node of the cluster to v20.2. - binaryUpgradeStep(c.Node(pinnedUpgrade), mainVersion), - binaryUpgradeStep(c.Node(h.getRandNodeOtherThan(pinnedUpgrade)), mainVersion), + binaryUpgradeStep(c.Node(pinnedUpgrade), clusterupgrade.MainVersion), + binaryUpgradeStep(c.Node(h.getRandNodeOtherThan(pinnedUpgrade)), clusterupgrade.MainVersion), checkAllMembership(pinnedUpgrade, "active"), // Partially decommission a random node from another random node. 
We @@ -80,7 +78,7 @@ func runDecommissionMixedVersions( binaryUpgradeStep(allNodes, predecessorVersion), // Roll all nodes forward, and finalize upgrade. - binaryUpgradeStep(allNodes, mainVersion), + binaryUpgradeStep(allNodes, clusterupgrade.MainVersion), allowAutoUpgradeStep(1), waitForUpgradeStep(allNodes), diff --git a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go index 512bacadd8ab..4cd6165fcbb6 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go +++ b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/stretchr/testify/require" @@ -91,13 +92,10 @@ func registerDeclarativeSchemaChangerJobCompatibilityInMixedVersion(r registry.R if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" allNodes := c.All() upgradedNodes := c.Nodes(1, 2) oldNodes := c.Nodes(3, 4) - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) u := newVersionUpgradeTest(c, @@ -109,7 +107,7 @@ func registerDeclarativeSchemaChangerJobCompatibilityInMixedVersion(r registry.R setShortGCTTLInSystemZoneConfig(c), // Upgrade some nodes. - binaryUpgradeStep(upgradedNodes, mainVersion), + binaryUpgradeStep(upgradedNodes, clusterupgrade.MainVersion), // Job backward compatibility test: // - upgraded nodes: plan schema change and create schema changer jobs diff --git a/pkg/cmd/roachtest/tests/mixed_version_jobs.go b/pkg/cmd/roachtest/tests/mixed_version_jobs.go index 7e5aa0e62d1b..7aaeae58cd6b 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_jobs.go +++ b/pkg/cmd/roachtest/tests/mixed_version_jobs.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" @@ -229,9 +230,6 @@ FROM [SHOW JOBS] WHERE status = $1 OR status = $2`, func runJobsMixedVersions( ctx context.Context, t test.Test, c cluster.Cluster, warehouses int, predecessorVersion string, ) { - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" roachNodes := c.All() backgroundTPCC := backgroundJobsTestTPCCImport(t, warehouses) resumeAllJobsAndWaitStep := makeResumeAllJobsAndWaitStep(10 * time.Second) @@ -249,22 +247,22 @@ func runJobsMixedVersions( // Roll the nodes into the new version one by one, while repeatedly pausing // and resuming all jobs. 
- binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), @@ -292,22 +290,22 @@ func runJobsMixedVersions( pauseAllJobsStep(), // Roll nodes forward and finalize upgrade. - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), resumeAllJobsAndWaitStep, checkForFailedJobsStep, pauseAllJobsStep(), @@ -337,7 +335,7 @@ func registerJobsMixedVersions(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go index cb7143355147..fd1526a75e09 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go +++ b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -77,14 +78,11 @@ func runSchemaChangeMixedVersions( concurrency int, buildVersion version.Version, ) { - predecessorVersion, err := PredecessorVersion(buildVersion) + predecessorVersion, err := clusterupgrade.PredecessorVersion(buildVersion) if err != nil { t.Fatal(err) } - // An empty string will lead to the cockroach binary specified by flag - // `cockroach` to be used. - const mainVersion = "" schemaChangeStep := runSchemaChangeWorkloadStep(c.All().RandNode()[0], maxOps, concurrency) if buildVersion.Major() < 20 { // Schema change workload is meant to run only on versions 19.2 or higher. @@ -107,13 +105,13 @@ func runSchemaChangeMixedVersions( // Roll the nodes into the new version one by one, while repeatedly running // schema changes. We use an empty string for the version below, which means // use the main ./cockroach binary (i.e. the one being tested in this run). 
- binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), schemaChangeStep, // Roll back again, which ought to be fine because the cluster upgrade was @@ -128,13 +126,13 @@ func runSchemaChangeMixedVersions( schemaChangeStep, // Roll nodes forward and finalize upgrade. - binaryUpgradeStep(c.Node(4), mainVersion), + binaryUpgradeStep(c.Node(4), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), schemaChangeStep, - binaryUpgradeStep(c.Node(2), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), schemaChangeStep, allowAutoUpgradeStep(1), diff --git a/pkg/cmd/roachtest/tests/multitenant_upgrade.go b/pkg/cmd/roachtest/tests/multitenant_upgrade.go index 84676befdd2e..c9312ef49d6d 100644 --- a/pkg/cmd/roachtest/tests/multitenant_upgrade.go +++ b/pkg/cmd/roachtest/tests/multitenant_upgrade.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" @@ -68,10 +69,10 @@ func registerMultiTenantUpgrade(r registry.Registry) { // - Tenant14{Binary: Cur, Cluster: Cur}: Create tenant 14 and verify it works. // - Tenant12{Binary: Cur, Cluster: Cur}: Restart tenant 14 and make sure it still works. 
func runMultiTenantUpgrade(ctx context.Context, t test.Test, c cluster.Cluster, v version.Version) { - predecessor, err := PredecessorVersion(v) + predecessor, err := clusterupgrade.PredecessorVersion(v) require.NoError(t, err) - currentBinary := uploadVersion(ctx, t, c, c.All(), "") + currentBinary := uploadVersion(ctx, t, c, c.All(), clusterupgrade.MainVersion) predecessorBinary := uploadVersion(ctx, t, c, c.All(), predecessor) kvNodes := c.Node(1) diff --git a/pkg/cmd/roachtest/tests/rebalance_load.go b/pkg/cmd/roachtest/tests/rebalance_load.go index 4aa59a674455..352e6023e7d0 100644 --- a/pkg/cmd/roachtest/tests/rebalance_load.go +++ b/pkg/cmd/roachtest/tests/rebalance_load.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -63,7 +64,7 @@ func registerRebalanceLoad(r registry.Registry) { "--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5") settings := install.MakeClusterSettings() if mixedVersion { - predecessorVersion, err := PredecessorVersion(*t.BuildVersion()) + predecessorVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) settings.Binary = uploadVersion(ctx, t, c, c.All(), predecessorVersion) // Upgrade some (or all) of the first N-1 CRDB nodes. We ignore the last @@ -72,11 +73,8 @@ func registerRebalanceLoad(r registry.Registry) { lastNodeToUpgrade := rand.Intn(c.Spec().NodeCount-2) + 1 t.L().Printf("upgrading %d nodes to the current cockroach binary", lastNodeToUpgrade) nodesToUpgrade := c.Range(1, lastNodeToUpgrade) - // An empty string means that the cockroach binary specified by the - // `cockroach` flag will be used. - const newVersion = "" c.Start(ctx, t.L(), startOpts, settings, roachNodes) - upgradeNodes(ctx, nodesToUpgrade, startOpts, newVersion, t, c) + upgradeNodes(ctx, t, c, nodesToUpgrade, startOpts, clusterupgrade.MainVersion) } else { c.Put(ctx, t.Cockroach(), "./cockroach", roachNodes) c.Start(ctx, t.L(), startOpts, settings, roachNodes) diff --git a/pkg/cmd/roachtest/tests/secondary_indexes.go b/pkg/cmd/roachtest/tests/secondary_indexes.go index 095d0b21db5d..c5ac67d3db92 100644 --- a/pkg/cmd/roachtest/tests/secondary_indexes.go +++ b/pkg/cmd/roachtest/tests/secondary_indexes.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/stretchr/testify/require" ) @@ -41,9 +42,6 @@ func runIndexUpgrade( } roachNodes := c.All() - // An empty string means that the cockroach binary specified by flag - // `cockroach` will be used. - const mainVersion = "" u := newVersionUpgradeTest(c, uploadAndStart(roachNodes, predecessorVersion), waitForUpgradeStep(roachNodes), @@ -52,7 +50,7 @@ func runIndexUpgrade( createDataStep(), // Upgrade one of the nodes. - binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), // Modify index data from that node. 
modifyData(1, @@ -66,8 +64,8 @@ func runIndexUpgrade( verifyTableData(3, firstExpected), // Upgrade the rest of the cluster. - binaryUpgradeStep(c.Node(2), mainVersion), - binaryUpgradeStep(c.Node(3), mainVersion), + binaryUpgradeStep(c.Node(2), clusterupgrade.MainVersion), + binaryUpgradeStep(c.Node(3), clusterupgrade.MainVersion), // Finalize the upgrade. allowAutoUpgradeStep(1), @@ -144,7 +142,7 @@ func registerSecondaryIndexesMultiVersionCluster(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - predV, err := PredecessorVersion(*t.BuildVersion()) + predV, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 7c09a28f8731..df0de155be4e 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -405,7 +406,7 @@ func runTPCCMixedHeadroom( bankRows = 1000 } - history, err := PredecessorHistory(*t.BuildVersion(), versionsToUpgrade) + history, err := clusterupgrade.PredecessorHistory(*t.BuildVersion(), versionsToUpgrade) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go index 366d5cfd18fb..7802148035b0 100644 --- a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go +++ b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/pmezard/go-difflib/difflib" @@ -37,8 +38,7 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - const mainVersion = "" - predecessorVersion, err := PredecessorVersion(*t.BuildVersion()) + predecessorVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } @@ -105,7 +105,7 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { u := newVersionUpgradeTest(c, // Start the node with the latest binary version. - uploadAndStart(c.Node(1), mainVersion), + uploadAndStart(c.Node(1), clusterupgrade.MainVersion), // Obtain expected output from the node. obtainSystemSchemaStep(1, &expected), @@ -117,7 +117,7 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { uploadAndStart(c.Node(1), predecessorVersion), // Upgrade the node version. 
- binaryUpgradeStep(c.Node(1), mainVersion), + binaryUpgradeStep(c.Node(1), clusterupgrade.MainVersion), // Wait for the cluster version to also bump up to make sure the migration logic is run. waitForUpgradeStep(c.Node(1)), diff --git a/pkg/cmd/roachtest/tests/version.go b/pkg/cmd/roachtest/tests/version.go index 596bb321df7f..656444b7b78b 100644 --- a/pkg/cmd/roachtest/tests/version.go +++ b/pkg/cmd/roachtest/tests/version.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/util/version" @@ -225,7 +226,7 @@ func registerVersion(r registry.Registry) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - pred, err := PredecessorVersion(*t.BuildVersion()) + pred, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go index 494a2ec0b7a6..3e2cd8a4b17c 100644 --- a/pkg/cmd/roachtest/tests/versionupgrade.go +++ b/pkg/cmd/roachtest/tests/versionupgrade.go @@ -14,19 +14,17 @@ import ( "context" gosql "database/sql" "fmt" - "math/rand" "path/filepath" "runtime" - "strconv" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/testutils" - "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/version" "github.com/stretchr/testify/require" @@ -91,7 +89,7 @@ func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - predecessorVersion, err := PredecessorVersion(*t.BuildVersion()) + predecessorVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) } @@ -239,8 +237,6 @@ func newVersionUpgradeTest(c cluster.Cluster, steps ...versionStep) *versionUpgr } } -func checkpointName(binaryVersion string) string { return "checkpoint-v" + binaryVersion } - // Return a cached conn to the given node. Don't call .Close(), the test harness // will do it. func (u *versionUpgradeTest) conn(ctx context.Context, t test.Test, i int) *gosql.DB { @@ -256,104 +252,66 @@ func (u *versionUpgradeTest) conn(ctx context.Context, t test.Test, i int) *gosq return db } -// uploadVersion uploads the specified crdb version to nodes. It returns the -// path of the uploaded binaries on the nodes, suitable to be used with -// `roachdprod start --binary=`. 
+// uploadVersion is a thin wrapper around +// `clusterupgrade.UploadVersion` that calls t.Fatal if that call +// returns an error. func uploadVersion( ctx context.Context, t test.Test, c cluster.Cluster, nodes option.NodeListOption, newVersion string, -) (binaryName string) { - binaryName = "./cockroach" - if newVersion == "" { - if err := c.PutE(ctx, t.L(), t.Cockroach(), binaryName, nodes); err != nil { - t.Fatal(err) - } - } else if binary, ok := t.VersionsBinaryOverride()[newVersion]; ok { - // If an override has been specified for newVersion, use that binary. - t.L().Printf("using binary override for version %s: %s", newVersion, binary) - binaryName = "./cockroach-" + newVersion - if err := c.PutE(ctx, t.L(), binary, binaryName, nodes); err != nil { - t.Fatal(err) - } - } else { - v := "v" + newVersion - dir := v - binaryName = filepath.Join(dir, "cockroach") - // Check if the cockroach binary already exists. - if err := c.RunE(ctx, nodes, "test", "-e", binaryName); err != nil { - if err := c.RunE(ctx, nodes, "mkdir", "-p", dir); err != nil { - t.Fatal(err) - } - if err := c.Stage(ctx, t.L(), "release", v, dir, nodes); err != nil { - t.Fatal(err) - } - } +) string { + path, err := clusterupgrade.UploadVersion( + ctx, t.L(), c, nodes, newVersion, t.Cockroach(), t.VersionsBinaryOverride(), + ) + if err != nil { + t.Fatal(err) } - return binaryPathFromVersion(newVersion) -} -// binaryPathFromVersion shows where the binary for the given version -// can be found on roachprod nodes. It's either `./cockroach` or the -// path to which a released binary is staged. -func binaryPathFromVersion(v string) string { - if v == "" { - return "./cockroach" - } - return filepath.Join("v"+v, "cockroach") + return path } -func (u *versionUpgradeTest) uploadVersion( - ctx context.Context, t test.Test, nodes option.NodeListOption, newVersion string, -) string { - return uploadVersion(ctx, t, u.c, nodes, newVersion) +// upgradeNodes is a thin wrapper around +// `clusterupgrade.RestartNodesWithNewBinary` that calls t.Fatal if +// that call returns an error. +func upgradeNodes( + ctx context.Context, + t test.Test, + c cluster.Cluster, + nodes option.NodeListOption, + startOpts option.StartOpts, + newVersion string, +) { + if err := clusterupgrade.RestartNodesWithNewBinary( + ctx, t.L(), c, nodes, startOpts, newVersion, t.Cockroach(), t.VersionsBinaryOverride(), + ); err != nil { + t.Fatal(err) + } } -// binaryVersion returns the binary running on the (one-indexed) node. -// NB: version means major.minor[-internal]; the patch level isn't returned. For example, a binary -// of version 19.2.4 will return 19.2. func (u *versionUpgradeTest) binaryVersion( ctx context.Context, t test.Test, i int, ) roachpb.Version { db := u.conn(ctx, t, i) - - var sv string - if err := db.QueryRow(`SELECT crdb_internal.node_executable_version();`).Scan(&sv); err != nil { - t.Fatal(err) - } - - if len(sv) == 0 { - t.Fatal("empty version") - } - - cv, err := roachpb.ParseVersion(sv) + v, err := clusterupgrade.BinaryVersion(ctx, db) if err != nil { t.Fatal(err) } - return cv + + return v } -// binaryVersion returns the cluster version active on the (one-indexed) node. Note that the -// returned value might become stale due to the cluster auto-upgrading in the background plus -// gossip asynchronicity. -// NB: cluster versions are always major.minor[-internal]; there isn't a patch level.
func (u *versionUpgradeTest) clusterVersion( ctx context.Context, t test.Test, i int, ) roachpb.Version { db := u.conn(ctx, t, i) - - var sv string - if err := db.QueryRowContext(ctx, `SHOW CLUSTER SETTING version`).Scan(&sv); err != nil { - t.Fatal(err) - } - - cv, err := roachpb.ParseVersion(sv) + v, err := clusterupgrade.ClusterVersion(ctx, db) if err != nil { t.Fatal(err) } - return cv + + return v } // versionStep is an isolated version migration on a running cluster. @@ -361,41 +319,20 @@ type versionStep func(ctx context.Context, t test.Test, u *versionUpgradeTest) func uploadAndStartFromCheckpointFixture(nodes option.NodeListOption, v string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - u.c.Run(ctx, nodes, "mkdir", "-p", "{store-dir}") - vv := version.MustParse("v" + v) - // The fixtures use cluster version (major.minor) but the input might be - // a patch release. - name := checkpointName( - roachpb.Version{Major: int32(vv.Major()), Minor: int32(vv.Minor())}.String(), - ) - for _, i := range nodes { - u.c.Put(ctx, - "pkg/cmd/roachtest/fixtures/"+strconv.Itoa(i)+"/"+name+".tgz", - "{store-dir}/fixture.tgz", u.c.Node(i), - ) + if err := clusterupgrade.InstallFixtures(ctx, t.L(), u.c, nodes, v); err != nil { + t.Fatal(err) } - // Extract fixture. Fail if there's already an LSM in the store dir. - u.c.Run(ctx, nodes, "cd {store-dir} && [ ! -f {store-dir}/CURRENT ] && tar -xf fixture.tgz") - - // Put and start the binary. - binary := u.uploadVersion(ctx, t, nodes, v) - settings := install.MakeClusterSettings(install.BinaryOption(binary)) - startOpts := option.DefaultStartOpts() + binary := uploadVersion(ctx, t, u.c, nodes, v) // NB: can't start sequentially since cluster already bootstrapped. - startOpts.RoachprodOpts.Sequential = false - u.c.Start(ctx, t.L(), startOpts, settings, nodes) + clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, false /* sequential */) } } func uploadAndStart(nodes option.NodeListOption, v string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - // Put and start the binary. - binary := u.uploadVersion(ctx, t, nodes, v) + binary := uploadVersion(ctx, t, u.c, nodes, v) // NB: can't start sequentially since cluster already bootstrapped. - startOpts := option.DefaultStartOpts() - startOpts.RoachprodOpts.Sequential = false - settings := install.MakeClusterSettings(install.BinaryOption(binary)) - u.c.Start(ctx, t.L(), startOpts, settings, nodes) + clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, false /* sequential */) } } @@ -404,65 +341,11 @@ func uploadAndStart(nodes option.NodeListOption, v string) versionStep { // Use a waitForUpgradeStep() for that. func binaryUpgradeStep(nodes option.NodeListOption, newVersion string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - upgradeNodes(ctx, nodes, option.DefaultStartOpts(), newVersion, t, u.c) - // TODO(nvanbenschoten): add upgrade qualification step. What should we - // test? We could run logictests. We could add custom logic here. Maybe - // this should all be pushed to nightly migration tests instead. - } -} - -func upgradeNodes( - ctx context.Context, - nodes option.NodeListOption, - startOpts option.StartOpts, - newVersion string, - t test.Test, - c cluster.Cluster, -) { - // NB: We could technically stage the binary on all nodes before - // restarting each one, but on Unix it's invalid to write to an - // executable file while it is currently running. 
So we do the - // simple thing and upload it serially instead. - - // Restart nodes in a random order; otherwise node 1 would be running all - // the migrations and it probably also has all the leases. - rand.Shuffle(len(nodes), func(i, j int) { - nodes[i], nodes[j] = nodes[j], nodes[i] - }) - for _, node := range nodes { - v := newVersion - if v == "" { - v = "" - } - newVersionMsg := newVersion - if newVersion == "" { - newVersionMsg = "" - } - t.L().Printf("restarting node %d into version %s", node, newVersionMsg) - // Stop the cockroach process gracefully in order to drain it properly. - // This makes the upgrade closer to how users do it in production, but - // it's also needed to eliminate flakiness. In particular, this will - // make sure that DistSQL draining information is dissipated through - // gossip so that other nodes running an older version don't consider - // this upgraded node for DistSQL plans (see #87154 for more details). - // TODO(yuzefovich): ideally, we would also check that the drain was - // successful since if it wasn't, then we might see flakes too. - if err := c.StopCockroachGracefullyOnNode(ctx, t.L(), node); err != nil { + if err := clusterupgrade.RestartNodesWithNewBinary( + ctx, t.L(), u.c, nodes, option.DefaultStartOpts(), newVersion, t.Cockroach(), t.VersionsBinaryOverride(), + ); err != nil { t.Fatal(err) } - - binary := uploadVersion(ctx, t, c, c.Node(node), newVersion) - settings := install.MakeClusterSettings(install.BinaryOption(binary)) - c.Start(ctx, t.L(), startOpts, settings, c.Node(node)) - - // We have seen cases where a transient error could occur when this - // newly upgraded node serves as a gateway for a distributed query due - // to remote nodes not being able to dial back to the gateway for some - // reason (investigation of it is tracked in #87634). For now, we're - // papering over these flakes by this sleep. For more context, see - // #87104. - // TODO(yuzefovich): remove this sleep once #87634 is fixed. - time.Sleep(4 * time.Second) } } @@ -507,10 +390,6 @@ func allowAutoUpgradeStep(node int) versionStep { } } -// waitForUpgradeStep waits for the cluster version to reach the first node's -// binary version (which is assumed to be every node's binary version). We rely -// on the cluster's internal self-upgrading mechanism. -// // NB: this is intentionally kept separate from binaryUpgradeStep because we run // feature tests between the steps, and we want to expose them (at least // heuristically) to the real-world situation in which some nodes have already @@ -518,26 +397,10 @@ func allowAutoUpgradeStep(node int) versionStep { // situation tends to exhibit unexpected behavior. 
func waitForUpgradeStep(nodes option.NodeListOption) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - newVersion := u.binaryVersion(ctx, t, nodes[0]).String() - t.L().Printf("%s: waiting for cluster to auto-upgrade\n", newVersion) - - for _, i := range nodes { - err := retry.ForDuration(5*time.Minute, func() error { - currentVersion := u.clusterVersion(ctx, t, i).String() - if currentVersion != newVersion { - return fmt.Errorf("%d: expected version %s, got %s", i, newVersion, currentVersion) - } - t.L().Printf("%s: acked by n%d", currentVersion, i) - return nil - }) - if err != nil { - t.Fatal(err) - } + dbFunc := func(node int) *gosql.DB { return u.conn(ctx, t, node) } + if err := clusterupgrade.WaitForClusterUpgrade(ctx, t.L(), nodes, dbFunc); err != nil { + t.Fatal(err) } - - t.L().Printf("%s: nodes %v are upgraded\n", newVersion, nodes) - - // TODO(nvanbenschoten): add upgrade qualification step. } } @@ -610,7 +473,7 @@ func makeVersionFixtureAndFatal( useLocalBinary = true } - predecessorVersion, err := PredecessorVersion(*version.MustParse("v" + makeFixtureVersion)) + predecessorVersion, err := clusterupgrade.PredecessorVersion(*version.MustParse("v" + makeFixtureVersion)) if err != nil { t.Fatal(err) } @@ -658,10 +521,11 @@ func makeVersionFixtureAndFatal( // cluster version might be 2.0, so we can only use the 2.0 or // 2.1 binary, but not the 19.1 binary (as 19.1 and 2.0 are not // compatible). - name := checkpointName(u.binaryVersion(ctx, t, 1).String()) + name := clusterupgrade.CheckpointName(u.binaryVersion(ctx, t, 1).String()) u.c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.All()) - c.Run(ctx, c.All(), binaryPathFromVersion(makeFixtureVersion), "debug", "pebble", "db", "checkpoint", + binaryPath := clusterupgrade.BinaryPathFromVersion(makeFixtureVersion) + c.Run(ctx, c.All(), binaryPath, "debug", "pebble", "db", "checkpoint", "{store-dir}", "{store-dir}/"+name) // The `cluster-bootstrapped` marker can already be found within // store-dir, but the rocksdb checkpoint step above does not pick it From 22a90649d4a3fa6f415e7a0f1df334c24917a228 Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Wed, 16 Nov 2022 22:06:01 -0500 Subject: [PATCH 2/3] roachtest: introduce new mixed-version testing API This introduces a new mixed-version API for roachtests, located under `roachtestutil/mixedversion`. The goal of this API is to provide a higher level framework than the existing `versionUpgradeTest` set of functions. Currently, all mixed-version roachtests rely on teams having to specify, step by step, which release version nodes should start from, how and in what order they upgrade, whether they attempt a rollback, at what point the feature being tested is invoked, etc. All of this makes the process of writing mixed-version roachtests time consuming and takes the focus away from what is actually being tested in mixed-version. In addition, it's typically not feasible to test multiple combinations of event orderings in mixed-version tests, and as a consequence most tests only exercise functionality after all nodes have upgraded to the new version, potentially hiding bugs that are only exposed if _some_ nodes are running a newer binary, while others are running an older release. The new API has the goal of automatically exploring different orderings in each run. 
In the most common scenario, teams will only need to specify what they want tested in a mixed-version setting, and the framework will decide when that feature will be invoked: in some runs, it will happen when all nodes are migrated to the new binary (most common scenario in existing tests); in other runs, it will happen when only some nodes have upgraded; sometimes it will run while upgrade migrations are running in the background; etc. It's also possible to specify multiple features to be tested in mixed-version state, and the framework will randomly schedule them during the test, potentially running them concurrently. The randomness is controlled by a seed that is logged when the test is executed. By reusing the same seed, the same mixed-version test is generated, which helps with reproducibility (but, of course, does not make the test deterministic). Mixed-version tests that use the `mixedversion` package can be broken down into two stages: _planning_ and _execution_. During planning, all mixed-version hooks will be scheduled and a test plan is generated. All randomness is introduced during test plan generation. The execution stage just takes care of running the plan, and does not introduce any other randomness. When the mixed-version test is run, the test plan is printed in a tree-like format, providing a high-level overview of what the test is doing. In addition, each step of the test is assigned a numeric ID, which can be used to reference the output of that step during execution. Each step logs to a different file, and the artifacts containing the output of each step will live in `mixed-version-test/{ID}.log`. The new framework builds on top of the recently introduced `clusterupgrade` package, which provides basic, low-level functions that are also used by the existing mixed-version tests.
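To make the intended workflow concrete, a new-style test might look roughly like the sketch below. This is illustrative only: the function name and the SQL statement are placeholders, and the snippet assumes the usual roachtest imports plus the `mixedversion` package introduced by this patch.

    // Hypothetical test body using the API described above.
    func runMyFeatureMixedVersion(ctx context.Context, t test.Test, c cluster.Cluster) {
        mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All())
        mvt.InMixedVersion("exercise my feature", func(l *logger.Logger, db *gosql.DB) error {
            // The framework decides when this hook runs: with only some
            // nodes upgraded, with all nodes upgraded, or while upgrade
            // migrations are finalizing in the background.
            l.Printf("running feature statement")
            _, err := db.ExecContext(ctx, `SELECT count(*) FROM system.users`)
            return err
        })
        mvt.Run()
    }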
Epic: CRDB-19321 Release note: None --- pkg/BUILD.bazel | 4 + .../roachtestutil/mixedversion/BUILD.bazel | 41 ++ .../mixedversion/mixedversion.go | 626 ++++++++++++++++++ .../roachtestutil/mixedversion/planner.go | 246 +++++++ .../mixedversion/planner_test.go | 189 ++++++ .../roachtestutil/mixedversion/runner.go | 282 ++++++++ pkg/cmd/roachtest/tests/BUILD.bazel | 1 + pkg/roachprod/logger/log.go | 6 +- 8 files changed, 1391 insertions(+), 4 deletions(-) create mode 100644 pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel create mode 100644 pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go create mode 100644 pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go create mode 100644 pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go create mode 100644 pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 7f7e3fb39395..68e6b6ba4ae5 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -134,6 +134,7 @@ ALL_TESTS = [ "//pkg/cmd/reduce/reduce:reduce_test", "//pkg/cmd/release:release_test", "//pkg/cmd/roachtest/clusterstats:clusterstats_test", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:mixedversion_test", "//pkg/cmd/roachtest/tests:tests_test", "//pkg/cmd/roachtest:roachtest_test", "//pkg/cmd/teamcity-trigger:teamcity-trigger_test", @@ -1007,6 +1008,8 @@ GO_TARGETS = [ "//pkg/cmd/roachtest/option:option", "//pkg/cmd/roachtest/registry:registry", "//pkg/cmd/roachtest/roachtestutil/clusterupgrade:clusterupgrade", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:mixedversion", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:mixedversion_test", "//pkg/cmd/roachtest/roachtestutil:roachtestutil", "//pkg/cmd/roachtest/spec:spec", "//pkg/cmd/roachtest/test:test", @@ -2469,6 +2472,7 @@ GET_X_DATA_TARGETS = [ "//pkg/cmd/roachtest/registry:get_x_data", "//pkg/cmd/roachtest/roachtestutil:get_x_data", "//pkg/cmd/roachtest/roachtestutil/clusterupgrade:get_x_data", + "//pkg/cmd/roachtest/roachtestutil/mixedversion:get_x_data", "//pkg/cmd/roachtest/spec:get_x_data", "//pkg/cmd/roachtest/test:get_x_data", "//pkg/cmd/roachtest/tests:get_x_data", diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel new file mode 100644 index 000000000000..75f083b93aee --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel @@ -0,0 +1,41 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "mixedversion", + srcs = [ + "mixedversion.go", + "planner.go", + "runner.go", + ], + importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion", + visibility = ["//visibility:public"], + deps = [ + "//pkg/cmd/roachtest/cluster", + "//pkg/cmd/roachtest/option", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade", + "//pkg/cmd/roachtest/test", + "//pkg/roachpb", + "//pkg/roachprod/logger", + "//pkg/util/randutil", + "//pkg/util/timeutil", + "//pkg/util/version", + "@org_golang_x_sync//errgroup", + ], +) + +go_test( + name = "mixedversion_test", + srcs = ["planner_test.go"], + args = ["-test.timeout=295s"], + embed = [":mixedversion"], + deps = [ + "//pkg/cmd/roachtest/option", + "//pkg/cmd/roachtest/roachtestutil/clusterupgrade", + "//pkg/roachprod/logger", + "//pkg/util/version", + "@com_github_stretchr_testify//require", + ], +) + +get_x_data(name = "get_x_data") diff --git 
a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go new file mode 100644 index 000000000000..f82c40430f69 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go @@ -0,0 +1,626 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +// Package mixedversion implements a framework for testing +// functionality when a cluster is in a mixed-version state. +// +// Mixed-version is defined as points in time when some (potentially +// not all) nodes have been upgraded to a certain binary version, or +// the state when all nodes have been upgraded, but the cluster itself +// is in the process of upgrading to that version (i.e., running +// migrations). This also applies to clusters that have upgraded +// binaries on all nodes and then later decided to downgrade to a +// previous version (which is permitted if the `preserve_downgrade_option` +// cluster setting is set accordingly). +// +// The goal of the framework is to let test writers focus on the +// functionality they want to test in a mixed-version state, rather +// than having to worry about how to set that up, stopping and +// restarting nodes, etc. +// +// In the most common use-case, callers will just call the +// `InMixedVersion` function on the test object defining what they +// want to test in a mixed-version state. The framework will take care +// of exercising that functionality in different mixed-version states, +// randomizing the order of events in each run. A detailed test plan +// is logged *before* the test runs; it facilitates debugging a test +// failure by summarizing all executed steps. +// +// Typical usage: +// +// mvt, err := NewMixedVersionTest(...) +// mvt.InMixedVersion("test my feature", func(l *logger.Logger, db *gosql.DB) error { +// l.Printf("testing feature X") +// _, err := db.ExecContext(ctx, "SELECT * FROM test") +// return err +// }) +// mvt.InMixedVersion("test another feature", func(l *logger.Logger, db *gosql.DB) error { +// l.Printf("testing feature Y") +// _, err := db.ExecContext(ctx, "SELECT * FROM test2") +// return err +// }) +// mvt.Run() +// +// Functions passed to `InMixedVersion` will be called at arbitrary +// points during an upgrade/downgrade process. They may also be called +// sequentially or concurrently. This information will be included in +// the test plan. Any messages logged during the execution of the hook +// should use the Logger instance passed as a parameter to it. This +// logger writes to a `mixed-version-test/{ID}.log` file, allowing +// easy cross referencing between a test step and its corresponding +// output. +// +// The random seed used to generate a test plan is logged; reusing the +// same seed will lead to the exact same test plan being generated. To +// set a specific seed when running a mixed-version test, one can set +// the `COCKROACH_RANDOM_SEED` environment variable. 
+package mixedversion + +import ( + "context" + gosql "database/sql" + "fmt" + "math/rand" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/cockroach/pkg/util/version" +) + +const ( + logPrefix = "mixed-version-test" + startupLabel = "run startup hooks" + mixedVersionLabel = "run mixed-version hooks" + afterTestLabel = "run after test hooks" + + // runWhileMigratingProbability is the probability that a + // mixed-version hook will run after all nodes in the cluster have + // been upgraded to the same binary version, and the cluster version + // is allowed to be upgraded; this typically involves the execution + // of migration steps before the new cluster version can be + // finalized. + runWhileMigratingProbability = 0.5 + + // noDBNeeded is an internal sentinel value expected to be returned + // by steps generated by the test plan; it indicates that the step + // requires no connection to a cockroach node. + noDBNeeded = -1 +) + +var ( + // possibleDelaysMs lists the possible delays to be added to + // concurrent steps + possibleDelaysMs = []int{ + 0, 50, 100, 200, 500, + } +) + +type ( + // Context wraps the context passed to predicate functions that + // dictate when a mixed-version hook will run during a test + Context struct { + // FromVersion is the string representation of the version the + // nodes are migrating from + FromVersion string + // FromVersionNodes are the nodes that are currently running + // `FromVersion` + FromVersionNodes option.NodeListOption + // ToVersion is the string representation of the version the nodes + // are migrating to + ToVersion string + // ToVersionNodes are the nodes that are currently running + // `ToVersion` + ToVersionNodes option.NodeListOption + // Finalizing indicates whether the cluster version is in the + // process of upgrading (i.e., all nodes in the cluster have been + // upgraded to a certain version, and the migrations are being + // executed). + Finalizing bool + } + + userFunc func(*logger.Logger, *gosql.DB) error + predicateFunc func(Context) bool + + // versionUpgradeHook is a hook that can be called at any time + // during an upgrade/downgrade process. The `predicate` is used + // during test planning to determine when it is called: currently, + // it is always auto-generated, but in the future, this package + // could expose an API that would allow callers to have control of + // when their hooks should run. + versionUpgradeHook struct { + name string + predicate predicateFunc + fn userFunc + } + + // testStep is an opaque reference to one step of a mixed-version + // test. It can be a singleStep (see below), or a "meta-step", + // meaning that it combines other steps in some way (for instance, a + // series of steps to be run sequentially or concurrently). + testStep interface{} + + // singleStep represents steps that implement the pieces on top of + // which a mixed-version test is built. In other words, they are not + // composed by other steps and hence can be directly executed.
+ singleStep interface { + // ID returns a unique ID associated with the step, making it easy + // to reference test output with the exact step it relates to + ID() int + // DBNode returns the database node that the step connects to + // during its execution. If the step does not require a database + // connection, this function should return the `noDBNeeded` + // constant + DBNode() int + // Description is a string representation of the step, intended + // for human consumption. Displayed when pretty-printing the test + // plan. + Description() string + // Run implements the actual functionality of the step. + Run(context.Context, *logger.Logger, cluster.Cluster, func(int) *gosql.DB) error + } + + hooks []versionUpgradeHook + + // testHooks groups hooks associated with a mixed-version test in + // its different stages: startup, mixed-version, and after-test. + testHooks struct { + startup hooks + mixedVersion hooks + afterUpgradeFinalized hooks + + prng *rand.Rand + crdbNodes option.NodeListOption + } + + // Test is the main struct callers of this package interact with. + Test struct { + ctx context.Context + cluster cluster.Cluster + logger *logger.Logger + crdbNodes option.NodeListOption + + rt test.Test + prng *rand.Rand + seed int64 + hooks *testHooks + + // test-only field, allowing us to avoid passing a test.Test + // implementation in the tests + _buildVersion version.Version + } +) + +// NewTest creates a Test struct that users can use to create and run +// a mixed-version roachtest. +func NewTest( + ctx context.Context, + t test.Test, + l *logger.Logger, + c cluster.Cluster, + crdbNodes option.NodeListOption, +) *Test { + testLogger, err := prefixedLogger(l, logPrefix) + if err != nil { + t.Fatal(err) + } + + prng, seed := randutil.NewPseudoRand() + testLogger.Printf("random seed: %d", seed) + + return &Test{ + ctx: ctx, + cluster: c, + logger: testLogger, + crdbNodes: crdbNodes, + rt: t, + prng: prng, + seed: seed, + hooks: &testHooks{prng: prng, crdbNodes: crdbNodes}, + } +} + +// InMixedVersion adds a new mixed-version hook to the test. The +// functionality in the function passed as argument to this function +// will be tested in arbitrary mixed-version states. If multiple +// InMixedVersion hooks are passed, they will be executed +// concurrently. +func (t *Test) InMixedVersion(desc string, fn userFunc) { + lastFromVersion := "invalid-version" + var numUpgradedNodes int + predicate := func(testContext Context) bool { + // If the cluster is finalizing an upgrade, run this hook + // according to the probability defined in the package. + if testContext.Finalizing { + return t.prng.Float64() < runWhileMigratingProbability + } + + // This check makes sure we only schedule a mixed-version hook + // once while upgrading from one version to another. The number of + // nodes we wait to be running the new version is determined when + // the version changes for the first time. + if testContext.FromVersion != lastFromVersion { + lastFromVersion = testContext.FromVersion + numUpgradedNodes = t.prng.Intn(len(t.crdbNodes)) + 1 + } + + return len(testContext.ToVersionNodes) == numUpgradedNodes + } + + t.hooks.AddMixedVersion(versionUpgradeHook{desc, predicate, fn}) +} + +// OnStartup registers a callback that is run once the cluster is +// initialized (i.e., all nodes in the cluster start running CRDB at a +// certain previous version, potentially from existing fixtures). If +// multiple OnStartup hooks are passed, they will be executed +// concurrently.
+func (t *Test) OnStartup(desc string, fn userFunc) { + // Since the callbacks here are only referenced in the setup steps + // of the planner, there is no need to have a predicate function + // gating them. + t.hooks.AddStartup(versionUpgradeHook{desc, nil, fn}) +} + +// AfterUpgradeFinalized registers a callback that is run once the +// mixed-version test has brought the cluster to the latest version, +// and allowed the upgrade to finalize successfully. If multiple such +// hooks are passed, they will be executed concurrently. +func (t *Test) AfterUpgradeFinalized(desc string, fn userFunc) { + t.hooks.AddAfterUpgradeFinalized(versionUpgradeHook{desc, nil, fn}) +} + +// Run runs the mixed-version test. It should be called once all +// startup, mixed-version, and after-test hooks have been declared. A +// test plan will be generated (and logged), and the test will be +// carried out. +func (t *Test) Run() { + plan, err := t.plan() + if err != nil { + t.rt.Fatal(fmt.Errorf("error creating test plan: %w", err)) + } + + t.logger.Printf(plan.PrettyPrint()) + + if err := t.run(plan); err != nil { + t.rt.Fatal(err) + } +} + +func (t *Test) run(plan *TestPlan) error { + return newTestRunner( + t.ctx, plan, t.logger, t.cluster, t.crdbNodes, t.seed, + ).run() +} + +func (t *Test) plan() (*TestPlan, error) { + previousRelease, err := clusterupgrade.PredecessorVersion(t.buildVersion()) + if err != nil { + return nil, err + } + + planner := testPlanner{ + initialVersion: previousRelease, + rt: t.rt, + crdbNodes: t.crdbNodes, + hooks: t.hooks, + prng: t.prng, + } + + return planner.Plan(), nil +} + +func (t *Test) buildVersion() version.Version { + if t._buildVersion != (version.Version{}) { + return t._buildVersion // test-only + } + + return *t.rt.BuildVersion() +} + +// startFromCheckpointStep is the step that starts the cluster from a +// specific `version`, using checked-in fixtures. +type startFromCheckpointStep struct { + id int + rt test.Test + version string + crdbNodes option.NodeListOption +} + +func (s startFromCheckpointStep) ID() int { return s.id } +func (s startFromCheckpointStep) DBNode() int { return noDBNeeded } + +func (s startFromCheckpointStep) Description() string { + return fmt.Sprintf("starting cluster from fixtures for version %q", s.version) +} + +// Run will copy the fixtures to all database nodes in the cluster, +// upload the binary associated with that given version, and finally +// start the cockroach binary on these nodes. +func (s startFromCheckpointStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + if err := clusterupgrade.InstallFixtures(ctx, l, c, s.crdbNodes, s.version); err != nil { + return err + } + + binaryPath, err := clusterupgrade.UploadVersion(ctx, s.rt, l, c, s.crdbNodes, s.version) + if err != nil { + return err + } + + startOpts := option.DefaultStartOpts() + startOpts.RoachprodOpts.Sequential = false + clusterupgrade.StartWithBinary(ctx, l, c, s.crdbNodes, binaryPath, startOpts) + return nil +} + +// waitForStableClusterVersionStep implements the process of waiting +// for the `version` cluster setting being the same on all nodes of +// the cluster and equal to the binary version of the first node in +// the `nodes` field. 
+type waitForStableClusterVersionStep struct { + id int + nodes option.NodeListOption +} + +func (s waitForStableClusterVersionStep) ID() int { return s.id } +func (s waitForStableClusterVersionStep) DBNode() int { return noDBNeeded } + +func (s waitForStableClusterVersionStep) Description() string { + return fmt.Sprintf( + "wait for nodes %v to all have the same cluster version (same as binary version of node %d)", + s.nodes, s.nodes[0], + ) +} + +func (s waitForStableClusterVersionStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + return clusterupgrade.WaitForClusterUpgrade(ctx, l, s.nodes, dbFunc) +} + +// preserveDowngradeOptionStep sets the `preserve_downgrade_option` +// cluster setting to the binary version running in `node`. +type preserveDowngradeOptionStep struct { + id int + node int +} + +func (s preserveDowngradeOptionStep) ID() int { return s.id } +func (s preserveDowngradeOptionStep) DBNode() int { return s.node } + +func (s preserveDowngradeOptionStep) Description() string { + return "preventing auto-upgrades by setting `preserve_downgrade_option`" +} + +func (s preserveDowngradeOptionStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + db := dbFunc(s.node) + + l.Printf("checking binary version on node %d", s.node) + bv, err := clusterupgrade.BinaryVersion(db) + if err != nil { + return err + } + + downgradeOption := bv.String() + l.Printf("setting `preserve_downgrade_option` to %s", downgradeOption) + _, err = db.ExecContext(ctx, "SET CLUSTER SETTING cluster.preserve_downgrade_option = $1", downgradeOption) + return err +} + +// restartWithNewBinaryStep restarts a certain `node` with a new +// cockroach binary. Any existing `cockroach` process will be stopped, +// then the new binary will be uploaded and the `cockroach` process +// will restart using the new binary. +type restartWithNewBinaryStep struct { + id int + version string + rt test.Test + node int +} + +func (s restartWithNewBinaryStep) ID() int { return s.id } +func (s restartWithNewBinaryStep) DBNode() int { return noDBNeeded } + +func (s restartWithNewBinaryStep) Description() string { + return fmt.Sprintf("restart node %d with binary version %s", s.node, versionMsg(s.version)) +} + +func (s restartWithNewBinaryStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + return clusterupgrade.RestartNodesWithNewBinary( + ctx, + s.rt, + l, + c, + c.Node(s.node), + option.DefaultStartOpts(), + s.version, + ) +} + +// finalizeUpgradeStep resets the `preserve_downgrade_option` cluster +// setting, allowing the upgrade migrations to run and the cluster +// version to eventually reach the binary version on the nodes. 
+type finalizeUpgradeStep struct { + id int + node int +} + +func (s finalizeUpgradeStep) ID() int { return s.id } +func (s finalizeUpgradeStep) DBNode() int { return s.node } + +func (s finalizeUpgradeStep) Description() string { + return "finalize upgrade by resetting `preserve_downgrade_option`" +} + +func (s finalizeUpgradeStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + db := dbFunc(s.node) + l.Printf("resetting preserve_downgrade_option") + _, err := db.ExecContext(ctx, "RESET CLUSTER SETTING cluster.preserve_downgrade_option") + return err +} + +// runHookStep is a step used to run a user-provided hook (i.e., +// callbacks passed to `OnStartup`, `InMixedVersion`, or +// `AfterUpgradeFinalized`). +type runHookStep struct { + id int + node int + hook versionUpgradeHook +} + +func (s runHookStep) ID() int { return s.id } +func (s runHookStep) DBNode() int { return s.node } + +func (s runHookStep) Description() string { + return fmt.Sprintf("run %q", s.hook.name) +} + +func (s runHookStep) Run( + ctx context.Context, l *logger.Logger, c cluster.Cluster, dbFunc func(int) *gosql.DB, +) error { + return s.hook.fn(l, dbFunc(s.node)) +} + +// sequentialRunStep is a "meta-step" that indicates that a sequence +// of steps is to be executed sequentially. The default test runner +// already runs steps sequentially. This meta-step exists primarily as +// a way to group related steps so that a test plan is easier to +// understand for a human. +type sequentialRunStep struct { + label string + steps []testStep +} + +func (s sequentialRunStep) Description() string { + return s.label +} + +// delayedStep is a thin wrapper around a test step that marks steps +// with a random delay that should be honored before the step starts +// executing. This is useful in steps that are to be executed +// concurrently to avoid biases of certain execution orders. While the +// Go scheduler is non-deterministic, in practice schedules are not +// uniformly distributed, and the delay is intended to expose scenarios +// where a specific ordering of events leads to a failure. +type delayedStep struct { + delay time.Duration + step testStep +} + +// concurrentRunStep is a "meta-step" that groups multiple test steps +// that are meant to be executed concurrently. A random delay is added +// before each step starts, see notes on `delayedStep`. +type concurrentRunStep struct { + label string + delayedSteps []testStep +} + +func newConcurrentRunStep(label string, steps []testStep, rng *rand.Rand) concurrentRunStep { + delayedSteps := make([]testStep, 0, len(steps)) + for _, step := range steps { + delayedSteps = append(delayedSteps, delayedStep{delay: randomDelay(rng), step: step}) + } + + return concurrentRunStep{label: label, delayedSteps: delayedSteps} +} + +func (s concurrentRunStep) Description() string { + return fmt.Sprintf("%s concurrently", s.label) +} + +// prefixedLogger returns a logger instance off of the given `l` +// parameter, and adds a prefix to everything logged by the returned +// logger.
+func prefixedLogger(l *logger.Logger, prefix string) (*logger.Logger, error) { + fileName := strings.ReplaceAll(prefix, " ", "-") + formattedPrefix := fmt.Sprintf("[%s] ", fileName) + return l.ChildLogger(fileName, logger.LogPrefix(formattedPrefix)) +} + +func randomNode(rng *rand.Rand, nodes option.NodeListOption) int { + idx := rng.Intn(len(nodes)) + return nodes[idx] +} + +func (h hooks) Filter(testContext Context) hooks { + var result hooks + for _, hook := range h { + if hook.predicate(testContext) { + result = append(result, hook) + } + } + + return result +} + +// AsSteps transforms the sequence of hooks into a corresponding test +// step. If there is only one hook, the corresponding `runHookStep` is +// returned. Otherwise, a `concurrentRunStep` is returned, where every +// hook is run concurrently. +func (h hooks) AsSteps( + label string, idGen func() int, rng *rand.Rand, nodes option.NodeListOption, +) []testStep { + steps := make([]testStep, 0, len(h)) + for _, hook := range h { + rhs := runHookStep{id: idGen(), node: randomNode(rng, nodes), hook: hook} + steps = append(steps, rhs) + } + + if len(steps) <= 1 { + return steps + } + + return []testStep{newConcurrentRunStep(label, steps, rng)} +} + +func (th *testHooks) AddStartup(hook versionUpgradeHook) { + th.startup = append(th.startup, hook) +} + +func (th *testHooks) AddMixedVersion(hook versionUpgradeHook) { + th.mixedVersion = append(th.mixedVersion, hook) +} + +func (th *testHooks) AddAfterUpgradeFinalized(hook versionUpgradeHook) { + th.afterUpgradeFinalized = append(th.afterUpgradeFinalized, hook) +} + +func (th *testHooks) StartupSteps(idGen func() int) []testStep { + return th.startup.AsSteps(startupLabel, idGen, th.prng, th.crdbNodes) +} + +func (th *testHooks) MixedVersionSteps(testContext Context, idGen func() int) []testStep { + return th.mixedVersion.Filter(testContext).AsSteps(mixedVersionLabel, idGen, th.prng, th.crdbNodes) +} + +func (th *testHooks) AfterUpgradeFinalizedSteps(idGen func() int) []testStep { + return th.afterUpgradeFinalized.AsSteps(afterTestLabel, idGen, th.prng, th.crdbNodes) +} + +func randomDelay(rng *rand.Rand) time.Duration { + idx := rng.Intn(len(possibleDelaysMs)) + return time.Duration(possibleDelaysMs[idx]) * time.Millisecond +} + +func versionMsg(version string) string { + return clusterupgrade.VersionMsg(version) +} diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go new file mode 100644 index 000000000000..7a3c58922d62 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go @@ -0,0 +1,246 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package mixedversion + +import ( + "fmt" + "math/rand" + "strings" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" +) + +type ( + // TestPlan is the output of planning a mixed-version test. The + // initialVersion and finalVersion fields are just for presentation + // purposes, as the plan is defined by the sequence of steps that it + // contains. 
+ TestPlan struct { + initialVersion string + finalVersion string + steps []testStep + } + + // testPlanner wraps the state and the logic involved in generating + // a test plan from the given rng and user-provided hooks. + testPlanner struct { + stepCount int + initialVersion string + crdbNodes option.NodeListOption + rt test.Test + hooks *testHooks + prng *rand.Rand + } +) + +const ( + branchString = "├──" + nestedBranchString = "│ " + lastBranchString = "└──" + lastBranchPadding = " " +) + +// Plan returns the TestPlan generated using the `prng` in the +// testPlanner field. Currently, the test will always follow the +// following high level outline: +// +// - start all nodes in the cluster from the predecessor version, +// using fixtures. +// - set `preserve_downgrade_option`. +// - run startup hooks. +// - upgrade all nodes to the current cockroach version (running +// mixed-version hooks at times determined by the planner). +// - downgrade all nodes back to the predecessor version (running +// mixed-version hooks again). +// - upgrade all nodes back to the current cockroach version one +// more time (running mixed-version hooks). +// - finally, reset `preserve_downgrade_option`, allowing the +// cluster to upgrade. Mixed-version hooks may be executed while +// this is happening. +// - run after-test hooks. +// +// TODO(renato): further opportunities for random exploration: going +// back multiple releases instead of just one; picking a patch release +// randomly instead of just the latest release. +func (p *testPlanner) Plan() *TestPlan { + var steps []testStep + addSteps := func(ss []testStep) { steps = append(steps, ss...) } + + addSteps(p.initSteps()) + + // previous -> current + addSteps(p.upgradeSteps(p.initialVersion, clusterupgrade.MainVersion)) + // current -> previous (rollback) + addSteps(p.downgradeSteps(clusterupgrade.MainVersion, p.initialVersion)) + // previous -> current + addSteps(p.upgradeSteps(p.initialVersion, clusterupgrade.MainVersion)) + // finalize + addSteps(p.finalizeUpgradeSteps()) + + addSteps(p.finalSteps()) + return &TestPlan{ + initialVersion: p.initialVersion, + finalVersion: versionMsg(clusterupgrade.MainVersion), + steps: steps, + } +} + +// initSteps returns the sequence of steps that should be executed +// before we start changing binaries on nodes in the process of +// upgrading/downgrading. It will also run any startup hooks the user +// may have provided. +func (p *testPlanner) initSteps() []testStep { + preserveDowngradeNode := randomNode(p.prng, p.crdbNodes) + return append([]testStep{ + startFromCheckpointStep{id: p.nextID(), version: p.initialVersion, rt: p.rt, crdbNodes: p.crdbNodes}, + waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes}, + preserveDowngradeOptionStep{id: p.nextID(), node: preserveDowngradeNode}, + }, p.hooks.StartupSteps(p.nextID)...) +} + +// finalSteps are the steps to be run once the nodes have been +// upgraded/downgraded. It will wait for the cluster version on all +// nodes to be the same and then run any after-finalization hooks the +// user may have provided. +func (p *testPlanner) finalSteps() []testStep { + return append([]testStep{ + waitForStableClusterVersionStep{id: p.nextID(), nodes: p.crdbNodes}, + }, p.hooks.AfterUpgradeFinalizedSteps(p.nextID)...) 
+} + +func (p *testPlanner) upgradeSteps(from, to string) []testStep { + msg := fmt.Sprintf("upgrade nodes %v from %q to %q", p.crdbNodes, versionMsg(from), versionMsg(to)) + return p.changeVersionSteps(from, to, msg) +} + +func (p *testPlanner) downgradeSteps(from, to string) []testStep { + msg := fmt.Sprintf("downgrade nodes %v from %q to %q", p.crdbNodes, versionMsg(from), versionMsg(to)) + return p.changeVersionSteps(from, to, msg) +} + +// changeVersionSteps returns the sequence of steps to be performed +// when we are changing the version of cockroach in the nodes of a +// cluster (either upgrading to a new binary, or downgrading to a +// predecessor version). +func (p *testPlanner) changeVersionSteps(from, to, label string) []testStep { + // copy `crdbNodes` here so that shuffling won't mutate that array + oldVersionNodes := append(option.NodeListOption{}, p.crdbNodes...) + newVersionNodes := option.NodeListOption{} + + // change the binary version on the nodes in random order + p.prng.Shuffle(len(oldVersionNodes), func(i, j int) { + oldVersionNodes[i], oldVersionNodes[j] = oldVersionNodes[j], oldVersionNodes[i] + }) + + var steps []testStep + nodeOrder := append(option.NodeListOption{}, oldVersionNodes...) + for _, node := range nodeOrder { + steps = append(steps, restartWithNewBinaryStep{id: p.nextID(), version: to, node: node, rt: p.rt}) + oldVersionNodes = oldVersionNodes[1:] + newVersionNodes = append(newVersionNodes, node) + + testContext := Context{ + FromVersion: from, + FromVersionNodes: oldVersionNodes, + ToVersion: to, + ToVersionNodes: newVersionNodes, + } + steps = append(steps, p.hooks.MixedVersionSteps(testContext, p.nextID)...) + } + + return []testStep{sequentialRunStep{label: label, steps: steps}} +} + +// finalizeUpgradeSteps finalizes the upgrade by resetting the +// `preserve_downgrade_option` and potentially running mixed-version +// hooks while the cluster version is changing. +func (p *testPlanner) finalizeUpgradeSteps() []testStep { + testContext := Context{Finalizing: true} + finalizeNode := randomNode(p.prng, p.crdbNodes) + return append([]testStep{ + finalizeUpgradeStep{id: p.nextID(), node: finalizeNode}, + }, p.hooks.MixedVersionSteps(testContext, p.nextID)...) +} + +func (p *testPlanner) nextID() int { + p.stepCount++ + return p.stepCount +} + +// PrettyPrint displays a tree-like view of the mixed-version test +// plan, useful when debugging mixed-version test failures. Each step +// is assigned an ID, making it easy to correlate the step that +// failed with the point in the test where the failure occurred. See +// test file for an example of the output of this function. 
+func (plan *TestPlan) PrettyPrint() string { + var out strings.Builder + for i, step := range plan.steps { + plan.prettyPrintStep(&out, step, treeBranchString(i, len(plan.steps))) + } + + return fmt.Sprintf( + "mixed-version test plan for upgrading from %s to %s:\n%s", + plan.initialVersion, plan.finalVersion, out.String(), + ) +} + +func (plan *TestPlan) prettyPrintStep(out *strings.Builder, step testStep, prefix string) { + writeNested := func(label string, steps []testStep) { + out.WriteString(fmt.Sprintf("%s %s\n", prefix, label)) + for i, subStep := range steps { + nestedPrefix := strings.ReplaceAll(prefix, branchString, nestedBranchString) + nestedPrefix = strings.ReplaceAll(nestedPrefix, lastBranchString, lastBranchPadding) + subPrefix := fmt.Sprintf("%s%s", nestedPrefix, treeBranchString(i, len(steps))) + plan.prettyPrintStep(out, subStep, subPrefix) + } + } + + // writeRunnable is the function that generates the description for + // a singleStep. It can include extra information, such as whether + // there's a delay associated with the step (in the case of + // concurrent execution), and what database node the step is + // connecting to. + writeSingle := func(rs singleStep, extraContext ...string) { + if node := rs.DBNode(); node != noDBNeeded { + dbinfo := fmt.Sprintf("with connection to node %d", node) + extraContext = append([]string{dbinfo}, extraContext...) + } + var extras string + if contextStr := strings.Join(extraContext, ", "); contextStr != "" { + extras = ", " + contextStr + } + out.WriteString(fmt.Sprintf( + "%s %s%s (%d)\n", prefix, rs.Description(), extras, rs.ID(), + )) + } + + switch s := step.(type) { + case sequentialRunStep: + writeNested(s.Description(), s.steps) + case concurrentRunStep: + writeNested(s.Description(), s.delayedSteps) + case delayedStep: + delayStr := fmt.Sprintf("after %s delay", s.delay) + writeSingle(s.step.(singleStep), delayStr) + default: + writeSingle(s.(singleStep)) + } +} + +func treeBranchString(idx, sliceLen int) string { + if idx == sliceLen-1 { + return lastBranchString + } + + return branchString +} diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go new file mode 100644 index 000000000000..b7d560b8c2ba --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go @@ -0,0 +1,189 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +package mixedversion + +import ( + "context" + gosql "database/sql" + "fmt" + "io" + "math/rand" + "testing" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/version" + "github.com/stretchr/testify/require" +) + +var ( + nilLogger = func() *logger.Logger { + cfg := logger.Config{ + Stdout: io.Discard, + Stderr: io.Discard, + } + l, err := cfg.NewLogger("" /* path */) + if err != nil { + panic(err) + } + + return l + }() + + ctx = context.Background() + nodes = option.NodeListOption{1, 2, 3, 4} + + // Hardcode build and previous versions so that the test won't fail + // when new versions are released. + buildVersion = func() version.Version { + bv, err := version.Parse("v23.1.0") + if err != nil { + panic(err) + } + return *bv + }() + previousVersion = func() string { + pv, err := clusterupgrade.PredecessorVersion(buildVersion) + if err != nil { + panic(err) + } + return pv + }() +) + +const ( + seed = 12345 // expectations are based on this seed + mainVersion = clusterupgrade.MainVersion +) + +func TestTestPlanner(t *testing.T) { + mvt := newTest(t) + mvt.InMixedVersion("mixed-version 1", dummyHook) + mvt.InMixedVersion("mixed-version 2", dummyHook) + + plan, err := mvt.plan() + require.NoError(t, err) + require.Len(t, plan.steps, 9) + + // Assert on the pretty-printed version of the test plan as that + // asserts the ordering of the steps we want to take, and as a bonus + // tests the printing function itself. + expectedPrettyPlan := fmt.Sprintf(` +mixed-version test plan for upgrading from %[1]s to : +├── starting cluster from fixtures for version "%[1]s" (1) +├── wait for nodes :1-4 to all have the same cluster version (same as binary version of node 1) (2) +├── preventing auto-upgrades by setting `+"`preserve_downgrade_option`"+`, with connection to node 4 (3) +├── upgrade nodes :1-4 from "%[1]s" to "" +│ ├── restart node 2 with binary version (4) +│ ├── restart node 1 with binary version (5) +│ ├── run "mixed-version 1", with connection to node 3 (6) +│ ├── restart node 4 with binary version (7) +│ ├── restart node 3 with binary version (8) +│ └── run "mixed-version 2", with connection to node 3 (9) +├── downgrade nodes :1-4 from "" to "%[1]s" +│ ├── restart node 3 with binary version %[1]s (10) +│ ├── restart node 4 with binary version %[1]s (11) +│ ├── run mixed-version hooks concurrently +│ │ ├── run "mixed-version 1", with connection to node 1, after 200ms delay (12) +│ │ └── run "mixed-version 2", with connection to node 1, after 200ms delay (13) +│ ├── restart node 2 with binary version %[1]s (14) +│ └── restart node 1 with binary version %[1]s (15) +├── upgrade nodes :1-4 from "%[1]s" to "" +│ ├── restart node 3 with binary version (16) +│ ├── run "mixed-version 1", with connection to node 1 (17) +│ ├── restart node 4 with binary version (18) +│ ├── restart node 1 with binary version (19) +│ ├── restart node 2 with binary version (20) +│ └── run "mixed-version 2", with connection to node 2 (21) +├── finalize upgrade by resetting `+"`preserve_downgrade_option`"+`, with connection to node 3 (22) +├── run "mixed-version 2", with connection to node 1 (23) +└── wait for nodes :1-4 to all have the same cluster version (same as binary version of node 1) (24) +`, previousVersion, + ) + + expectedPrettyPlan = expectedPrettyPlan[1:] // remove leading newline + require.Equal(t, 
expectedPrettyPlan, plan.PrettyPrint()) + + // Assert that startup hooks are scheduled to run before any + // upgrades, i.e., after cluster is initialized (step 1), and after + // we wait for the cluster version to match on all nodes (step 2). + mvt = newTest(t) + mvt.OnStartup("startup 1", dummyHook) + mvt.OnStartup("startup 2", dummyHook) + plan, err = mvt.plan() + require.NoError(t, err) + requireConcurrentHooks(t, plan.steps[3], "startup 1", "startup 2") + + // Assert that AfterUpgradeFinalized hooks are scheduled to run in + // the last step of the test. + mvt = newTest(t) + mvt.AfterUpgradeFinalized("finalizer 1", dummyHook) + mvt.AfterUpgradeFinalized("finalizer 2", dummyHook) + mvt.AfterUpgradeFinalized("finalizer 3", dummyHook) + plan, err = mvt.plan() + require.NoError(t, err) + require.Len(t, plan.steps, 9) + requireConcurrentHooks(t, plan.steps[8], "finalizer 1", "finalizer 2", "finalizer 3") +} + +// TestDeterministicTestPlan tests that generating a test plan with +// the same seed multiple times yields the same plan every time. +func TestDeterministicTestPlan(t *testing.T) { + makePlan := func() *TestPlan { + mvt := newTest(t) + mvt.InMixedVersion("mixed-version 1", dummyHook) + mvt.InMixedVersion("mixed-version 2", dummyHook) + + plan, err := mvt.plan() + require.NoError(t, err) + return plan + } + + expectedPlan := makePlan() + const numRuns = 50 + for j := 0; j < numRuns; j++ { + require.Equal(t, expectedPlan.PrettyPrint(), makePlan().PrettyPrint(), "j = %d", j) + } +} + +func newTest(t *testing.T) *Test { + prng := rand.New(rand.NewSource(seed)) + return &Test{ + ctx: ctx, + logger: nilLogger, + crdbNodes: nodes, + _buildVersion: buildVersion, + prng: prng, + hooks: &testHooks{prng: prng, crdbNodes: nodes}, + } +} + +// requireConcurrentHooks asserts that the given step is a concurrent +// run of multiple user-provided hooks with the names passed as +// parameter. +func requireConcurrentHooks(t *testing.T, step testStep, names ...string) { + require.IsType(t, concurrentRunStep{}, step) + crs := step.(concurrentRunStep) + require.Len(t, crs.delayedSteps, len(names)) + + for j, concurrentStep := range crs.delayedSteps { + require.IsType(t, delayedStep{}, concurrentStep) + ds := concurrentStep.(delayedStep) + require.IsType(t, runHookStep{}, ds.step) + rhs := ds.step.(runHookStep) + require.Equal(t, names[j], rhs.hook.name, "j = %d", j) + } +} + +func dummyHook(*logger.Logger, *gosql.DB) error { + return nil +} diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go new file mode 100644 index 000000000000..07422d12195c --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go @@ -0,0 +1,282 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +package mixedversion + +import ( + "context" + gosql "database/sql" + "fmt" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "golang.org/x/sync/errgroup" +) + +type ( + testRunner struct { + ctx context.Context + plan *TestPlan + cluster cluster.Cluster + crdbNodes option.NodeListOption + seed int64 + logger *logger.Logger + + binaryVersions []roachpb.Version + clusterVersions []roachpb.Version + + connCache []*gosql.DB + } +) + +var ( + // everything that is not an alphanum or a few special characters + invalidChars = regexp.MustCompile(`[^a-zA-Z0-9 \-_\.]`) +) + +func newTestRunner( + ctx context.Context, + plan *TestPlan, + l *logger.Logger, + c cluster.Cluster, + crdbNodes option.NodeListOption, + randomSeed int64, +) *testRunner { + return &testRunner{ + ctx: ctx, + plan: plan, + logger: l, + cluster: c, + crdbNodes: crdbNodes, + seed: randomSeed, + } +} + +// run implements the test running logic, which boils down to running +// each step in sequence. +func (tr *testRunner) run() error { + defer tr.closeConnections() + + for _, step := range tr.plan.steps { + if err := tr.runStep(step); err != nil { + return err + } + } + + return nil +} + +// runStep contains the logic of running a single test step, called +// recursively in the case of sequentialRunStep and concurrentRunStep. +func (tr *testRunner) runStep(step testStep) error { + if ss, ok := step.(singleStep); ok && ss.ID() > 1 { + // if we are running a singleStep that is *not* the first step, + // we can initialize the database connections. This represents the + // assumption that the first step in the test plan is the one that + // sets up binaries and make the `cockroach` process available on + // the nodes. + // TODO(renato): consider a way to make the test runner crash if + // the assumption does not hold + if err := tr.maybeInitConnections(); err != nil { + return err + } + if err := tr.refreshBinaryVersions(); err != nil { + return err + } + if err := tr.refreshClusterVersions(); err != nil { + return err + } + } + + switch s := step.(type) { + case sequentialRunStep: + for _, ss := range s.steps { + if err := tr.runStep(ss); err != nil { + return err + } + } + return nil + + case concurrentRunStep: + group, _ := errgroup.WithContext(tr.ctx) + for _, cs := range s.delayedSteps { + cs := cs + group.Go(func() error { + return tr.runStep(cs) + }) + } + return group.Wait() + + case delayedStep: + time.Sleep(s.delay) + return tr.runStep(s.step) + + default: + ss := s.(singleStep) + stepLogger, err := tr.loggerFor(ss) + if err != nil { + return err + } + + tr.logStep("STARTING", ss, stepLogger) + start := timeutil.Now() + defer func() { + prefix := fmt.Sprintf("FINISHED [%s]", timeutil.Since(start)) + tr.logStep(prefix, ss, stepLogger) + }() + if err := ss.Run(tr.ctx, stepLogger, tr.cluster, tr.conn); err != nil { + return tr.reportError(err, ss, stepLogger) + } + + return nil + } +} + +// reportError augments the error passed with extra +// information. 
Specifically, the error message will include the ID of +// the step that failed, the random seed used, the binary version on +// each node when the error occurred, and the cluster version before +// and after the step (in case the failure happened *while* the +// cluster version was updating). +func (tr *testRunner) reportError(err error, step singleStep, l *logger.Logger) error { + errMsg := fmt.Sprintf("mixed-version test failure while running step %d (%s): %s", + step.ID(), step.Description(), err, + ) + debugInfo := func(label, value string) string { + return fmt.Sprintf("%-40s%s", label+":", value) + } + seedInfo := debugInfo("test random seed", strconv.FormatInt(tr.seed, 10)) + binaryVersions := debugInfo("binary versions", formatVersions(tr.binaryVersions)) + clusterVersionsBefore := debugInfo("cluster versions before failure", formatVersions(tr.clusterVersions)) + var clusterVersionsAfter string + if err := tr.refreshClusterVersions(); err == nil { + clusterVersionsBefore += "\n" + clusterVersionsAfter = debugInfo("cluster versions after failure", formatVersions(tr.clusterVersions)) + } else { + tr.logger.Printf("failed to fetch cluster versions after failure: %s", err) + } + + if err := renameFailedLogger(l); err != nil { + tr.logger.Printf("could not rename failed step logger: %v", err) + } + + return fmt.Errorf( + "%s\n%s\n%s\n%s%s", + errMsg, seedInfo, binaryVersions, clusterVersionsBefore, clusterVersionsAfter, + ) +} + +func (tr *testRunner) logStep(prefix string, step singleStep, l *logger.Logger) { + dashes := strings.Repeat("-", 10) + l.Printf("%[1]s %s (%d): %s %[1]s", dashes, prefix, step.ID(), step.Description()) +} + +// loggerFor creates a logger instance to be used by a test step. Logs +// will be available under `mixed-version-test/{ID}.log`, making it +// easy to go from the IDs displayed in the test plan to the +// corresponding output of that step. +func (tr *testRunner) loggerFor(step singleStep) (*logger.Logger, error) { + name := invalidChars.ReplaceAllString(strings.ToLower(step.Description()), "") + name = fmt.Sprintf("%d_%s", step.ID(), name) + + prefix := fmt.Sprintf("%s/%s", logPrefix, name) + return prefixedLogger(tr.logger, prefix) +} + +// refreshBinaryVersions updates the internal `binaryVersions` field +// with the binary version running on each node of the cluster. +func (tr *testRunner) refreshBinaryVersions() error { + tr.binaryVersions = make([]roachpb.Version, 0, len(tr.crdbNodes)) + for _, node := range tr.crdbNodes { + bv, err := clusterupgrade.BinaryVersion(tr.conn(node)) + if err != nil { + return fmt.Errorf("failed to get binary version for node %d: %w", node, err) + } + tr.binaryVersions = append(tr.binaryVersions, bv) + } + + return nil +} + +// refreshClusterVersions updates the internal `clusterVersions` field +// with the current view of the cluster version in each of the nodes +// of the cluster. +func (tr *testRunner) refreshClusterVersions() error { + tr.clusterVersions = make([]roachpb.Version, 0, len(tr.crdbNodes)) + for _, node := range tr.crdbNodes { + cv, err := clusterupgrade.ClusterVersion(tr.ctx, tr.conn(node)) + if err != nil { + return fmt.Errorf("failed to get cluster version for node %d: %w", node, err) + } + tr.clusterVersions = append(tr.clusterVersions, cv) + } + + return nil +} + +// maybeInitConnections initializes connections if the connection cache +// is empty.
+func (tr *testRunner) maybeInitConnections() error { + if tr.connCache != nil { + return nil + } + + tr.connCache = make([]*gosql.DB, len(tr.crdbNodes)) + for _, node := range tr.crdbNodes { + conn, err := tr.cluster.ConnE(tr.ctx, tr.logger, node) + if err != nil { + return fmt.Errorf("failed to connect to node %d: %w", node, err) + } + + tr.connCache[node-1] = conn + } + + return nil +} + +// conn returns a database connection to the given node. Assumes the +// connection cache has been previously initialized. +func (tr *testRunner) conn(node int) *gosql.DB { + return tr.connCache[node-1] +} + +func (tr *testRunner) closeConnections() { + for _, db := range tr.connCache { + if db != nil { + _ = db.Close() + } + } +} + +func renameFailedLogger(l *logger.Logger) error { + currentFileName := l.File.Name() + newLogName := strings.TrimSuffix(currentFileName, filepath.Ext(currentFileName)) + newLogName += "_FAILED.log" + return os.Rename(currentFileName, newLogName) +} + +func formatVersions(versions []roachpb.Version) string { + var pairs []string + for idx, version := range versions { + pairs = append(pairs, fmt.Sprintf("%d: %s", idx+1, version)) + } + + return fmt.Sprintf("[%s]", strings.Join(pairs, ", ")) +} diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index cc1fb8664704..7823ff521582 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -183,6 +183,7 @@ go_library( "//pkg/cmd/roachtest/registry", "//pkg/cmd/roachtest/roachtestutil", "//pkg/cmd/roachtest/roachtestutil/clusterupgrade", + "//pkg/cmd/roachtest/roachtestutil/mixedversion", "//pkg/cmd/roachtest/spec", "//pkg/cmd/roachtest/test", "//pkg/gossip", diff --git a/pkg/roachprod/logger/log.go b/pkg/roachprod/logger/log.go index 509cb82a882a..59352e2b1d40 100644 --- a/pkg/roachprod/logger/log.go +++ b/pkg/roachprod/logger/log.go @@ -38,11 +38,9 @@ type loggerOption interface { apply(*Config) } -type logPrefix string +type LogPrefix string -var _ logPrefix // silence unused lint - -func (p logPrefix) apply(cfg *Config) { +func (p LogPrefix) apply(cfg *Config) { cfg.Prefix = string(p) } From bbccd8f460ff646cc2771ed2ccdbe712f35eb759 Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Wed, 16 Nov 2022 22:06:31 -0500 Subject: [PATCH 3/3] roachtest: rewrite acceptance/version-upgrade to use new API This commit migrates the `acceptance/version-upgrade` test to use the new API in the `mixedversion` package. Two features were previously being tested in mixed-version state in this test: backups, and basic functionality that can be tested by running a SQL statement. This change simplifies the code as it is no longer required to set up the nodes, and the concept of the `versionFeatureStep` was removed as it is not necessary and was only being used by this test.
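In outline, the migrated test registers both features as `InMixedVersion` hooks and lets the framework decide when they run; the snippet below is a condensed, hypothetical view of that shape (the actual hook bodies are in the diff that follows):

    mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All())
    mvt.InMixedVersion("run backup", func(l *logger.Logger, db *gosql.DB) error {
        // Back up to a unique nodelocal destination, as in the diff below.
        dest := fmt.Sprintf("nodelocal://0/%d", timeutil.Now().UnixNano())
        _, err := db.ExecContext(ctx, `BACKUP TO $1`, dest)
        return err
    })
    mvt.InMixedVersion("test features", func(l *logger.Logger, db *gosql.DB) error {
        // One representative feature statement; the real test loops over
        // the versionUpgradeTestFeatures list.
        _, err := db.ExecContext(ctx, `CREATE TABLE IF NOT EXISTS test.t (j JSONB)`)
        return err
    })
    mvt.Run()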
Epic: CRDB-19321 Release note: None --- pkg/cmd/roachtest/tests/versionupgrade.go | 299 ++++++---------------- 1 file changed, 84 insertions(+), 215 deletions(-) diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go index 3e2cd8a4b17c..2274bdb971ed 100644 --- a/pkg/cmd/roachtest/tests/versionupgrade.go +++ b/pkg/cmd/roachtest/tests/versionupgrade.go @@ -21,34 +21,32 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" - "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/version" "github.com/stretchr/testify/require" ) -var ( - v201 = roachpb.Version{Major: 20, Minor: 1} - v202 = roachpb.Version{Major: 20, Minor: 2} -) +type versionFeatureTest struct { + name string + statement string +} -// Feature tests that are invoked between each step of the version upgrade test. -// Tests can use u.clusterVersion to determine which version is active at the -// moment. -// -// A gotcha is that these feature tests are also invoked when the cluster is -// in the middle of upgrading -- i.e. a state where the cluster version has -// already been bumped, but not all nodes are aware). This should be considered -// a feature of this test, and feature tests that flake because of it need to -// be fixed. -var versionUpgradeTestFeatures = versionFeatureStep{ - // NB: the next four tests are ancient and supported since v2.0. However, - // in 19.2 -> 20.1 we had a migration that disallowed most DDL in the - // mixed version state, and so for convenience we gate them on v20.1. - stmtFeatureTest("Object Access", v201, ` +// Feature tests that are invoked in mixed-version state during the +// upgrade test. A gotcha is that these feature tests are also +// invoked when the cluster is in the middle of upgrading -- i.e. a +// state where the cluster version has already been bumped, but not +// all nodes are aware). This should be considered a feature of this +// test, and feature tests that flake because of it need to be fixed. +var versionUpgradeTestFeatures = []versionFeatureTest{ + // NB: the next four tests are ancient and supported since v2.0. + { + name: "ObjectAccess", + statement: ` -- We should be able to successfully select from objects created in ancient -- versions of CRDB using their FQNs. 
Prevents bugs such as #43141, where -- databases created before a migration were inaccessible after the @@ -60,60 +58,78 @@ var versionUpgradeTestFeatures = versionFeatureStep{ -- on CRDB v1.0 select * from persistent_db.persistent_table; show tables from persistent_db; -`), - stmtFeatureTest("JSONB", v201, ` +`, + }, + { + name: "JSONB", + statement: ` CREATE DATABASE IF NOT EXISTS test; CREATE TABLE test.t (j JSONB); DROP TABLE test.t; - `), - stmtFeatureTest("Sequences", v201, ` +`, + }, + { + name: "Sequences", + statement: ` CREATE DATABASE IF NOT EXISTS test; CREATE SEQUENCE test.test_sequence; DROP SEQUENCE test.test_sequence; - `), - stmtFeatureTest("Computed Columns", v201, ` +`, + }, + { + name: "Computed Columns", + statement: ` CREATE DATABASE IF NOT EXISTS test; CREATE TABLE test.t (x INT AS (3) STORED); DROP TABLE test.t; - `), - stmtFeatureTest("Split and Merge Ranges", v202, ` -create database if not EXISTS splitmerge; -create table splitmerge.t (k int primary key); -alter table splitmerge.t split at values (1), (2), (3); -alter table splitmerge.t unsplit at values (1), (2), (3); -drop table splitmerge.t; - `), +`, + }, + { + name: "Split and Merge Ranges", + statement: ` +CREATE DATABASE IF NOT EXISTS splitmerge; +CREATE TABLE splitmerge.t (k INT PRIMARY KEY); +ALTER TABLE splitmerge.t SPLIT AT VALUES (1), (2), (3); +ALTER TABLE splitmerge.t UNSPLIT AT VALUES (1), (2), (3); +DROP TABLE splitmerge.t; +`, + }, } func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { if c.IsLocal() && runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } - predecessorVersion, err := clusterupgrade.PredecessorVersion(*t.BuildVersion()) - if err != nil { - t.Fatal(err) - } - testFeaturesStep := versionUpgradeTestFeatures.step(c.All()) - schemaChangeStep := runSchemaChangeWorkloadStep(c.All().RandNode()[0], 10 /* maxOps */, 2 /* concurrency */) - // TODO(irfansharif): All schema change instances were commented out while - // of #58489 is being addressed. - _ = schemaChangeStep - backupStep := func(ctx context.Context, t test.Test, u *versionUpgradeTest) { + mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All()) + mvt.InMixedVersion("run backup", func(l *logger.Logger, db *gosql.DB) error { // Verify that backups can be created in various configurations. This is // important to test because changes in system tables might cause backups to // fail in mixed-version clusters. dest := fmt.Sprintf("nodelocal://0/%d", timeutil.Now().UnixNano()) - _, err := u.conn(ctx, t, 1).ExecContext(ctx, `BACKUP TO $1`, dest) - require.NoError(t, err) - } + l.Printf("writing backup to %s", dest) + _, err := db.ExecContext(ctx, `BACKUP TO $1`, dest) + return err + }) + mvt.InMixedVersion("test features", func(l *logger.Logger, db *gosql.DB) error { + for _, featureTest := range versionUpgradeTestFeatures { + l.Printf("running feature test %q", featureTest.name) + _, err := db.ExecContext(ctx, featureTest.statement) + if err != nil { + l.Printf("%q: ERROR (%s)", featureTest.name, err) + return err + } + l.Printf("%q: OK", featureTest.name) + } - checkPinnedGCTTLStep := func(ctx context.Context, t test.Test, u *versionUpgradeTest) { + return nil + }) + mvt.AfterUpgradeFinalized("check if GC TTL is pinned", func(l *logger.Logger, db *gosql.DB) error { // TODO(irfansharif): This can be removed when the predecessor version // in this test is v23.1, where the default is 4h. 
This test was only to // make sure that existing clusters that upgrade to 23.1 retained their // existing GC TTL. - t.L().Printf("checking if GC TTL is pinned to 25h") + l.Printf("checking if GC TTL is pinned to 24h") var ttlSeconds int query := ` SELECT @@ -122,86 +138,17 @@ func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { WHERE target = 'RANGE default' LIMIT 1 ` - require.NoError(t, u.conn(ctx, t, 1).QueryRowContext(ctx, query).Scan(&ttlSeconds)) - require.Equal(t, 24*60*60, ttlSeconds) // NB: 24h is what's used in the fixture - } - - // The steps below start a cluster at predecessorVersion (from a fixture), - // then start an upgrade that is rolled back, and finally start and finalize - // the upgrade. Between each step, we run the feature tests defined in - // versionUpgradeTestFeatures. - u := newVersionUpgradeTest(c, - // Start the cluster from a fixture. That fixture's cluster version may - // be at the predecessor version (though in practice it's fully up to - // date, if it was created via the checkpointer above), so add a - // waitForUpgradeStep to make sure we're upgraded all the way before - // moving on. - // - // See the comment on createCheckpoints for details on fixtures. - uploadAndStartFromCheckpointFixture(c.All(), predecessorVersion), - - // lower descriptor lease duration to 1 minute, working around a - // lease leak that can occasionally make this test time out (flake - // rate ~3%). - // - // TODO(renato): remove this call and function definition when - // https://github.com/cockroachdb/cockroach/issues/84382 is - // closed. - lowerLeaseDuration(1), - - uploadAndInitSchemaChangeWorkload(), - waitForUpgradeStep(c.All()), - testFeaturesStep, - - // NB: at this point, cluster and binary version equal predecessorVersion, - // and auto-upgrades are on. + if err := db.QueryRowContext(ctx, query).Scan(&ttlSeconds); err != nil { + return fmt.Errorf("error querying GC TTL: %w", err) + } + expectedTTL := 24 * 60 * 60 // NB: 24h is what's used in the fixture + if ttlSeconds != expectedTTL { + return fmt.Errorf("unexpected GC TTL: actual (%d) != expected (%d)", ttlSeconds, expectedTTL) + } + return nil + }) - // We use an empty string for the version below, which means to use the - // main ./cockroach binary (i.e. the one being tested in this run). - // We upgrade into this version more capriciously to ensure better - // coverage by first rolling the cluster into the new version with - // auto-upgrade disabled, then rolling back, and then rolling forward - // and finalizing on the auto-upgrade path. - preventAutoUpgradeStep(1), - // Roll nodes forward. - binaryUpgradeStep(c.Node(1), ""), - testFeaturesStep, - binaryUpgradeStep(c.Range(2, c.Spec().NodeCount), ""), - // Run a quick schemachange workload in between each upgrade. - // The maxOps is 10 to keep the test runtime under 1-2 minutes. - // schemaChangeStep, - backupStep, - // Roll back again. Note that bad things would happen if the cluster had - // ignored our request to not auto-upgrade. The `autoupgrade` roachtest - // exercises this in more detail, so here we just rely on things working - // as they ought to. - binaryUpgradeStep(c.All(), predecessorVersion), - testFeaturesStep, - // schemaChangeStep, - backupStep, - // Roll nodes forward, this time allowing them to upgrade, and waiting - // for it to happen. 
- binaryUpgradeStep(c.All(), ""), - allowAutoUpgradeStep(1), - testFeaturesStep, - // schemaChangeStep, - backupStep, - waitForUpgradeStep(c.All()), - testFeaturesStep, - // schemaChangeStep, - backupStep, - // Turn tracing on globally to give it a fighting chance at exposing any - // crash-inducing incompatibilities or horrendous memory leaks. (It won't - // catch most memory leaks since this test doesn't run for too long or does - // too much work). Then, run the previous tests again. - enableTracingGloballyStep, - testFeaturesStep, - // schemaChangeStep, - backupStep, - checkPinnedGCTTLStep, - ) - - u.run(ctx, t) + mvt.Run() } func (u *versionUpgradeTest) run(ctx context.Context, t test.Test) { @@ -262,9 +209,7 @@ func uploadVersion( nodes option.NodeListOption, newVersion string, ) string { - path, err := clusterupgrade.UploadVersion( - ctx, t.L(), c, nodes, newVersion, t.Cockroach(), t.VersionsBinaryOverride(), - ) + path, err := clusterupgrade.UploadVersion(ctx, t, t.L(), c, nodes, newVersion) if err != nil { t.Fatal(err) } @@ -284,7 +229,7 @@ func upgradeNodes( newVersion string, ) { if err := clusterupgrade.RestartNodesWithNewBinary( - ctx, t.L(), c, nodes, startOpts, newVersion, t.Cockroach(), t.VersionsBinaryOverride(), + ctx, t, t.L(), c, nodes, startOpts, newVersion, ); err != nil { t.Fatal(err) } @@ -294,19 +239,7 @@ func (u *versionUpgradeTest) binaryVersion( ctx context.Context, t test.Test, i int, ) roachpb.Version { db := u.conn(ctx, t, i) - v, err := clusterupgrade.BinaryVersion(ctx, db) - if err != nil { - t.Fatal(err) - } - - return v -} - -func (u *versionUpgradeTest) clusterVersion( - ctx context.Context, t test.Test, i int, -) roachpb.Version { - db := u.conn(ctx, t, i) - v, err := clusterupgrade.ClusterVersion(ctx, db) + v, err := clusterupgrade.BinaryVersion(db) if err != nil { t.Fatal(err) } @@ -323,16 +256,20 @@ func uploadAndStartFromCheckpointFixture(nodes option.NodeListOption, v string) t.Fatal(err) } binary := uploadVersion(ctx, t, u.c, nodes, v) + startOpts := option.DefaultStartOpts() // NB: can't start sequentially since cluster already bootstrapped. - clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, false /* sequential */) + startOpts.RoachprodOpts.Sequential = false + clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, startOpts) } } func uploadAndStart(nodes option.NodeListOption, v string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { binary := uploadVersion(ctx, t, u.c, nodes, v) + startOpts := option.DefaultStartOpts() // NB: can't start sequentially since cluster already bootstrapped. 
- clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, false /* sequential */) + startOpts.RoachprodOpts.Sequential = false + clusterupgrade.StartWithBinary(ctx, t.L(), u.c, nodes, binary, startOpts) } } @@ -342,34 +279,13 @@ func uploadAndStart(nodes option.NodeListOption, v string) versionStep { func binaryUpgradeStep(nodes option.NodeListOption, newVersion string) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { if err := clusterupgrade.RestartNodesWithNewBinary( - ctx, t.L(), u.c, nodes, option.DefaultStartOpts(), newVersion, t.Cockroach(), t.VersionsBinaryOverride(), + ctx, t, t.L(), u.c, nodes, option.DefaultStartOpts(), newVersion, ); err != nil { t.Fatal(err) } } } -func enableTracingGloballyStep(ctx context.Context, t test.Test, u *versionUpgradeTest) { - db := u.conn(ctx, t, 1) - // NB: this enables net/trace, and as a side effect creates verbose trace spans everywhere. - _, err := db.ExecContext(ctx, `SET CLUSTER SETTING trace.debug.enable = $1`, true) - if err != nil { - t.Fatal(err) - } -} - -// lowerLeaseDuration sets the `sql.catalog.descriptor_lease_duration` -// setting to 1 minute. -func lowerLeaseDuration(node int) versionStep { - return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - db := u.conn(ctx, t, node) - _, err := db.ExecContext(ctx, `SET CLUSTER SETTING sql.catalog.descriptor_lease_duration = '1m'`) - if err != nil { - t.Fatal(err) - } - } -} - func preventAutoUpgradeStep(node int) versionStep { return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { db := u.conn(ctx, t, node) @@ -404,53 +320,6 @@ func waitForUpgradeStep(nodes option.NodeListOption) versionStep { } } -type versionFeatureTest struct { - name string - fn func(context.Context, test.Test, *versionUpgradeTest, option.NodeListOption) (skipped bool) -} - -type versionFeatureStep []versionFeatureTest - -func (vs versionFeatureStep) step(nodes option.NodeListOption) versionStep { - return func(ctx context.Context, t test.Test, u *versionUpgradeTest) { - for _, feature := range vs { - t.L().Printf("checking %s", feature.name) - tBegin := timeutil.Now() - skipped := feature.fn(ctx, t, u, nodes) - dur := fmt.Sprintf("%.2fs", timeutil.Since(tBegin).Seconds()) - if skipped { - t.L().Printf("^-- skip (%s)", dur) - } else { - t.L().Printf("^-- ok (%s)", dur) - } - } - } -} - -func stmtFeatureTest( - name string, minVersion roachpb.Version, stmt string, args ...interface{}, -) versionFeatureTest { - return versionFeatureTest{ - name: name, - fn: func(ctx context.Context, t test.Test, u *versionUpgradeTest, nodes option.NodeListOption) (skipped bool) { - i := nodes.RandNode()[0] - if u.clusterVersion(ctx, t, i).Less(minVersion) { - return true // skipped - } - db := u.conn(ctx, t, i) - if _, err := db.ExecContext(ctx, stmt, args...); err != nil { - if testutils.IsError(err, "no inbound stream connection") && u.clusterVersion(ctx, t, i).Less(v202) { - // This error has been fixed in 20.2+ but may still occur on earlier - // versions. - return true // skipped - } - t.Fatal(err) - } - return false - }, - } -} - // makeVersionFixtureAndFatal creates fixtures from which we can test // mixed-version clusters (i.e. version X mixing with X-1). The fixtures date // back all the way to v1.0; when development begins on version X, we make a