From b588f1f02c4e7bd4a10a08d9b20a914e03751e40 Mon Sep 17 00:00:00 2001
From: Michael Butler
Date: Mon, 6 Mar 2023 12:55:47 -0500
Subject: [PATCH] backupccl: replace restore2TB perf tests

This patch removes the restore2TB* roachtests, which ran a 2TB bank restore
to benchmark restore performance across a few hardware configurations. It
also replaces the `restoreTPCCInc/nodes=10` test, which exercised our
ability to restore a backup with a long incremental chain.

This patch also adds:
1. `restore/tpce/400GB/aws/nodes=4/cpus=16` to measure how per-node
   throughput scales when the per-node vCPU count doubles relative to the
   default.
2. `restore/tpce/400GB/aws/nodes=8/cpus=8` to measure how per-node
   throughput scales when the number of nodes doubles relative to the
   default.
3. `restore/tpce/400GB/aws/backupsIncluded=48/nodes=4/cpus=8` to measure
   restore reliability and performance on a 48-length backup chain relative
   to the default.

A future patch will update the fixtures used in the restore node shutdown
tests and add more perf-based tests.

Fixes #92699

Release note: None
---
 pkg/cmd/roachtest/tests/restore.go | 212 ++++-------------------------
 1 file changed, 28 insertions(+), 184 deletions(-)

diff --git a/pkg/cmd/roachtest/tests/restore.go b/pkg/cmd/roachtest/tests/restore.go
index 1ac899e7e5ca..9b318ba4ddd0 100644
--- a/pkg/cmd/roachtest/tests/restore.go
+++ b/pkg/cmd/roachtest/tests/restore.go
@@ -328,193 +328,11 @@ func registerRestoreNodeShutdown(r registry.Registry) {
 	})
 }
 
-type testDataSet interface {
-	name() string
-	// runRestore does any setup that's required and restores the dataset into
-	// the given cluster. Any setup shouldn't take a long amount of time since
-	// perf artifacts are based on how long this takes.
-	runRestore(ctx context.Context, c cluster.Cluster)
-
-	// runRestoreDetached is like runRestore but runs the RESTORE WITH detahced,
-	// and returns the job ID.
-	runRestoreDetached(ctx context.Context, t test.Test, c cluster.Cluster) (jobspb.JobID, error)
-}
-
-type dataBank2TB struct{}
-
-func (dataBank2TB) name() string {
-	return "2TB"
-}
-
-func (dataBank2TB) runRestore(ctx context.Context, c cluster.Cluster) {
-	c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "CREATE DATABASE restore2tb"`)
-	c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
-				RESTORE csv.bank FROM
-				'gs://cockroach-fixtures/workload/bank/version=1.0.0,payload-bytes=10240,ranges=0,rows=65104166,seed=1/bank?AUTH=implicit'
-				WITH into_db = 'restore2tb'"`)
-}
-
-func (dataBank2TB) runRestoreDetached(
-	ctx context.Context, t test.Test, c cluster.Cluster,
-) (jobspb.JobID, error) {
-	c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "CREATE DATABASE restore2tb"`)
-	c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
-				RESTORE csv.bank FROM
-				'gs://cockroach-fixtures/workload/bank/version=1.0.0,payload-bytes=10240,ranges=0,rows=65104166,seed=1/bank?AUTH=implicit'
-				WITH into_db = 'restore2tb', detached"`)
-	db, err := c.ConnE(ctx, t.L(), c.Node(1)[0])
-	if err != nil {
-		return 0, errors.Wrap(err, "failed to connect to node 1; running restore detached")
-	}
-
-	var jobID jobspb.JobID
-	if err := db.QueryRow(`SELECT job_id FROM [SHOW JOBS] WHERE job_type = 'RESTORE'`).Scan(&jobID); err != nil {
-		return 0, err
-	}
-
-	return jobID, nil
-}
-
-var _ testDataSet = dataBank2TB{}
-
-type tpccIncData struct{}
-
-func (tpccIncData) name() string {
-	return "TPCCInc"
-}
-
-func (tpccIncData) runRestore(ctx context.Context, c cluster.Cluster) {
-	// This data set restores a 1.80TB (replicated) backup consisting of 48
-	// incremental backup layers taken every 15 minutes. 8000 warehouses were
-	// imported and then a workload of 1000 warehouses was run against the cluster
-	// while the incremental backups were being taken.
-	c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
-				RESTORE FROM '2022/09/29-000000.00' IN
-				'gs://cockroach-fixtures/backups/tpcc/rev-history=false,inc-count=48,cluster/8000-warehouses/22.2.0-alpha.4?AUTH=implicit'
-				AS OF SYSTEM TIME '2022-09-28 23:42:00'"`)
-}
-
-func (tpccIncData) runRestoreDetached(
-	ctx context.Context, t test.Test, c cluster.Cluster,
-) (jobspb.JobID, error) {
-	c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "
-				RESTORE FROM '/2022/09/07-000000.00' IN
-				'gs://cockroach-fixtures/tpcc-incrementals-22.2?AUTH=implicit'
-				AS OF SYSTEM TIME '2022-09-07 12:15:00'"
-				WITH detached"`)
-	db, err := c.ConnE(ctx, t.L(), c.Node(1)[0])
-	if err != nil {
-		return 0, errors.Wrap(err, "failed to connect to node 1; running restore detached")
-	}
-
-	var jobID jobspb.JobID
-	if err := db.QueryRow(`SELECT job_id FROM [SHOW JOBS] WHERE job_type = 'RESTORE'`).Scan(&jobID); err != nil {
-		return 0, err
-	}
-
-	return jobID, nil
-}
-
 func registerRestore(r registry.Registry) {
-	// TODO(msbutler): delete the tests created by the loop below. Specifically
-	// - restore2TB/nodes=10
-	// - restore2TB/nodes=32
-	// - restore2TB/nodes=6/cpus=8/pd-volume=2500GB
-	largeVolumeSize := 2500 // the size in GB of disks in large volume configs
-	for _, item := range []struct {
-		nodes        int
-		cpus         int
-		largeVolumes bool
-		dataSet      testDataSet
-
-		timeout time.Duration
-	}{
-		{dataSet: dataBank2TB{}, nodes: 10, timeout: 6 * time.Hour},
-		{dataSet: dataBank2TB{}, nodes: 32, timeout: 3 * time.Hour},
-		{dataSet: dataBank2TB{}, nodes: 6, timeout: 4 * time.Hour, cpus: 8, largeVolumes: true},
-		{dataSet: tpccIncData{}, nodes: 10, timeout: 6 * time.Hour},
-	} {
-		item := item
-		clusterOpts := make([]spec.Option, 0)
-		testName := fmt.Sprintf("restore%s/nodes=%d", item.dataSet.name(), item.nodes)
-		if item.cpus != 0 {
-			clusterOpts = append(clusterOpts, spec.CPU(item.cpus))
-			testName += fmt.Sprintf("/cpus=%d", item.cpus)
-		}
-		if item.largeVolumes {
-			clusterOpts = append(clusterOpts, spec.VolumeSize(largeVolumeSize))
-			testName += fmt.Sprintf("/pd-volume=%dGB", largeVolumeSize)
-		}
-		// Has been seen to OOM: https://github.com/cockroachdb/cockroach/issues/71805
-		clusterOpts = append(clusterOpts, spec.HighMem(true))
-
-		r.Add(registry.TestSpec{
-			Name:              testName,
-			Owner:             registry.OwnerDisasterRecovery,
-			Cluster:           r.MakeClusterSpec(item.nodes, clusterOpts...),
-			Timeout:           item.timeout,
-			EncryptionSupport: registry.EncryptionMetamorphic,
-			Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-				c.Put(ctx, t.Cockroach(), "./cockroach")
-				c.Start(ctx, t.L(), option.DefaultStartOptsNoBackups(), install.MakeClusterSettings())
-				m := c.NewMonitor(ctx)
-
-				// Run the disk usage logger in the monitor to guarantee its
-				// having terminated when the test ends.
-				dul := NewDiskUsageLogger(t, c)
-				m.Go(dul.Runner)
-				hc := NewHealthChecker(t, c, c.All())
-				m.Go(hc.Runner)
-
-				// TODO(peter): This currently causes the test to fail because we see a
-				// flurry of valid merges when the restore finishes.
-				//
-				// m.Go(func(ctx context.Context) error {
-				//	// Make sure the merge queue doesn't muck with our restore.
-				//	return verifyMetrics(ctx, c, map[string]float64{
-				//		"cr.store.queue.merge.process.success": 10,
-				//		"cr.store.queue.merge.process.failure": 10,
-				//	})
-				// })
-
-				tick, perfBuf := initBulkJobPerfArtifacts(testName, item.timeout)
-				m.Go(func(ctx context.Context) error {
-					defer dul.Done()
-					defer hc.Done()
-					t.Status(`running restore`)
-					// Tick once before starting the restore, and once after to
-					// capture the total elapsed time. This is used by
-					// roachperf to compute and display the average MB/sec per
-					// node.
-					if item.cpus >= 8 {
-						// If the nodes are large enough (specifically, if they
-						// have enough memory we can increase the parallelism
-						// of restore). Machines with 16 vCPUs typically have
-						// enough memory to support 3 concurrent workers.
-						c.Run(ctx, c.Node(1),
-							`./cockroach sql --insecure -e "SET CLUSTER SETTING kv.bulk_io_write.restore_node_concurrency = 5"`)
-						c.Run(ctx, c.Node(1),
-							`./cockroach sql --insecure -e "SET CLUSTER SETTING kv.bulk_io_write.concurrent_addsstable_requests = 5"`)
-					}
-					tick()
-					item.dataSet.runRestore(ctx, c)
-					tick()
-
-					// Upload the perf artifacts to any one of the nodes so that the test
-					// runner copies it into an appropriate directory path.
-					dest := filepath.Join(t.PerfArtifactsDir(), "stats.json")
-					if err := c.RunE(ctx, c.Node(1), "mkdir -p "+filepath.Dir(dest)); err != nil {
-						log.Errorf(ctx, "failed to create perf dir: %+v", err)
-					}
-					if err := c.PutString(ctx, perfBuf.String(), dest, 0755, c.Node(1)); err != nil {
-						log.Errorf(ctx, "failed to upload perf artifacts to node: %s", err.Error())
-					}
-					return nil
-				})
-				m.Wait()
-			},
-		})
-	}
 
 	durationGauge := r.PromFactory().NewGaugeVec(prometheus.GaugeOpts{Namespace: registry.
 		PrometheusNameSpace, Subsystem: "restore", Name: "duration"}, []string{"test_name"})
@@ -692,6 +510,28 @@ func registerRestore(r registry.Registry) {
 			timeout: 1 * time.Hour,
 		},
 		{
+			// Benchmarks whether per-node throughput remains constant when the
+			// number of nodes doubles relative to the default.
+			hardware: makeHardwareSpecs(hardwareSpecs{nodes: 8}),
+			backup:   makeBackupSpecs(backupSpecs{}),
+			timeout:  1 * time.Hour,
+		},
+		{
+			// Benchmarks whether per-node throughput doubles when the vCPU count
+			// doubles relative to the default.
+			hardware: makeHardwareSpecs(hardwareSpecs{cpus: 16}),
+			backup:   makeBackupSpecs(backupSpecs{}),
+			timeout:  1 * time.Hour,
+		},
+		{
+			// Ensures we can restore a 48-length incremental chain.
+			// Also benchmarks per-node throughput for a long chain.
+			hardware: makeHardwareSpecs(hardwareSpecs{}),
+			backup:   makeBackupSpecs(backupSpecs{backupsIncluded: 48}),
+			timeout:  1 * time.Hour,
+		},
+		{
+			// The nightly 8TB Restore test.
 			hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10, volumeSize: 2000}),
 			backup: makeBackupSpecs(backupSpecs{
 				version: "v22.2.1",
@@ -699,6 +539,7 @@ func registerRestore(r registry.Registry) {
 			timeout: 5 * time.Hour,
 		},
 		{
+			// The weekly 32TB Restore test.
 			hardware: makeHardwareSpecs(hardwareSpecs{nodes: 15, cpus: 16, volumeSize: 5000}),
 			backup: makeBackupSpecs(backupSpecs{
 				version: "v22.2.1",
@@ -708,9 +549,7 @@ func registerRestore(r registry.Registry) {
 		},
 		// TODO(msbutler): add the following tests once roachperf/grafana is hooked up and old tests are
 		// removed:
-		// - restore/tpce/400GB/nodes=10
 		// - restore/tpce/400GB/nodes=30
-		// - restore/tpce/400GB/cpu=16
 		// - restore/tpce/400GB/encryption
 	} {
 		sp := sp
@@ -909,9 +748,14 @@ func makeBackupSpecs(override backupSpecs) backupSpecs {
 		specs.fullBackupDir = override.fullBackupDir
 	}
 
+	if override.backupsIncluded != 0 {
+		specs.backupsIncluded = override.backupsIncluded
+	}
+
 	if override.workload != nil {
 		specs.workload = override.workload
 	}
+
 	return specs
 }
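
The new backupsIncluded test relies on the spec-override pattern extended in
the final hunk: makeBackupSpecs starts from a set of defaults and replaces
only the fields that an override sets to a non-zero value. Below is a
minimal, self-contained sketch of that pattern; the field names follow the
diff, but the default values (version, backupsIncluded, timeout) are
illustrative assumptions rather than the actual roachtest defaults.

package main

import (
	"fmt"
	"time"
)

// backupSpecs mirrors the override pattern in makeBackupSpecs above: a
// struct of defaults in which only explicitly set (non-zero) fields are
// replaced. Defaults here are assumptions for illustration.
type backupSpecs struct {
	version         string
	backupsIncluded int
	timeout         time.Duration
}

// makeBackupSpecs starts from the assumed defaults and applies any non-zero
// override fields, as the patched makeBackupSpecs does for backupsIncluded.
func makeBackupSpecs(override backupSpecs) backupSpecs {
	specs := backupSpecs{
		version:         "v22.2.1", // assumed default, matching the fixture version in the diff
		backupsIncluded: 1,         // assumed default chain length
		timeout:         time.Hour, // assumed default timeout
	}
	if override.version != "" {
		specs.version = override.version
	}
	if override.backupsIncluded != 0 {
		specs.backupsIncluded = override.backupsIncluded
	}
	if override.timeout != 0 {
		specs.timeout = override.timeout
	}
	return specs
}

func main() {
	// Zero-valued override: every default survives.
	fmt.Printf("%+v\n", makeBackupSpecs(backupSpecs{}))
	// Long-chain override: only backupsIncluded changes, mirroring the new
	// restore/tpce/400GB/aws/backupsIncluded=48/nodes=4/cpus=8 test.
	fmt.Printf("%+v\n", makeBackupSpecs(backupSpecs{backupsIncluded: 48}))
}

A note on the design choice: zero values double as "not set", which keeps
call sites like makeBackupSpecs(backupSpecs{backupsIncluded: 48}) terse, at
the cost of making a deliberate zero-valued override unrepresentable.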