Skip to content

Commit

Permalink
roachtest: surface cloud cluster spec info in artifacts
Browse files Browse the repository at this point in the history
Previously, getting the spec of the VMs on which a roachtest
ran was tricky to derive.
This was inadequate because it's often necessary to understand
the particulars of a roachtest run's environment.
To address this, this patch creates a JSON file per VM describing
its spec. These files are stored in the test's artifacts directory.

Epic: none
Fixes: #112707
Release note: None
  • Loading branch information
vidit-bhat committed May 17, 2024
1 parent d883247 commit 5021905
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 3 deletions.
56 changes: 56 additions & 0 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"bytes"
"context"
gosql "database/sql"
"encoding/json"
"fmt"
"io"
"io/fs"
Expand Down Expand Up @@ -1393,6 +1394,61 @@ func (c *clusterImpl) FetchDebugZip(
})
}

// FetchVMSpecs asks each cloud provider backing the cluster for the specs of
// its VMs and writes one indented JSON file per VM, named "<vm name>.json",
// into the "vm_specs" subdirectory of the test's artifacts dir.
//
// It is a no-op for local clusters. Collection is best-effort: a failure for
// one provider or one VM is logged and skipped. An error is returned only if
// the output directory cannot be created, the cached cluster cannot be looked
// up, or the context is done before all providers were processed.
func (c *clusterImpl) FetchVMSpecs(ctx context.Context, l *logger.Logger) error {
	if c.IsLocal() {
		return nil
	}

	l.Printf("fetching VM specs\n")
	c.status("fetching VM specs")

	vmSpecsFolder := filepath.Join(c.t.ArtifactsDir(), "vm_specs")
	if err := os.MkdirAll(vmSpecsFolder, 0755); err != nil {
		return err
	}

	// Don't hang forever if we can't fetch the specs.
	return timeutil.RunWithTimeout(ctx, "fetch VM specs", 5*time.Minute, func(ctx context.Context) error {
		cachedCluster, err := getCachedCluster(c.name)
		if err != nil {
			return err
		}
		providerToVMs := bucketVMsByProvider(cachedCluster)

		for provider, vms := range providerToVMs {
			// GetVMSpecs does not take a context, so the timeout can only be
			// observed between providers.
			if err := ctx.Err(); err != nil {
				return err
			}

			// Guard against a provider that is not registered; calling a
			// method on the resulting nil interface would panic.
			p, ok := vm.Providers[provider]
			if !ok || p == nil {
				l.Errorf("failed to get VM spec for provider %s: provider is not registered", provider)
				continue
			}

			vmSpecs, err := p.GetVMSpecs(l, vms)
			if err != nil {
				l.Errorf("failed to get VM spec for provider %s: %s", provider, err)
				continue
			}
			for _, vmSpec := range vmSpecs {
				// The file name is derived from the spec's "name" field; a
				// spec without a string name cannot be written out.
				name, ok := vmSpec["name"].(string)
				if !ok {
					l.Printf("failed to create spec files for some VMs")
					continue
				}

				dest := filepath.Join(vmSpecsFolder, name+".json")
				specJSON, err := json.MarshalIndent(vmSpec, "", " ")
				if err != nil {
					l.Printf("Failed to marshal JSON: %v\n", err)
					continue
				}

				if err := os.WriteFile(dest, specJSON, 0644); err != nil {
					l.Printf("Failed to write spec to file for %s: %v\n", name, err)
					continue
				}
			}
		}
		return nil
	})
}

// checkNoDeadNode returns an error if at least one of the nodes that have a populated
// data dir are found to be not running. It prints both to t.L() and the test
// output.
Expand Down
7 changes: 5 additions & 2 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -981,7 +981,7 @@ func getGoCoverArtifacts(ctx context.Context, c *clusterImpl, t test.Test) {
// this returns. This happens when the test doesn't respond to cancellation.
//
// Args:
// c: The cluster on which the test will run. runTest() does not wipe or destroy the cluster.
// c: The cluster on which the test will run. runTest() does not wipe or destroy the cluster.
func (r *testRunner) runTest(
ctx context.Context,
t *testImpl,
Expand Down Expand Up @@ -1476,7 +1476,7 @@ func (r *testRunner) teardownTest(
func (r *testRunner) collectArtifacts(
ctx context.Context, t *testImpl, c *clusterImpl, timedOut bool, timeout time.Duration,
) error {
// Collecting artifacts may hang so we run it in a goroutine which is abandoned
// Collecting artifacts may hang, so we run it in a goroutine which is abandoned
// after a timeout.
artifactsCollectedCh := make(chan struct{})
_ = r.stopper.RunAsyncTask(ctx, "collect-artifacts", func(ctx context.Context) {
Expand Down Expand Up @@ -1562,6 +1562,9 @@ func (r *testRunner) collectArtifacts(
if err := c.FetchDebugZip(ctx, t.L(), "debug.zip"); err != nil {
t.L().Printf("failed to collect zip: %s", err)
}
if err := c.FetchVMSpecs(ctx, t.L()); err != nil {
t.L().Printf("failed to collect VM specs: %s", err)
}
})

select {
Expand Down
4 changes: 4 additions & 0 deletions pkg/roachprod/vm/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ func (p *Provider) GetHostErrorVMs(
return nil, nil
}

// GetVMSpecs is a stub for the AWS provider: it ignores its arguments and
// returns no specs and no error.
func (p *Provider) GetVMSpecs(
	l *logger.Logger, vms vm.List,
) ([]map[string]interface{}, error) {
	return nil, nil
}

const (
defaultSSDMachineType = "m6id.xlarge"
defaultMachineType = "m6i.xlarge"
Expand Down
4 changes: 4 additions & 0 deletions pkg/roachprod/vm/azure/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ func (p *Provider) GetHostErrorVMs(
return nil, nil
}

// GetVMSpecs is a stub for the Azure provider: it ignores its arguments and
// returns no specs and no error.
func (p *Provider) GetVMSpecs(
	l *logger.Logger, vms vm.List,
) ([]map[string]interface{}, error) {
	return nil, nil
}

func (p *Provider) CreateVolumeSnapshot(
l *logger.Logger, volume vm.Volume, vsco vm.VolumeSnapshotCreateOpts,
) (vm.VolumeSnapshot, error) {
Expand Down
4 changes: 4 additions & 0 deletions pkg/roachprod/vm/flagstub/flagstub.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ func (p *provider) GetHostErrorVMs(
return nil, nil
}

// GetVMSpecs is a stub for the flag-stub provider: it ignores its arguments
// and returns no specs and no error.
func (p *provider) GetVMSpecs(
	l *logger.Logger, vms vm.List,
) ([]map[string]interface{}, error) {
	return nil, nil
}

func (p *provider) CreateVolumeSnapshot(
l *logger.Logger, volume vm.Volume, vsco vm.VolumeSnapshotCreateOpts,
) (vm.VolumeSnapshot, error) {
Expand Down
24 changes: 24 additions & 0 deletions pkg/roachprod/vm/gce/gcloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,30 @@ func (p *Provider) GetHostErrorVMs(
return hostErrorVMs, nil
}

// GetVMSpecs returns one spec per VM in vms, in the same order as vms. Each
// spec is the JSON output of `compute instances describe` for that instance
// (run through runJSONCommand with --format=json), decoded into a map.
//
// It returns an error if the provider's project is empty or if vms is nil.
// If describing any instance fails, the failure is logged to l and returned,
// and no specs are returned.
func (p *Provider) GetVMSpecs(l *logger.Logger, vms vm.List) ([]map[string]interface{}, error) {
	project := p.GetProject()
	switch {
	case project == "":
		return nil, errors.New("project name cannot be empty")
	case vms == nil:
		return nil, errors.New("vms cannot be nil")
	}

	// Describe each instance by its fully qualified resource name.
	var specs []map[string]interface{}
	for _, instance := range vms {
		resource := "projects/" + project + "/zones/" + instance.Zone + "/instances/" + instance.Name

		var spec map[string]interface{}
		err := runJSONCommand(
			[]string{"compute", "instances", "describe", resource, "--format=json"}, &spec,
		)
		if err != nil {
			l.Errorf("Error describing instance %s in zone %s: %v", instance.Name, instance.Zone, err)
			return nil, err
		}
		specs = append(specs, spec)
	}
	return specs, nil
}

func buildFilterCliArgs(
vms vm.List, projectName string, since time.Time, filter string,
) ([]string, error) {
Expand Down
4 changes: 4 additions & 0 deletions pkg/roachprod/vm/local/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ func (p *Provider) GetHostErrorVMs(
return nil, nil
}

// GetVMSpecs is a stub for the local provider: it ignores its arguments and
// returns no specs and no error.
func (p *Provider) GetVMSpecs(
	l *logger.Logger, vms vm.List,
) ([]map[string]interface{}, error) {
	return nil, nil
}

func (p *Provider) CreateVolumeSnapshot(
l *logger.Logger, volume vm.Volume, vsco vm.VolumeSnapshotCreateOpts,
) (vm.VolumeSnapshot, error) {
Expand Down
4 changes: 3 additions & 1 deletion pkg/roachprod/vm/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,8 +508,10 @@ type Provider interface {
// GetPreemptedSpotVMs returns a list of Spot VMs that were preempted since the time specified.
// Returns nil, nil when SupportsSpotVMs() is false.
GetPreemptedSpotVMs(l *logger.Logger, vms List, since time.Time) ([]PreemptedVM, error)
// GetHostErrorVMs returns a list of Spot VMs that had host error since the time specified.
// GetHostErrorVMs returns a list of VMs that had host error since the time specified.
GetHostErrorVMs(l *logger.Logger, vms List, since time.Time) ([]string, error)
// GetVMSpecs returns a list of VM specs, one JSON object (as a map) per VM.
GetVMSpecs(l *logger.Logger, vms List) ([]map[string]interface{}, error)

// CreateLoadBalancer creates a load balancer, for a specific port, that
// delegates to the given cluster.
Expand Down

0 comments on commit 5021905

Please sign in to comment.