Skip to content

Commit 522b630

Browse files
authored
debug: add version constraint to avoid pprof panic (#12807)
1 parent 9dccbb1 commit 522b630

File tree

3 files changed

+134
-21
lines changed

3 files changed

+134
-21
lines changed

.changelog/12807.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:improvement
2+
cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2, 0.11.1, and 0.11.0
3+
```

command/operator_debug.go

+101-21
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
"github.com/hashicorp/go-cleanhttp"
2525
"github.com/hashicorp/go-multierror"
26+
goversion "github.com/hashicorp/go-version"
2627
"github.com/hashicorp/nomad/api"
2728
"github.com/hashicorp/nomad/api/contexts"
2829
"github.com/hashicorp/nomad/helper"
@@ -54,14 +55,17 @@ type OperatorDebugCommand struct {
5455
cancel context.CancelFunc
5556
opts *api.QueryOptions
5657
verbose bool
58+
members *api.ServerMembers
59+
nodes []*api.NodeListStub
5760
}
5861

5962
const (
60-
userAgent = "nomad operator debug"
61-
clusterDir = "cluster"
62-
clientDir = "client"
63-
serverDir = "server"
64-
intervalDir = "interval"
63+
userAgent = "nomad operator debug"
64+
clusterDir = "cluster"
65+
clientDir = "client"
66+
serverDir = "server"
67+
intervalDir = "interval"
68+
// minimumVersionPprofConstraint is, despite its name, a range rather than
// a minimum: it is the version constraint passed to checkVersion before
// pprof collection, and checkVersion returns an error for any version
// that satisfies it (0.11.0 through 0.11.2 inclusive).
minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
6569
)
6670

6771
func (c *OperatorDebugCommand) Help() string {
@@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int {
502506
AuthToken: c.Meta.token,
503507
}
504508

509+
// Get complete list of client nodes
510+
c.nodes, _, err = client.Nodes().List(c.queryOpts())
511+
if err != nil {
512+
c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err))
513+
return 1
514+
}
515+
516+
// Write nodes to file
517+
c.writeJSON(clusterDir, "nodes.json", c.nodes, err)
518+
505519
// Search all nodes if a node class is specified without a list of node ID prefixes
506520
if c.nodeClass != "" && nodeIDs == "" {
507521
nodeIDs = "all"
@@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
565579
}
566580

567581
// Resolve servers
568-
members, err := client.Agent().MembersOpts(c.queryOpts())
582+
c.members, err = client.Agent().MembersOpts(c.queryOpts())
569583
if err != nil {
570584
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
571585
return 1
572586
}
573587

574588
// Write complete list of server members to file
575-
c.writeJSON(clusterDir, "members.json", members, err)
589+
c.writeJSON(clusterDir, "members.json", c.members, err)
576590

577591
// Filter for servers matching criteria
578-
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
592+
c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region)
579593
if err != nil {
580594
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
581595
return 1
@@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
584598
serversFound := 0
585599
serverCaptureCount := 0
586600

587-
if members != nil {
588-
serversFound = len(members.Members)
601+
if c.members != nil {
602+
serversFound = len(c.members.Members)
589603
}
590604
if c.serverIDs != nil {
591605
serverCaptureCount = len(c.serverIDs)
@@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
900914

901915
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
902916

917+
pprofNodeIDs := []string{}
918+
pprofServerIDs := []string{}
919+
920+
// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
921+
for _, serverID := range c.serverIDs {
922+
version := c.getNomadVersion(serverID, "")
923+
err := checkVersion(version, minimumVersionPprofConstraint)
924+
if err != nil {
925+
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
926+
}
927+
pprofServerIDs = append(pprofServerIDs, serverID)
928+
}
929+
930+
for _, nodeID := range c.nodeIDs {
931+
version := c.getNomadVersion("", nodeID)
932+
err := checkVersion(version, minimumVersionPprofConstraint)
933+
if err != nil {
934+
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
935+
}
936+
pprofNodeIDs = append(pprofNodeIDs, nodeID)
937+
}
938+
903939
// Take the first set of pprofs synchronously...
904940
c.Ui.Output(" Capture pprofInterval 0000")
905-
c.collectPprofs(client, 0)
941+
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0)
906942
if c.pprofInterval == c.pprofDuration {
907943
return
908944
}
@@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
921957
return
922958
case <-timer.C:
923959
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
924-
c.collectPprofs(client, pprofIntervalCount)
960+
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount)
925961
timer.Reset(c.pprofInterval)
926962
pprofIntervalCount++
927963
}
@@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
930966
}
931967

932968
// collectPprofs captures the /agent/pprof profiles for one capture interval.
// It calls collectPprof once for every ID in nodeIDs (with clientDir as the
// path) and then once for every ID in serverIDs (with serverDir as the path),
// passing the API client and the interval number through unchanged.
933-
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
934-
for _, n := range c.nodeIDs {
969+
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) {
970+
for _, n := range nodeIDs {
935971
c.collectPprof(clientDir, n, client, interval)
936972
}
937973

938-
for _, n := range c.serverIDs {
974+
for _, n := range serverIDs {
939975
c.collectPprof(serverDir, n, client, interval)
940976
}
941977
}
@@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client,
9871023
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
9881024
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
9891025
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads
990-
991-
// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
992-
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives
993-
994-
// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
995-
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
9961026
}
9971027

9981028
// savePprofProfile retrieves a pprof profile and writes to disk
@@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool {
17141744
const redirectErr string = `invalid character '<' looking for beginning of value`
17151745
return strings.Contains(err.Error(), redirectErr)
17161746
}
1747+
1748+
// getNomadVersion fetches the version of Nomad running on a given server/client node ID
1749+
func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string {
1750+
if serverID == "" && nodeID == "" {
1751+
return ""
1752+
}
1753+
1754+
version := ""
1755+
if serverID != "" {
1756+
for _, server := range c.members.Members {
1757+
// Raft v2 server
1758+
if server.Name == serverID {
1759+
version = server.Tags["build"]
1760+
}
1761+
1762+
// Raft v3 server
1763+
if server.Tags["id"] == serverID {
1764+
version = server.Tags["version"]
1765+
}
1766+
}
1767+
}
1768+
1769+
if nodeID != "" {
1770+
for _, node := range c.nodes {
1771+
if node.ID == nodeID {
1772+
version = node.Version
1773+
}
1774+
}
1775+
}
1776+
1777+
return version
1778+
}
1779+
1780+
// checkVersion verifies that version satisfies the constraint
1781+
func checkVersion(version string, versionConstraint string) error {
1782+
v, err := goversion.NewVersion(version)
1783+
if err != nil {
1784+
return fmt.Errorf("error: %v", err)
1785+
}
1786+
1787+
c, err := goversion.NewConstraint(versionConstraint)
1788+
if err != nil {
1789+
return fmt.Errorf("error: %v", err)
1790+
}
1791+
1792+
if !c.Check(v) {
1793+
return nil
1794+
}
1795+
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint)
1796+
}

command/operator_debug_test.go

+30
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) {
578578
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
579579
}
580580

581+
// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
582+
// filtered by the version constraint.
583+
func TestDebug_PprofVersionCheck(t *testing.T) {
584+
cases := []struct {
585+
version string
586+
errMsg string
587+
}{
588+
{"0.8.7", ""},
589+
{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
590+
{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
591+
{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
592+
{"0.11.3", ""},
593+
{"0.11.3+ent", ""},
594+
{"0.12.0", ""},
595+
{"1.3.0", ""},
596+
{"foo.bar", "error: Malformed version: foo.bar"},
597+
}
598+
599+
for _, tc := range cases {
600+
t.Run(tc.version, func(t *testing.T) {
601+
err := checkVersion(tc.version, minimumVersionPprofConstraint)
602+
if tc.errMsg == "" {
603+
require.NoError(t, err, "expected no error from %s", tc.version)
604+
} else {
605+
require.EqualError(t, err, tc.errMsg)
606+
}
607+
})
608+
}
609+
}
610+
581611
func TestDebug_StringToSlice(t *testing.T) {
582612
ci.Parallel(t)
583613

0 commit comments

Comments
 (0)