@@ -23,6 +23,7 @@ import (
23
23
24
24
"github.com/hashicorp/go-cleanhttp"
25
25
"github.com/hashicorp/go-multierror"
26
+ goversion "github.com/hashicorp/go-version"
26
27
"github.com/hashicorp/nomad/api"
27
28
"github.com/hashicorp/nomad/api/contexts"
28
29
"github.com/hashicorp/nomad/helper"
@@ -54,14 +55,17 @@ type OperatorDebugCommand struct {
54
55
cancel context.CancelFunc
55
56
opts * api.QueryOptions
56
57
verbose bool
58
+ members * api.ServerMembers
59
+ nodes []* api.NodeListStub
57
60
}
58
61
59
62
const (
	// userAgent is the user agent string for this command.
	userAgent = "nomad operator debug"

	// Sub-directory names used when writing captured data.
	clusterDir  = "cluster"
	clientDir   = "client"
	serverDir   = "server"
	intervalDir = "interval"

	// minimumVersionPprofConstraint is the range of Nomad versions for which
	// pprof capture must be skipped: the threadcreate profile causes a panic
	// on Nomad 0.11.0 through 0.11.2. Despite the name it is a deny-range,
	// not a minimum: a version that MATCHES this constraint is unsupported.
	minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
)
66
70
67
71
func (c * OperatorDebugCommand ) Help () string {
@@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int {
502
506
AuthToken : c .Meta .token ,
503
507
}
504
508
509
+ // Get complete list of client nodes
510
+ c .nodes , _ , err = client .Nodes ().List (c .queryOpts ())
511
+ if err != nil {
512
+ c .Ui .Error (fmt .Sprintf ("Error querying node info: %v" , err ))
513
+ return 1
514
+ }
515
+
516
+ // Write nodes to file
517
+ c .writeJSON (clusterDir , "nodes.json" , c .nodes , err )
518
+
505
519
// Search all nodes if a node class is specified without a list of node ID prefixes
506
520
if c .nodeClass != "" && nodeIDs == "" {
507
521
nodeIDs = "all"
@@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
565
579
}
566
580
567
581
// Resolve servers
568
- members , err : = client .Agent ().MembersOpts (c .queryOpts ())
582
+ c . members , err = client .Agent ().MembersOpts (c .queryOpts ())
569
583
if err != nil {
570
584
c .Ui .Error (fmt .Sprintf ("Failed to retrieve server list; err: %v" , err ))
571
585
return 1
572
586
}
573
587
574
588
// Write complete list of server members to file
575
- c .writeJSON (clusterDir , "members.json" , members , err )
589
+ c .writeJSON (clusterDir , "members.json" , c . members , err )
576
590
577
591
// Filter for servers matching criteria
578
- c .serverIDs , err = filterServerMembers (members , serverIDs , c .region )
592
+ c .serverIDs , err = filterServerMembers (c . members , serverIDs , c .region )
579
593
if err != nil {
580
594
c .Ui .Error (fmt .Sprintf ("Failed to parse server list; err: %v" , err ))
581
595
return 1
@@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
584
598
serversFound := 0
585
599
serverCaptureCount := 0
586
600
587
- if members != nil {
588
- serversFound = len (members .Members )
601
+ if c . members != nil {
602
+ serversFound = len (c . members .Members )
589
603
}
590
604
if c .serverIDs != nil {
591
605
serverCaptureCount = len (c .serverIDs )
@@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
900
914
901
915
func (c * OperatorDebugCommand ) collectPeriodicPprofs (client * api.Client ) {
902
916
917
+ pprofNodeIDs := []string {}
918
+ pprofServerIDs := []string {}
919
+
920
+ // threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
921
+ for _ , serverID := range c .serverIDs {
922
+ version := c .getNomadVersion (serverID , "" )
923
+ err := checkVersion (version , minimumVersionPprofConstraint )
924
+ if err != nil {
925
+ c .Ui .Warn (fmt .Sprintf ("Skipping pprof: %v" , err ))
926
+ }
927
+ pprofServerIDs = append (pprofServerIDs , serverID )
928
+ }
929
+
930
+ for _ , nodeID := range c .nodeIDs {
931
+ version := c .getNomadVersion ("" , nodeID )
932
+ err := checkVersion (version , minimumVersionPprofConstraint )
933
+ if err != nil {
934
+ c .Ui .Warn (fmt .Sprintf ("Skipping pprof: %v" , err ))
935
+ }
936
+ pprofNodeIDs = append (pprofNodeIDs , nodeID )
937
+ }
938
+
903
939
// Take the first set of pprofs synchronously...
904
940
c .Ui .Output (" Capture pprofInterval 0000" )
905
- c .collectPprofs (client , 0 )
941
+ c .collectPprofs (client , pprofServerIDs , pprofNodeIDs , 0 )
906
942
if c .pprofInterval == c .pprofDuration {
907
943
return
908
944
}
@@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
921
957
return
922
958
case <- timer .C :
923
959
c .Ui .Output (fmt .Sprintf (" Capture pprofInterval %04d" , pprofIntervalCount ))
924
- c .collectPprofs (client , pprofIntervalCount )
960
+ c .collectPprofs (client , pprofServerIDs , pprofNodeIDs , pprofIntervalCount )
925
961
timer .Reset (c .pprofInterval )
926
962
pprofIntervalCount ++
927
963
}
@@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
930
966
}
931
967
932
968
// collectPprofs captures the /agent/pprof for each listed node
933
- func (c * OperatorDebugCommand ) collectPprofs (client * api.Client , interval int ) {
934
- for _ , n := range c . nodeIDs {
969
+ func (c * OperatorDebugCommand ) collectPprofs (client * api.Client , serverIDs , nodeIDs [] string , interval int ) {
970
+ for _ , n := range nodeIDs {
935
971
c .collectPprof (clientDir , n , client , interval )
936
972
}
937
973
938
- for _ , n := range c . serverIDs {
974
+ for _ , n := range serverIDs {
939
975
c .collectPprof (serverDir , n , client , interval )
940
976
}
941
977
}
@@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client,
987
1023
c .savePprofProfile (path , "heap" , opts , client ) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
988
1024
c .savePprofProfile (path , "allocs" , opts , client ) // A sampling of all past memory allocations
989
1025
c .savePprofProfile (path , "threadcreate" , opts , client ) // Stack traces that led to the creation of new OS threads
990
-
991
- // This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
992
- // c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives
993
-
994
- // This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
995
- // c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
996
1026
}
997
1027
998
1028
// savePprofProfile retrieves a pprof profile and writes to disk
@@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool {
1714
1744
const redirectErr string = `invalid character '<' looking for beginning of value`
1715
1745
return strings .Contains (err .Error (), redirectErr )
1716
1746
}
1747
+
1748
+ // getNomadVersion fetches the version of Nomad running on a given server/client node ID
1749
+ func (c * OperatorDebugCommand ) getNomadVersion (serverID string , nodeID string ) string {
1750
+ if serverID == "" && nodeID == "" {
1751
+ return ""
1752
+ }
1753
+
1754
+ version := ""
1755
+ if serverID != "" {
1756
+ for _ , server := range c .members .Members {
1757
+ // Raft v2 server
1758
+ if server .Name == serverID {
1759
+ version = server .Tags ["build" ]
1760
+ }
1761
+
1762
+ // Raft v3 server
1763
+ if server .Tags ["id" ] == serverID {
1764
+ version = server .Tags ["version" ]
1765
+ }
1766
+ }
1767
+ }
1768
+
1769
+ if nodeID != "" {
1770
+ for _ , node := range c .nodes {
1771
+ if node .ID == nodeID {
1772
+ version = node .Version
1773
+ }
1774
+ }
1775
+ }
1776
+
1777
+ return version
1778
+ }
1779
+
1780
+ // checkVersion verifies that version satisfies the constraint
1781
+ func checkVersion (version string , versionConstraint string ) error {
1782
+ v , err := goversion .NewVersion (version )
1783
+ if err != nil {
1784
+ return fmt .Errorf ("error: %v" , err )
1785
+ }
1786
+
1787
+ c , err := goversion .NewConstraint (versionConstraint )
1788
+ if err != nil {
1789
+ return fmt .Errorf ("error: %v" , err )
1790
+ }
1791
+
1792
+ if ! c .Check (v ) {
1793
+ return nil
1794
+ }
1795
+ return fmt .Errorf ("unsupported version=%s matches version filter %s" , version , minimumVersionPprofConstraint )
1796
+ }
0 commit comments