Skip to content

Commit 71a022a

Browse files
Metrics gotemplate support, debug bundle features (#9067)
* add goroutine text profiles to nomad operator debug * add server-id=all to nomad operator debug * fix bug from changing metrics from string to []byte * Add function to return MetricsSummary struct, metrics gotemplate support * fix bug resolving 'server-id=all' when no servers are available * add url to operator_debug tests * removed test section which is used for future operator_debug.go changes * separate metrics from operator, use only structs from go-metrics * ensure parent directories are created as needed * add suggested comments for text debug pprof * move check down to where it is used * add WaitForFiles helper function to wait for multiple files to exist * compact metrics check Co-authored-by: Drew Bailey <[email protected]> * fix github's silly apply suggestion Co-authored-by: Drew Bailey <[email protected]>
1 parent b2fb40e commit 71a022a

14 files changed

+398
-87
lines changed

api/go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,6 @@ require (
1212
github.com/kr/pretty v0.1.0
1313
github.com/mitchellh/go-testing-interface v1.0.0
1414
github.com/stretchr/testify v1.5.1
15+
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
1516
gopkg.in/yaml.v2 v2.2.8 // indirect
1617
)

api/go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H
2929
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
3030
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
3131
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
32+
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
33+
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
3234
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
3335
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
3436
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=

api/operator.go

-19
Original file line numberDiff line numberDiff line change
@@ -304,22 +304,3 @@ func (op *Operator) LicenseGet(q *QueryOptions) (*LicenseReply, *QueryMeta, erro
304304
}
305305
return &reply, qm, nil
306306
}
307-
308-
// Metrics returns a slice of bytes containing metrics, optionally formatted as either json or prometheus
309-
func (op *Operator) Metrics(q *QueryOptions) ([]byte, error) {
310-
if q == nil {
311-
q = &QueryOptions{}
312-
}
313-
314-
metricsReader, err := op.c.rawQuery("/v1/metrics", q)
315-
if err != nil {
316-
return nil, err
317-
}
318-
319-
metricsBytes, err := ioutil.ReadAll(metricsReader)
320-
if err != nil {
321-
return nil, err
322-
}
323-
324-
return metricsBytes, nil
325-
}

api/operator_metrics.go

+87
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
package api
2+
3+
import (
4+
"io/ioutil"
5+
"time"
6+
)
7+
8+
// MetricsSummary holds a roll-up of metrics info for a given interval
9+
type MetricsSummary struct {
10+
Timestamp string
11+
Gauges []GaugeValue
12+
Points []PointValue
13+
Counters []SampledValue
14+
Samples []SampledValue
15+
}
16+
17+
type GaugeValue struct {
18+
Name string
19+
Hash string `json:"-"`
20+
Value float32
21+
22+
Labels []Label `json:"-"`
23+
DisplayLabels map[string]string `json:"Labels"`
24+
}
25+
26+
// PointValue is a named series of raw float32 points.
type PointValue struct {
	Name   string
	Points []float32
}
30+
31+
type SampledValue struct {
32+
Name string
33+
Hash string `json:"-"`
34+
*AggregateSample
35+
Mean float64
36+
Stddev float64
37+
38+
Labels []Label `json:"-"`
39+
DisplayLabels map[string]string `json:"Labels"`
40+
}
41+
42+
// AggregateSample is used to hold aggregate metrics
// about a sample.
//
// SumSq and LastUpdated carry the `json:"-"` tag and therefore take no part
// in JSON encoding or decoding.
type AggregateSample struct {
	Count       int       // The count of emitted pairs
	Rate        float64   // The values rate per time unit (usually 1 second)
	Sum         float64   // The sum of values
	SumSq       float64   `json:"-"` // The sum of squared values
	Min         float64   // Minimum value
	Max         float64   // Maximum value
	LastUpdated time.Time `json:"-"` // When value was last updated
}
53+
54+
// Label is a name/value pair attached to a metric.
type Label struct {
	Name  string
	Value string
}
58+
59+
// Metrics returns a slice of bytes containing metrics, optionally formatted as either json or prometheus
60+
func (op *Operator) Metrics(q *QueryOptions) ([]byte, error) {
61+
if q == nil {
62+
q = &QueryOptions{}
63+
}
64+
65+
metricsReader, err := op.c.rawQuery("/v1/metrics", q)
66+
if err != nil {
67+
return nil, err
68+
}
69+
70+
metricsBytes, err := ioutil.ReadAll(metricsReader)
71+
if err != nil {
72+
return nil, err
73+
}
74+
75+
return metricsBytes, nil
76+
}
77+
78+
// MetricsSummary returns a MetricsSummary struct and query metadata
79+
func (op *Operator) MetricsSummary(q *QueryOptions) (*MetricsSummary, *QueryMeta, error) {
80+
var resp *MetricsSummary
81+
qm, err := op.c.query("/v1/metrics", &resp, q)
82+
if err != nil {
83+
return nil, nil, err
84+
}
85+
86+
return resp, qm, nil
87+
}

api/operator_metrics_test.go

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package api
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/require"
7+
)
8+
9+
func TestOperator_MetricsSummary(t *testing.T) {
10+
t.Parallel()
11+
c, s := makeClient(t, nil, nil)
12+
defer s.Stop()
13+
14+
operator := c.Operator()
15+
qo := &QueryOptions{
16+
Params: map[string]string{
17+
"pretty": "1",
18+
},
19+
}
20+
21+
metrics, qm, err := operator.MetricsSummary(qo)
22+
require.NoError(t, err)
23+
require.NotNil(t, metrics)
24+
require.NotNil(t, qm)
25+
require.NotNil(t, metrics.Timestamp) // should always get a TimeStamp
26+
require.GreaterOrEqual(t, len(metrics.Points), 0) // may not have points yet
27+
require.GreaterOrEqual(t, len(metrics.Gauges), 1) // should have at least 1 gauge
28+
require.GreaterOrEqual(t, len(metrics.Counters), 1) // should have at least 1 counter
29+
require.GreaterOrEqual(t, len(metrics.Samples), 1) // should have at least 1 sample
30+
}
31+
32+
func TestOperator_Metrics_Prometheus(t *testing.T) {
33+
t.Parallel()
34+
c, s := makeClient(t, nil, nil)
35+
defer s.Stop()
36+
37+
operator := c.Operator()
38+
qo := &QueryOptions{
39+
Params: map[string]string{
40+
"format": "prometheus",
41+
},
42+
}
43+
44+
metrics, err := operator.Metrics(qo)
45+
require.NoError(t, err)
46+
require.NotNil(t, metrics)
47+
metricString := string(metrics[:])
48+
require.Containsf(t, metricString, "# HELP", "expected Prometheus format containing \"# HELP\", got: \n%s", metricString)
49+
}

command/metrics.go

+37-9
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ Metrics Specific Options
2828
2929
-format <format>
3030
Specify output format (prometheus)
31+
32+
-json
33+
Output the allocation in its JSON format.
34+
35+
-t
36+
Format and display allocation using a Go template.
37+
3138
`
3239

3340
return strings.TrimSpace(helpText)
@@ -42,19 +49,23 @@ func (c *OperatorMetricsCommand) AutocompleteFlags() complete.Flags {
4249
complete.Flags{
4350
"-pretty": complete.PredictAnything,
4451
"-format": complete.PredictAnything,
52+
"-json": complete.PredictNothing,
53+
"-t": complete.PredictAnything,
4554
})
4655
}
4756

4857
func (c *OperatorMetricsCommand) Name() string { return "metrics" }
4958

5059
func (c *OperatorMetricsCommand) Run(args []string) int {
51-
var pretty bool
52-
var format string
60+
var pretty, json bool
61+
var format, tmpl string
5362

5463
flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
5564
flags.Usage = func() { c.Ui.Output(c.Help()) }
5665
flags.BoolVar(&pretty, "pretty", false, "")
5766
flags.StringVar(&format, "format", "", "")
67+
flags.BoolVar(&json, "json", false, "")
68+
flags.StringVar(&tmpl, "t", "", "")
5869

5970
if err := flags.Parse(args); err != nil {
6071
c.Ui.Error(fmt.Sprintf("Error parsing flags: %s", err))
@@ -88,14 +99,31 @@ func (c *OperatorMetricsCommand) Run(args []string) int {
8899
Params: params,
89100
}
90101

91-
bs, err := client.Operator().Metrics(query)
92-
if err != nil {
93-
c.Ui.Error(fmt.Sprintf("Error getting metrics: %v", err))
94-
return 1
102+
if json || len(tmpl) > 0 {
103+
metrics, _, err := client.Operator().MetricsSummary(query)
104+
if err != nil {
105+
c.Ui.Error(fmt.Sprintf("Error querying metrics: %v", err))
106+
return 1
107+
}
108+
109+
out, err := Format(json, tmpl, metrics)
110+
if err != nil {
111+
c.Ui.Error(err.Error())
112+
return 1
113+
}
114+
115+
c.Ui.Output(out)
116+
return 0
117+
} else {
118+
bs, err := client.Operator().Metrics(query)
119+
if err != nil {
120+
c.Ui.Error(fmt.Sprintf("Error getting metrics: %v", err))
121+
return 1
122+
}
123+
124+
resp := string(bs[:])
125+
c.Ui.Output(resp)
95126
}
96127

97-
resp := string(bs[:])
98-
c.Ui.Output(resp)
99-
100128
return 0
101129
}

command/metrics_test.go

+14
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,20 @@ func TestCommand_Metrics_Cases(t *testing.T) {
2525
expectedOutput string
2626
expectedError string
2727
}{
28+
{
29+
"gotemplate MetricsSummary",
30+
[]string{"-address=" + url, "-t", "'{{ .Timestamp }}'"},
31+
0,
32+
"UTC",
33+
"",
34+
},
35+
{
36+
"json formatted MetricsSummary",
37+
[]string{"-address=" + url, "-json"},
38+
0,
39+
"{",
40+
"",
41+
},
2842
{
2943
"pretty print json",
3044
[]string{"-address=" + url, "-pretty"},

command/operator_debug.go

+60-10
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ Debug Options:
7474
profiles. Accepts id prefixes.
7575
7676
-server-id=<server>,<server>
77-
Comma separated list of Nomad server names, or "leader" to monitor for logs and include pprof
77+
Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof
7878
profiles.
7979
8080
-stale=<true|false>
@@ -251,9 +251,25 @@ func (c *OperatorDebugCommand) Run(args []string) int {
251251
}
252252
}
253253

254-
// Resolve server prefixes
255-
for _, id := range argNodes(serverIDs) {
256-
c.serverIDs = append(c.serverIDs, id)
254+
// Resolve servers
255+
members, err := client.Agent().Members()
256+
c.writeJSON("version", "members.json", members, err)
257+
// We always write the error to the file, but don't range if no members found
258+
if serverIDs == "all" && members != nil {
259+
// Special case to capture from all servers
260+
for _, member := range members.Members {
261+
c.serverIDs = append(c.serverIDs, member.Name)
262+
}
263+
} else {
264+
for _, id := range argNodes(serverIDs) {
265+
c.serverIDs = append(c.serverIDs, id)
266+
}
267+
}
268+
269+
// Return error if servers were specified but not found
270+
if len(serverIDs) > 0 && len(c.serverIDs) == 0 {
271+
c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs))
272+
return 1
257273
}
258274

259275
c.manifest = make([]string, 0)
@@ -267,6 +283,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
267283
stamped := "nomad-debug-" + c.timestamp
268284

269285
c.Ui.Output("Starting debugger and capturing cluster data...")
286+
c.Ui.Output(fmt.Sprintf("Capturing from servers: %v", c.serverIDs))
287+
c.Ui.Output(fmt.Sprintf("Capturing from client nodes: %v", c.nodeIDs))
270288

271289
c.Ui.Output(fmt.Sprintf(" Interval: '%s'", interval))
272290
c.Ui.Output(fmt.Sprintf(" Duration: '%s'", duration))
@@ -499,6 +517,23 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client)
499517
if err == nil {
500518
c.writeBytes(path, "goroutine.prof", bs)
501519
}
520+
521+
// Gather goroutine text output - debug type 1
522+
// debug type 1 writes the legacy text format for human readable output
523+
opts.Debug = 1
524+
bs, err = client.Agent().Lookup("goroutine", opts, nil)
525+
if err == nil {
526+
c.writeBytes(path, "goroutine-debug1.txt", bs)
527+
}
528+
529+
// Gather goroutine text output - debug type 2
530+
// When printing the "goroutine" profile, debug=2 means to print the goroutine
531+
// stacks in the same form that a Go program uses when dying due to an unrecovered panic.
532+
opts.Debug = 2
533+
bs, err = client.Agent().Lookup("goroutine", opts, nil)
534+
if err == nil {
535+
c.writeBytes(path, "goroutine-debug2.txt", bs)
536+
}
502537
}
503538

504539
// collectPeriodic runs for duration, capturing the cluster state every interval. It flushes and stops
@@ -576,8 +611,11 @@ func (c *OperatorDebugCommand) collectNomad(dir string, client *api.Client) erro
576611
vs, _, err := client.CSIVolumes().List(qo)
577612
c.writeJSON(dir, "volumes.json", vs, err)
578613

579-
metrics, err := client.Operator().Metrics(qo)
580-
c.writeJSON(dir, "metrics.json", metrics, err)
614+
if metricBytes, err := client.Operator().Metrics(qo); err != nil {
615+
c.writeError(dir, "metrics.json", err)
616+
} else {
617+
c.writeBytes(dir, "metrics.json", metricBytes)
618+
}
581619

582620
return nil
583621
}
@@ -628,12 +666,24 @@ func (c *OperatorDebugCommand) collectVault(dir, vault string) error {
628666

629667
// writeBytes writes a file to the archive, recording it in the manifest
630668
func (c *OperatorDebugCommand) writeBytes(dir, file string, data []byte) error {
631-
path := filepath.Join(dir, file)
632-
c.manifest = append(c.manifest, path)
633-
path = filepath.Join(c.collectDir, path)
669+
relativePath := filepath.Join(dir, file)
670+
c.manifest = append(c.manifest, relativePath)
671+
dirPath := filepath.Join(c.collectDir, dir)
672+
filePath := filepath.Join(dirPath, file)
673+
674+
// Ensure parent directories exist
675+
err := os.MkdirAll(dirPath, os.ModePerm)
676+
if err != nil {
677+
// Display error immediately -- may not see this if files aren't written
678+
c.Ui.Error(fmt.Sprintf("failed to create parent directories of \"%s\": %s", dirPath, err.Error()))
679+
return err
680+
}
634681

635-
fh, err := os.Create(path)
682+
// Create the file
683+
fh, err := os.Create(filePath)
636684
if err != nil {
685+
// Display error immediately -- may not see this if files aren't written
686+
c.Ui.Error(fmt.Sprintf("failed to create file \"%s\": %s", filePath, err.Error()))
637687
return err
638688
}
639689
defer fh.Close()

0 commit comments

Comments
 (0)