Skip to content

Commit 8e25502

Browse files
authored
Merge pull request #13786 from hashicorp/b-metrics-for-classless-blocked-evals
metrics: classless blocked evals get metrics
2 parents cd047cd + 582a8a9 commit 8e25502

File tree

4 files changed

+70
-47
lines changed

4 files changed

+70
-47
lines changed

.changelog/13786.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:bug
2+
metrics: Fixed a bug where blocked evals with no class produced no dc:class scope metrics
3+
```

nomad/blocked_evals.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ import (
44
"sync"
55
"time"
66

7-
metrics "github.com/armon/go-metrics"
8-
log "github.com/hashicorp/go-hclog"
7+
"github.com/armon/go-metrics"
8+
"github.com/hashicorp/go-hclog"
99
"github.com/hashicorp/nomad/helper"
1010
"github.com/hashicorp/nomad/nomad/structs"
1111
)
@@ -31,7 +31,7 @@ const (
3131
// failed allocation becomes available.
3232
type BlockedEvals struct {
3333
// logger is the logger to use by the blocked eval tracker.
34-
logger log.Logger
34+
logger hclog.Logger
3535

3636
evalBroker *EvalBroker
3737
enabled bool
@@ -96,7 +96,7 @@ type wrappedEval struct {
9696

9797
// NewBlockedEvals creates a new blocked eval tracker that will enqueue
9898
// unblocked evals into the passed broker.
99-
func NewBlockedEvals(evalBroker *EvalBroker, logger log.Logger) *BlockedEvals {
99+
func NewBlockedEvals(evalBroker *EvalBroker, logger hclog.Logger) *BlockedEvals {
100100
return &BlockedEvals{
101101
logger: logger.Named("blocked_evals"),
102102
evalBroker: evalBroker,
@@ -727,7 +727,7 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
727727
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels)
728728
}
729729

730-
for k, v := range stats.BlockedResources.ByNode {
730+
for k, v := range stats.BlockedResources.ByClassInDC {
731731
labels := []metrics.Label{
732732
{Name: "datacenter", Value: k.dc},
733733
{Name: "node_class", Value: k.class},

nomad/blocked_evals_stats.go

+23-19
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ type BlockedStats struct {
2424
BlockedResources *BlockedResourcesStats
2525
}
2626

27-
// node stores information related to nodes.
28-
type node struct {
27+
// classInDC is a coordinate of a specific class in a specific datacenter
28+
type classInDC struct {
2929
dc string
3030
class string
3131
}
@@ -65,9 +65,9 @@ func (b *BlockedStats) prune(cutoff time.Time) {
6565
}
6666
}
6767

68-
for k, v := range b.BlockedResources.ByNode {
68+
for k, v := range b.BlockedResources.ByClassInDC {
6969
if shouldPrune(v) {
70-
delete(b.BlockedResources.ByNode, k)
70+
delete(b.BlockedResources.ByClassInDC, k)
7171
}
7272
}
7373
}
@@ -89,6 +89,10 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
8989
for class := range allocMetrics.ClassExhausted {
9090
classes[class] = struct{}{}
9191
}
92+
if len(allocMetrics.ClassExhausted) == 0 {
93+
// some evaluations have no class
94+
classes[""] = struct{}{}
95+
}
9296
for _, r := range allocMetrics.ResourcesExhausted {
9397
resources.CPU += r.CPU
9498
resources.MemoryMB += r.MemoryMB
@@ -99,32 +103,32 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
99103
nsID := structs.NewNamespacedID(eval.JobID, eval.Namespace)
100104
byJob[nsID] = resources
101105

102-
byNodeInfo := make(map[node]BlockedResourcesSummary)
106+
byClassInDC := make(map[classInDC]BlockedResourcesSummary)
103107
for dc := range dcs {
104108
for class := range classes {
105-
k := node{dc: dc, class: class}
106-
byNodeInfo[k] = resources
109+
k := classInDC{dc: dc, class: class}
110+
byClassInDC[k] = resources
107111
}
108112
}
109113

110114
return &BlockedResourcesStats{
111-
ByJob: byJob,
112-
ByNode: byNodeInfo,
115+
ByJob: byJob,
116+
ByClassInDC: byClassInDC,
113117
}
114118
}
115119

116120
// BlockedResourcesStats stores resources requested by blocked evaluations,
117121
// tracked both by job and by node.
118122
type BlockedResourcesStats struct {
119-
ByJob map[structs.NamespacedID]BlockedResourcesSummary
120-
ByNode map[node]BlockedResourcesSummary
123+
ByJob map[structs.NamespacedID]BlockedResourcesSummary
124+
ByClassInDC map[classInDC]BlockedResourcesSummary
121125
}
122126

123127
// NewBlockedResourcesStats returns a new BlockedResourcesStats.
124128
func NewBlockedResourcesStats() *BlockedResourcesStats {
125129
return &BlockedResourcesStats{
126-
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
127-
ByNode: make(map[node]BlockedResourcesSummary),
130+
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
131+
ByClassInDC: make(map[classInDC]BlockedResourcesSummary),
128132
}
129133
}
130134

@@ -136,8 +140,8 @@ func (b *BlockedResourcesStats) Copy() *BlockedResourcesStats {
136140
result.ByJob[k] = v // value copy
137141
}
138142

139-
for k, v := range b.ByNode {
140-
result.ByNode[k] = v // value copy
143+
for k, v := range b.ByClassInDC {
144+
result.ByClassInDC[k] = v // value copy
141145
}
142146

143147
return result
@@ -152,8 +156,8 @@ func (b *BlockedResourcesStats) Add(a *BlockedResourcesStats) *BlockedResourcesS
152156
result.ByJob[k] = b.ByJob[k].Add(v)
153157
}
154158

155-
for k, v := range a.ByNode {
156-
result.ByNode[k] = b.ByNode[k].Add(v)
159+
for k, v := range a.ByClassInDC {
160+
result.ByClassInDC[k] = b.ByClassInDC[k].Add(v)
157161
}
158162

159163
return result
@@ -168,8 +172,8 @@ func (b *BlockedResourcesStats) Subtract(a *BlockedResourcesStats) *BlockedResou
168172
result.ByJob[k] = b.ByJob[k].Subtract(v)
169173
}
170174

171-
for k, v := range a.ByNode {
172-
result.ByNode[k] = b.ByNode[k].Subtract(v)
175+
for k, v := range a.ByClassInDC {
176+
result.ByClassInDC[k] = b.ByClassInDC[k].Subtract(v)
173177
}
174178

175179
return result

nomad/blocked_evals_stats_test.go

+39-23
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,8 @@ func TestBlockedResourceStats_New(t *testing.T) {
128128
a := NewBlockedResourcesStats()
129129
require.NotNil(t, a.ByJob)
130130
require.Empty(t, a.ByJob)
131-
require.NotNil(t, a.ByNode)
132-
require.Empty(t, a.ByNode)
131+
require.NotNil(t, a.ByClassInDC)
132+
require.Empty(t, a.ByClassInDC)
133133
}
134134

135135
var (
@@ -143,15 +143,20 @@ var (
143143
Namespace: "two",
144144
}
145145

146-
node1 = node{
146+
node1 = classInDC{
147147
dc: "dc1",
148148
class: "alpha",
149149
}
150150

151-
node2 = node{
151+
node2 = classInDC{
152152
dc: "dc1",
153153
class: "beta",
154154
}
155+
156+
node3 = classInDC{
157+
dc: "dc1",
158+
class: "", // not set
159+
}
155160
)
156161

157162
func TestBlockedResourceStats_Copy(t *testing.T) {
@@ -166,7 +171,7 @@ func TestBlockedResourceStats_Copy(t *testing.T) {
166171
MemoryMB: 256,
167172
},
168173
}
169-
a.ByNode = map[node]BlockedResourcesSummary{
174+
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
170175
node1: {
171176
Timestamp: now1,
172177
CPU: 300,
@@ -180,23 +185,23 @@ func TestBlockedResourceStats_Copy(t *testing.T) {
180185
CPU: 888,
181186
MemoryMB: 888,
182187
}
183-
c.ByNode[node1] = BlockedResourcesSummary{
188+
c.ByClassInDC[node1] = BlockedResourcesSummary{
184189
Timestamp: now2,
185190
CPU: 999,
186191
MemoryMB: 999,
187192
}
188193

189194
// underlying data should have been deep copied
190195
require.Equal(t, 100, a.ByJob[id1].CPU)
191-
require.Equal(t, 300, a.ByNode[node1].CPU)
196+
require.Equal(t, 300, a.ByClassInDC[node1].CPU)
192197
}
193198

194199
func TestBlockedResourcesStats_Add(t *testing.T) {
195200
a := NewBlockedResourcesStats()
196201
a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{
197202
id1: {Timestamp: now(1), CPU: 111, MemoryMB: 222},
198203
}
199-
a.ByNode = map[node]BlockedResourcesSummary{
204+
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
200205
node1: {Timestamp: now(2), CPU: 333, MemoryMB: 444},
201206
}
202207

@@ -205,7 +210,7 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
205210
id1: {Timestamp: now(3), CPU: 200, MemoryMB: 300},
206211
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
207212
}
208-
b.ByNode = map[node]BlockedResourcesSummary{
213+
b.ByClassInDC = map[classInDC]BlockedResourcesSummary{
209214
node1: {Timestamp: now(5), CPU: 600, MemoryMB: 700},
210215
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
211216
}
@@ -218,10 +223,10 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
218223
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
219224
}, result.ByJob)
220225

221-
require.Equal(t, map[node]BlockedResourcesSummary{
226+
require.Equal(t, map[classInDC]BlockedResourcesSummary{
222227
node1: {Timestamp: now(5), CPU: 933, MemoryMB: 1144},
223228
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
224-
}, result.ByNode)
229+
}, result.ByClassInDC)
225230
})
226231

227232
// make sure we handle zeros in both directions
@@ -233,20 +238,31 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
233238
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
234239
}, result.ByJob)
235240

236-
require.Equal(t, map[node]BlockedResourcesSummary{
241+
require.Equal(t, map[classInDC]BlockedResourcesSummary{
237242
node1: {Timestamp: now(2), CPU: 933, MemoryMB: 1144},
238243
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
239-
}, result.ByNode)
244+
}, result.ByClassInDC)
240245
})
241246
}
242247

248+
func TestBlockedResourcesStats_Add_NoClass(t *testing.T) {
249+
a := NewBlockedResourcesStats()
250+
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
251+
node3: {Timestamp: now(1), CPU: 111, MemoryMB: 1111},
252+
}
253+
result := a.Add(a)
254+
require.Equal(t, map[classInDC]BlockedResourcesSummary{
255+
node3: {Timestamp: now(1), CPU: 222, MemoryMB: 2222},
256+
}, result.ByClassInDC)
257+
}
258+
243259
func TestBlockedResourcesStats_Subtract(t *testing.T) {
244260
a := NewBlockedResourcesStats()
245261
a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{
246262
id1: {Timestamp: now(1), CPU: 100, MemoryMB: 100},
247263
id2: {Timestamp: now(2), CPU: 200, MemoryMB: 200},
248264
}
249-
a.ByNode = map[node]BlockedResourcesSummary{
265+
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
250266
node1: {Timestamp: now(3), CPU: 300, MemoryMB: 300},
251267
node2: {Timestamp: now(4), CPU: 400, MemoryMB: 400},
252268
}
@@ -256,7 +272,7 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) {
256272
id1: {Timestamp: now(5), CPU: 10, MemoryMB: 11},
257273
id2: {Timestamp: now(6), CPU: 12, MemoryMB: 13},
258274
}
259-
b.ByNode = map[node]BlockedResourcesSummary{
275+
b.ByClassInDC = map[classInDC]BlockedResourcesSummary{
260276
node1: {Timestamp: now(7), CPU: 14, MemoryMB: 15},
261277
node2: {Timestamp: now(8), CPU: 16, MemoryMB: 17},
262278
}
@@ -274,14 +290,14 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) {
274290
require.Equal(t, 187, result.ByJob[id2].MemoryMB)
275291

276292
// node1
277-
require.Equal(t, now(7), result.ByNode[node1].Timestamp)
278-
require.Equal(t, 286, result.ByNode[node1].CPU)
279-
require.Equal(t, 285, result.ByNode[node1].MemoryMB)
293+
require.Equal(t, now(7), result.ByClassInDC[node1].Timestamp)
294+
require.Equal(t, 286, result.ByClassInDC[node1].CPU)
295+
require.Equal(t, 285, result.ByClassInDC[node1].MemoryMB)
280296

281297
// node2
282-
require.Equal(t, now(8), result.ByNode[node2].Timestamp)
283-
require.Equal(t, 384, result.ByNode[node2].CPU)
284-
require.Equal(t, 383, result.ByNode[node2].MemoryMB)
298+
require.Equal(t, now(8), result.ByClassInDC[node2].Timestamp)
299+
require.Equal(t, 384, result.ByClassInDC[node2].CPU)
300+
require.Equal(t, 383, result.ByClassInDC[node2].MemoryMB)
285301
}
286302

287303
// testBlockedEvalsRandomBlockedEval wraps an eval that is randomly generated.
@@ -363,9 +379,9 @@ func clearTimestampFromBlockedResourceStats(b *BlockedResourcesStats) {
363379
v.Timestamp = time.Time{}
364380
b.ByJob[k] = v
365381
}
366-
for k, v := range b.ByNode {
382+
for k, v := range b.ByClassInDC {
367383
v.Timestamp = time.Time{}
368-
b.ByNode[k] = v
384+
b.ByClassInDC[k] = v
369385
}
370386
}
371387

0 commit comments

Comments
 (0)