Skip to content

Commit b66a341

Browse files
committed
scheduler/csi: fix early return when multiple volumes are requested
When multiple CSI volumes are requested, the feasibility check could return early for read/write volumes with free claims, even if a later volume in the request was infeasible for some other reason (including not existing at all). As a result, the check could intermittently pass when it should have failed, depending on how the map of volumes happened to be ordered at runtime. Remove the early return from the feasibility check. Add a test to verify that a missing volume in the map causes a failure; because map iteration order is random, this test will not catch a regression on every run, but a regression will be caught over the course of several CI runs.
1 parent ded978c commit b66a341

File tree

3 files changed

+27
-9
lines changed

3 files changed

+27
-9
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ BUG FIXES:
66
* cli: Fixed a bug where non-int proxy port would panic CLI [[GH-10072](https://github.com/hashicorp/nomad/issues/10072)]
77
* cli: Fixed a bug where `nomad operator debug` incorrectly parsed https Consul API URLs. [[GH-10082](https://github.com/hashicorp/nomad/pull/10082)]
88
* client: Fixed log formatting when killing tasks. [[GH-10135](https://github.com/hashicorp/nomad/issues/10135)]
9+
* scheduler: Fixed a bug where jobs requesting multiple CSI volumes could be incorrectly scheduled if only one of the volumes passed feasibility checking. [[GH-10143](https://github.com/hashicorp/nomad/issues/10143)]
910
* ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)]
1011

1112
IMPROVEMENTS:

scheduler/feasible.go

+9-9
Original file line numberDiff line numberDiff line change
@@ -312,15 +312,15 @@ func (c *CSIVolumeChecker) hasPlugins(n *structs.Node) (bool, string) {
312312
if !vol.WriteSchedulable() {
313313
return false, fmt.Sprintf(FilterConstraintCSIVolumeNoWriteTemplate, vol.ID)
314314
}
315-
if vol.WriteFreeClaims() {
316-
return true, ""
317-
}
318-
319-
// Check the blocking allocations to see if they belong to this job
320-
for id := range vol.WriteAllocs {
321-
a, err := c.ctx.State().AllocByID(ws, id)
322-
if err != nil || a == nil || a.Namespace != c.namespace || a.JobID != c.jobID {
323-
return false, fmt.Sprintf(FilterConstraintCSIVolumeInUseTemplate, vol.ID)
315+
if !vol.WriteFreeClaims() {
316+
// Check the blocking allocations to see if they belong to this job
317+
for id := range vol.WriteAllocs {
318+
a, err := c.ctx.State().AllocByID(ws, id)
319+
if err != nil || a == nil ||
320+
a.Namespace != c.namespace || a.JobID != c.jobID {
321+
return false, fmt.Sprintf(
322+
FilterConstraintCSIVolumeInUseTemplate, vol.ID)
323+
}
324324
}
325325
}
326326
}

scheduler/feasible_test.go

+17
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,23 @@ func TestCSIVolumeChecker(t *testing.T) {
395395
t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result)
396396
}
397397
}
398+
399+
// add a missing volume
400+
volumes["missing"] = &structs.VolumeRequest{
401+
Type: "csi",
402+
Name: "bar",
403+
Source: "does-not-exist",
404+
}
405+
406+
checker = NewCSIVolumeChecker(ctx)
407+
checker.SetNamespace(structs.DefaultNamespace)
408+
409+
for _, node := range nodes {
410+
checker.SetVolumes(volumes)
411+
act := checker.Feasible(node)
412+
require.False(t, act, "request with missing volume should never be feasible")
413+
}
414+
398415
}
399416

400417
func TestNetworkChecker(t *testing.T) {

0 commit comments

Comments
 (0)