Skip to content

Commit 7c75696

Browse files
authored
CSI: unique volume per allocation
Add a `PerAlloc` field to volume requests that directs the scheduler to test feasibility for volumes with a source ID that includes the allocation index suffix (e.g. `[0]`), rather than the exact source ID. The client reads the `PerAlloc` field when making the volume claim to determine whether the allocation index suffix (e.g. `[0]`) should be appended to the volume source ID.
1 parent a1eaad9 commit 7c75696

File tree

25 files changed

+329
-71
lines changed

25 files changed

+329
-71
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ BUG FIXES:
1414
IMPROVEMENTS:
1515
* cli: Update defaults for `nomad operator debug` flags `-interval` and `-server-id` to match common usage. [[GH-10121](https://github.com/hashicorp/nomad/issues/10121)]
1616
* consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)]
17+
* csi: Added support for jobs to request a unique volume ID per allocation. [[GH-10136](https://github.com/hashicorp/nomad/issues/10136)]
1718
* driver/docker: Added support for optional extra container labels. [[GH-9885](https://github.com/hashicorp/nomad/issues/9885)]
1819
* driver/docker: Added support for configuring default logger behavior in the client configuration. [[GH-10156](https://github.com/hashicorp/nomad/issues/10156)]
1920

api/tasks.go

+1
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ type VolumeRequest struct {
382382
Source string `hcl:"source,optional"`
383383
ReadOnly bool `hcl:"read_only,optional"`
384384
MountOptions *CSIMountOptions `hcl:"mount_options,block"`
385+
PerAlloc bool `hcl:"per_alloc,optional"`
385386
ExtraKeysHCL []string `hcl1:",unusedKeys,optional" json:"-"`
386387
}
387388

client/allocrunner/csi_hook.go

+13-2
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,14 @@ func (c *csiHook) Postrun() error {
9292
mode = structs.CSIVolumeClaimWrite
9393
}
9494

95+
source := pair.request.Source
96+
if pair.request.PerAlloc {
97+
// NOTE: PerAlloc can't be set if we have canaries
98+
source = source + structs.AllocSuffix(c.alloc.Name)
99+
}
100+
95101
req := &structs.CSIVolumeUnpublishRequest{
96-
VolumeID: pair.request.Source,
102+
VolumeID: source,
97103
Claim: &structs.CSIVolumeClaim{
98104
AllocationID: c.alloc.ID,
99105
NodeID: c.alloc.NodeID,
@@ -159,8 +165,13 @@ func (c *csiHook) claimVolumesFromAlloc() (map[string]*volumeAndRequest, error)
159165
claimType = structs.CSIVolumeClaimRead
160166
}
161167

168+
source := pair.request.Source
169+
if pair.request.PerAlloc {
170+
source = source + structs.AllocSuffix(c.alloc.Name)
171+
}
172+
162173
req := &structs.CSIVolumeClaimRequest{
163-
VolumeID: pair.request.Source,
174+
VolumeID: source,
164175
AllocationID: c.alloc.ID,
165176
NodeID: c.alloc.NodeID,
166177
Claim: claimType,

command/agent/job_endpoint.go

+1
Original file line numberDiff line numberDiff line change
@@ -944,6 +944,7 @@ func ApiTgToStructsTG(job *structs.Job, taskGroup *api.TaskGroup, tg *structs.Ta
944944
Type: v.Type,
945945
ReadOnly: v.ReadOnly,
946946
Source: v.Source,
947+
PerAlloc: v.PerAlloc,
947948
}
948949

949950
if v.MountOptions != nil {

e2e/csi/input/use-ebs-volume.nomad

+3-2
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ job "use-ebs-volume" {
99

1010
group "group" {
1111
volume "test" {
12-
type = "csi"
13-
source = "ebs-vol0"
12+
type = "csi"
13+
source = "ebs-vol"
14+
per_alloc = true
1415
}
1516

1617
task "task" {

e2e/terraform/compute.tf

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ data "external" "packer_sha" {
6363
sha=$(git log -n 1 --pretty=format:%H packer)
6464
echo "{\"sha\":\"$${sha}\"}"
6565
EOT
66-
]
66+
]
6767

6868
}
6969

e2e/terraform/terraform.tfvars

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,5 @@ nomad_local_binary = "" # overrides nomad_sha and nomad_version if set
1515

1616
# Example overrides:
1717
# nomad_sha = "38e23b62a7700c96f4898be777543869499fea0a"
18-
# nomad_local_binary = "../../pkg/linux_amd/nomad"
18+
# nomad_local_binary = "../../pkg/linux_amd64/nomad"
1919
# nomad_local_binary_client_windows_2016_amd64 = ["../../pkg/windows_amd64/nomad.exe"]

e2e/terraform/volumes.tf

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ data "template_file" "ebs_volume_hcl" {
3030
count = var.volumes ? 1 : 0
3131
template = <<EOT
3232
type = "csi"
33-
id = "ebs-vol0"
34-
name = "ebs-vol0"
33+
id = "ebs-vol[0]"
34+
name = "ebs-vol"
3535
external_id = "${aws_ebs_volume.csi[0].id}"
3636
access_mode = "single-node-writer"
3737
attachment_mode = "file-system"

nomad/state/state_store.go

+1-4
Original file line numberDiff line numberDiff line change
@@ -2135,10 +2135,7 @@ func (s *StateStore) CSIVolumeByID(ws memdb.WatchSet, namespace, id string) (*st
21352135
if obj == nil {
21362136
return nil, nil
21372137
}
2138-
vol, ok := obj.(*structs.CSIVolume)
2139-
if !ok {
2140-
return nil, fmt.Errorf("volume row conversion error")
2141-
}
2138+
vol := obj.(*structs.CSIVolume)
21422139

21432140
// we return the volume with the plugins denormalized by default,
21442141
// because the scheduler needs them for feasibility checking

nomad/structs/diff_test.go

+7
Original file line numberDiff line numberDiff line change
@@ -3639,6 +3639,7 @@ func TestTaskGroupDiff(t *testing.T) {
36393639
Type: "host",
36403640
Source: "foo-src",
36413641
ReadOnly: true,
3642+
PerAlloc: true,
36423643
},
36433644
},
36443645
},
@@ -3656,6 +3657,12 @@ func TestTaskGroupDiff(t *testing.T) {
36563657
Old: "",
36573658
New: "foo",
36583659
},
3660+
{
3661+
Type: DiffTypeAdded,
3662+
Name: "PerAlloc",
3663+
Old: "",
3664+
New: "true",
3665+
},
36593666
{
36603667
Type: DiffTypeAdded,
36613668
Name: "ReadOnly",

nomad/structs/funcs.go

+11
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,17 @@ func AllocName(job, group string, idx uint) string {
329329
return fmt.Sprintf("%s.%s[%d]", job, group, idx)
330330
}
331331

332+
// AllocSuffix returns the alloc index suffix that was added by the AllocName
333+
// function above.
334+
func AllocSuffix(name string) string {
335+
idx := strings.LastIndex(name, "[")
336+
if idx == -1 {
337+
return ""
338+
}
339+
suffix := name[idx:]
340+
return suffix
341+
}
342+
332343
// ACLPolicyListHash returns a consistent hash for a set of policies.
333344
func ACLPolicyListHash(policies []*ACLPolicy) string {
334345
cacheKeyHash, err := blake2b.New256(nil)

nomad/structs/structs.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -6100,14 +6100,19 @@ func (tg *TaskGroup) Validate(j *Job) error {
61006100
mErr.Errors = append(mErr.Errors, fmt.Errorf("Only one task may be marked as leader"))
61016101
}
61026102

6103-
// Validate the Host Volumes
6103+
// Validate the volume requests
61046104
for name, decl := range tg.Volumes {
61056105
if !(decl.Type == VolumeTypeHost ||
61066106
decl.Type == VolumeTypeCSI) {
61076107
mErr.Errors = append(mErr.Errors, fmt.Errorf("Volume %s has unrecognised type %s", name, decl.Type))
61086108
continue
61096109
}
61106110

6111+
if decl.PerAlloc && tg.Update != nil && tg.Update.Canary > 0 {
6112+
mErr.Errors = append(mErr.Errors,
6113+
fmt.Errorf("Volume %s cannot be per_alloc when canaries are in use", name))
6114+
}
6115+
61116116
if decl.Source == "" {
61126117
mErr.Errors = append(mErr.Errors, fmt.Errorf("Volume %s has an empty source", name))
61136118
}

nomad/structs/structs_test.go

+22
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,28 @@ func TestTaskGroup_Validate(t *testing.T) {
11061106
err = tg.Validate(&Job{})
11071107
require.Contains(t, err.Error(), `Volume foo has an empty source`)
11081108

1109+
tg = &TaskGroup{
1110+
Name: "group-a",
1111+
Update: &UpdateStrategy{
1112+
Canary: 1,
1113+
},
1114+
Volumes: map[string]*VolumeRequest{
1115+
"foo": {
1116+
Type: "csi",
1117+
PerAlloc: true,
1118+
},
1119+
},
1120+
Tasks: []*Task{
1121+
{
1122+
Name: "task-a",
1123+
Resources: &Resources{},
1124+
},
1125+
},
1126+
}
1127+
err = tg.Validate(&Job{})
1128+
require.Contains(t, err.Error(), `Volume foo has an empty source`)
1129+
require.Contains(t, err.Error(), `Volume foo cannot be per_alloc when canaries are in use`)
1130+
11091131
tg = &TaskGroup{
11101132
Volumes: map[string]*VolumeRequest{
11111133
"foo": {

nomad/structs/volumes.go

+1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ type VolumeRequest struct {
9191
Source string
9292
ReadOnly bool
9393
MountOptions *CSIMountOptions
94+
PerAlloc bool
9495
}
9596

9697
func (v *VolumeRequest) Copy() *VolumeRequest {

scheduler/feasible.go

+14-7
Original file line numberDiff line numberDiff line change
@@ -227,31 +227,38 @@ func (c *CSIVolumeChecker) SetNamespace(namespace string) {
227227
c.namespace = namespace
228228
}
229229

230-
func (c *CSIVolumeChecker) SetVolumes(volumes map[string]*structs.VolumeRequest) {
230+
func (c *CSIVolumeChecker) SetVolumes(allocName string, volumes map[string]*structs.VolumeRequest) {
231+
231232
xs := make(map[string]*structs.VolumeRequest)
233+
232234
// Filter to only CSI Volumes
233235
for alias, req := range volumes {
234236
if req.Type != structs.VolumeTypeCSI {
235237
continue
236238
}
237-
238-
xs[alias] = req
239+
if req.PerAlloc {
240+
// provide a unique volume source per allocation
241+
copied := req.Copy()
242+
copied.Source = copied.Source + structs.AllocSuffix(allocName)
243+
xs[alias] = copied
244+
} else {
245+
xs[alias] = req
246+
}
239247
}
240248
c.volumes = xs
241249
}
242250

243251
func (c *CSIVolumeChecker) Feasible(n *structs.Node) bool {
244-
hasPlugins, failReason := c.hasPlugins(n)
245-
246-
if hasPlugins {
252+
ok, failReason := c.isFeasible(n)
253+
if ok {
247254
return true
248255
}
249256

250257
c.ctx.Metrics().FilterNode(n, failReason)
251258
return false
252259
}
253260

254-
func (c *CSIVolumeChecker) hasPlugins(n *structs.Node) (bool, string) {
261+
func (c *CSIVolumeChecker) isFeasible(n *structs.Node) (bool, string) {
255262
// We can mount the volume if
256263
// - if required, a healthy controller plugin is running the driver
257264
// - the volume has free claims, or this job owns the claims

scheduler/feasible_test.go

+16-3
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,13 @@ func TestCSIVolumeChecker(t *testing.T) {
309309
require.NoError(t, err)
310310
index++
311311

312+
vid3 := "volume-id[0]"
313+
vol3 := vol.Copy()
314+
vol3.ID = vid3
315+
err = state.CSIVolumeRegister(index, []*structs.CSIVolume{vol3})
316+
require.NoError(t, err)
317+
index++
318+
312319
alloc := mock.Alloc()
313320
alloc.NodeID = nodes[4].ID
314321
alloc.Job.TaskGroups[0].Volumes = map[string]*structs.VolumeRequest{
@@ -332,11 +339,17 @@ func TestCSIVolumeChecker(t *testing.T) {
332339
noVolumes := map[string]*structs.VolumeRequest{}
333340

334341
volumes := map[string]*structs.VolumeRequest{
335-
"baz": {
342+
"shared": {
336343
Type: "csi",
337344
Name: "baz",
338345
Source: "volume-id",
339346
},
347+
"unique": {
348+
Type: "csi",
349+
Name: "baz",
350+
Source: "volume-id[0]",
351+
PerAlloc: true,
352+
},
340353
"nonsense": {
341354
Type: "host",
342355
Name: "nonsense",
@@ -390,7 +403,7 @@ func TestCSIVolumeChecker(t *testing.T) {
390403
}
391404

392405
for i, c := range cases {
393-
checker.SetVolumes(c.RequestedVolumes)
406+
checker.SetVolumes(alloc.Name, c.RequestedVolumes)
394407
if act := checker.Feasible(c.Node); act != c.Result {
395408
t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result)
396409
}
@@ -407,7 +420,7 @@ func TestCSIVolumeChecker(t *testing.T) {
407420
checker.SetNamespace(structs.DefaultNamespace)
408421

409422
for _, node := range nodes {
410-
checker.SetVolumes(volumes)
423+
checker.SetVolumes(alloc.Name, volumes)
411424
act := checker.Feasible(node)
412425
require.False(t, act, "request with missing volume should never be feasible")
413426
}

scheduler/generic_sched.go

+1
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul
547547

548548
// Compute penalty nodes for rescheduled allocs
549549
selectOptions := getSelectOptions(prevAllocation, preferredNode)
550+
selectOptions.AllocName = missing.Name()
550551
option := s.selectNextOption(tg, selectOptions)
551552

552553
// Store the available nodes by datacenter

0 commit comments

Comments
 (0)