@@ -38,6 +38,7 @@ type csiPluginSupervisorHook struct {
38
38
39
39
// eventEmitter is used to emit events to the task
40
40
eventEmitter ti.EventEmitter
41
+ lifecycle ti.TaskLifecycle
41
42
42
43
shutdownCtx context.Context
43
44
shutdownCancelFn context.CancelFunc
@@ -54,6 +55,7 @@ type csiPluginSupervisorHookConfig struct {
54
55
clientStateDirPath string
55
56
events ti.EventEmitter
56
57
runner * TaskRunner
58
+ lifecycle ti.TaskLifecycle
57
59
capabilities * drivers.Capabilities
58
60
logger hclog.Logger
59
61
}
@@ -90,6 +92,7 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi
90
92
hook := & csiPluginSupervisorHook {
91
93
alloc : config .runner .Alloc (),
92
94
runner : config .runner ,
95
+ lifecycle : config .lifecycle ,
93
96
logger : config .logger ,
94
97
task : task ,
95
98
mountPoint : pluginRoot ,
@@ -201,27 +204,41 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {
201
204
}()
202
205
203
206
socketPath := filepath .Join (h .mountPoint , structs .CSISocketName )
207
+
208
+ client := csi .NewClient (socketPath , h .logger .Named ("csi_client" ).With (
209
+ "plugin.name" , h .task .CSIPluginConfig .ID ,
210
+ "plugin.type" , h .task .CSIPluginConfig .Type ))
211
+ defer client .Close ()
212
+
204
213
t := time .NewTimer (0 )
205
214
215
+ // We're in Poststart at this point, so if we can't connect within
216
+ // this deadline, assume it's broken so we can restart the task
217
+ startCtx , startCancelFn := context .WithTimeout (ctx , 30 * time .Second )
218
+ defer startCancelFn ()
219
+
220
+ var err error
221
+ var pluginHealthy bool
222
+
206
223
// Step 1: Wait for the plugin to initially become available.
207
224
WAITFORREADY:
208
225
for {
209
226
select {
210
- case <- ctx .Done ():
227
+ case <- startCtx .Done ():
228
+ h .kill (ctx , fmt .Errorf ("CSI plugin failed probe: %v" , err ))
211
229
return
212
230
case <- t .C :
213
- pluginHealthy , err : = h .supervisorLoopOnce (ctx , socketPath )
231
+ pluginHealthy , err = h .supervisorLoopOnce (startCtx , client )
214
232
if err != nil || ! pluginHealthy {
215
- h .logger .Debug ("CSI Plugin not ready" , "error" , err )
216
-
217
- // Plugin is not yet returning healthy, because we want to optimise for
218
- // quickly bringing a plugin online, we use a short timeout here.
219
- // TODO(dani): Test with more plugins and adjust.
233
+ h .logger .Debug ("CSI plugin not ready" , "error" , err )
234
+ // Use only a short delay here to optimize for quickly
235
+ // bringing up a plugin
220
236
t .Reset (5 * time .Second )
221
237
continue
222
238
}
223
239
224
240
// Mark the plugin as healthy in a task event
241
+ h .logger .Debug ("CSI plugin is ready" )
225
242
h .previousHealthState = pluginHealthy
226
243
event := structs .NewTaskEvent (structs .TaskPluginHealthy )
227
244
event .SetMessage (fmt .Sprintf ("plugin: %s" , h .task .CSIPluginConfig .ID ))
@@ -232,15 +249,14 @@ WAITFORREADY:
232
249
}
233
250
234
251
// Step 2: Register the plugin with the catalog.
235
- deregisterPluginFn , err := h .registerPlugin (socketPath )
252
+ deregisterPluginFn , err := h .registerPlugin (client , socketPath )
236
253
if err != nil {
237
- h .logger .Error ("CSI Plugin registration failed" , "error" , err )
238
- event := structs .NewTaskEvent (structs .TaskPluginUnhealthy )
239
- event .SetMessage (fmt .Sprintf ("failed to register plugin: %s, reason: %v" , h .task .CSIPluginConfig .ID , err ))
240
- h .eventEmitter .EmitEvent (event )
254
+ h .kill (ctx , fmt .Errorf ("CSI plugin failed to register: %v" , err ))
255
+ return
241
256
}
242
257
243
- // Step 3: Start the lightweight supervisor loop.
258
+ // Step 3: Start the lightweight supervisor loop. At this point, failures
259
+ // don't cause the task to restart
244
260
t .Reset (0 )
245
261
for {
246
262
select {
@@ -249,9 +265,9 @@ WAITFORREADY:
249
265
deregisterPluginFn ()
250
266
return
251
267
case <- t .C :
252
- pluginHealthy , err := h .supervisorLoopOnce (ctx , socketPath )
268
+ pluginHealthy , err := h .supervisorLoopOnce (ctx , client )
253
269
if err != nil {
254
- h .logger .Error ("CSI Plugin fingerprinting failed" , "error" , err )
270
+ h .logger .Error ("CSI plugin fingerprinting failed" , "error" , err )
255
271
}
256
272
257
273
// The plugin has transitioned to a healthy state. Emit an event.
@@ -265,7 +281,7 @@ WAITFORREADY:
265
281
if h .previousHealthState && ! pluginHealthy {
266
282
event := structs .NewTaskEvent (structs .TaskPluginUnhealthy )
267
283
if err != nil {
268
- event .SetMessage (fmt .Sprintf ("error : %v" , err ))
284
+ event .SetMessage (fmt .Sprintf ("Error : %v" , err ))
269
285
} else {
270
286
event .SetMessage ("Unknown Reason" )
271
287
}
@@ -281,16 +297,9 @@ WAITFORREADY:
281
297
}
282
298
}
283
299
284
- func (h * csiPluginSupervisorHook ) registerPlugin (socketPath string ) (func (), error ) {
285
-
300
+ func (h * csiPluginSupervisorHook ) registerPlugin (client csi.CSIPlugin , socketPath string ) (func (), error ) {
286
301
// At this point we know the plugin is ready and we can fingerprint it
287
302
// to get its vendor name and version
288
- client , err := csi .NewClient (socketPath , h .logger .Named ("csi_client" ).With ("plugin.name" , h .task .CSIPluginConfig .ID , "plugin.type" , h .task .CSIPluginConfig .Type ))
289
- if err != nil {
290
- return nil , fmt .Errorf ("failed to create csi client: %v" , err )
291
- }
292
- defer client .Close ()
293
-
294
303
info , err := client .PluginInfo ()
295
304
if err != nil {
296
305
return nil , fmt .Errorf ("failed to probe plugin: %v" , err )
@@ -354,21 +363,13 @@ func (h *csiPluginSupervisorHook) registerPlugin(socketPath string) (func(), err
354
363
}, nil
355
364
}
356
365
357
- func (h * csiPluginSupervisorHook ) supervisorLoopOnce (ctx context.Context , socketPath string ) (bool , error ) {
358
- _ , err := os .Stat (socketPath )
359
- if err != nil {
360
- return false , fmt .Errorf ("failed to stat socket: %v" , err )
361
- }
366
+ func (h * csiPluginSupervisorHook ) supervisorLoopOnce (ctx context.Context , client csi.CSIPlugin ) (bool , error ) {
367
+ probeCtx , probeCancelFn := context .WithTimeout (ctx , 5 * time .Second )
368
+ defer probeCancelFn ()
362
369
363
- client , err := csi . NewClient ( socketPath , h . logger . Named ( "csi_client" ). With ( "plugin.name" , h . task . CSIPluginConfig . ID , "plugin.type" , h . task . CSIPluginConfig . Type ) )
370
+ healthy , err := client . PluginProbe ( probeCtx )
364
371
if err != nil {
365
- return false , fmt .Errorf ("failed to create csi client: %v" , err )
366
- }
367
- defer client .Close ()
368
-
369
- healthy , err := client .PluginProbe (ctx )
370
- if err != nil {
371
- return false , fmt .Errorf ("failed to probe plugin: %v" , err )
372
+ return false , err
372
373
}
373
374
374
375
return healthy , nil
@@ -387,6 +388,21 @@ func (h *csiPluginSupervisorHook) Stop(_ context.Context, req *interfaces.TaskSt
387
388
return nil
388
389
}
389
390
391
+ func (h * csiPluginSupervisorHook ) kill (ctx context.Context , reason error ) {
392
+ h .logger .Error ("killing task because plugin failed" , "error" , reason )
393
+ event := structs .NewTaskEvent (structs .TaskPluginUnhealthy )
394
+ event .SetMessage (fmt .Sprintf ("Error: %v" , reason .Error ()))
395
+ h .eventEmitter .EmitEvent (event )
396
+
397
+ if err := h .lifecycle .Kill (ctx ,
398
+ structs .NewTaskEvent (structs .TaskKilling ).
399
+ SetFailsTask ().
400
+ SetDisplayMessage ("CSI plugin did not become healthy before timeout" ),
401
+ ); err != nil {
402
+ h .logger .Error ("failed to kill task" , "kill_reason" , reason , "error" , err )
403
+ }
404
+ }
405
+
390
406
func ensureMountpointInserted (mounts []* drivers.MountConfig , mount * drivers.MountConfig ) []* drivers.MountConfig {
391
407
for _ , mnt := range mounts {
392
408
if mnt .IsEqual (mount ) {
0 commit comments