From 2999704ca0d10ebeb0caac9794736898eb339c9c Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Thu, 17 Nov 2016 12:25:15 -0800 Subject: [PATCH] orchestrator/global: Fix deadlock on updates The updater is wrongly called from inside a store transaction, which can lead to a deadlock if an update is already running. The new update tries to cancel the running one, but that existing update may be stuck trying to start a store transaction and therefore the new update waits forever for it to stop. To fix this, keep track of tasks that need to be updated, and call the updater outside the transaction. Signed-off-by: Aaron Lehmann --- manager/orchestrator/global/global.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/manager/orchestrator/global/global.go b/manager/orchestrator/global/global.go index 25329ff9ea..28c2a436d3 100644 --- a/manager/orchestrator/global/global.go +++ b/manager/orchestrator/global/global.go @@ -240,6 +240,8 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin } }) + updates := make(map[*api.Service][]orchestrator.Slot) + _, err := g.store.Batch(func(batch *store.Batch) error { var updateTasks []orchestrator.Slot for _, serviceID := range serviceIDs { @@ -274,8 +276,9 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin updateTasks = append(updateTasks, ntasks) } } + if len(updateTasks) > 0 { - g.updater.Update(ctx, g.cluster, service.Service, updateTasks) + updates[service.Service] = updateTasks } // Remove any tasks assigned to nodes not found in g.nodes. @@ -287,9 +290,15 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin } return nil }) + if err != nil { log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices transaction failed") } + + for service, updateTasks := range updates { + g.updater.Update(ctx, g.cluster, service, updateTasks) + } + } // updateNode updates g.nodes based on the current node value