Skip to content

Commit

Permalink
orchestrator/global: Fix deadlock on updates
Browse files Browse the repository at this point in the history
The updater is wrongly called from inside a store transaction, which can
lead to a deadlock if an update is already running. The new update tries
to cancel the running one, but the running update may itself be blocked
waiting to start a store transaction, so the new update waits forever
for it to stop. To fix this, keep track of the tasks that need to be
updated while inside the transaction, and call the updater only after
the transaction has finished.

Signed-off-by: Aaron Lehmann <[email protected]>
  • Loading branch information
aaronlehmann committed Nov 17, 2016
1 parent efd44df commit 2999704
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion manager/orchestrator/global/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin
}
})

updates := make(map[*api.Service][]orchestrator.Slot)

_, err := g.store.Batch(func(batch *store.Batch) error {
var updateTasks []orchestrator.Slot
for _, serviceID := range serviceIDs {
Expand Down Expand Up @@ -274,8 +276,9 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin
updateTasks = append(updateTasks, ntasks)
}
}

if len(updateTasks) > 0 {
- g.updater.Update(ctx, g.cluster, service.Service, updateTasks)
+ updates[service.Service] = updateTasks
}

// Remove any tasks assigned to nodes not found in g.nodes.
Expand All @@ -287,9 +290,15 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin
}
return nil
})

if err != nil {
log.G(ctx).WithError(err).Errorf("global orchestrator: reconcileServices transaction failed")
}

for service, updateTasks := range updates {
g.updater.Update(ctx, g.cluster, service, updateTasks)
}

}

// updateNode updates g.nodes based on the current node value
Expand Down

0 comments on commit 2999704

Please sign in to comment.