Skip to content

Commit 6bfd86b

Browse files
committed
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags associated with a service can be modified through the Catalog API, overriding the value(s) configured in the agent's service configuration. To enable this feature, the flag enable_tag_override must be configured in the service definition. Previously, Nomad did not allow configuring this flag, and thus the default value of false was used. Now, it is configurable. Because Nomad itself acts as a state machine around the the service definitions of the tasks it manages, it's worth describing what happens when this feature is enabled and why. Consider the basic case where there is no Nomad, and your service is provided to consul as a boring JSON file. The ultimate source of truth for the definition of that service is the file, and is stored in the agent. Later, Consul performs "anti-entropy" which synchronizes the Catalog (stored only the leaders). Then with enable_tag_override=true, the tags field is available for "external" modification through the Catalog API (rather than directly configuring the service definition file, or using the Agent API). The important observation is that if the service definition ever changes (i.e. the file is changed & config reloaded OR the Agent API is used to modify the service), those "external" tag values are thrown away, and the new service definition is once again the source of truth. In the Nomad case, Nomad itself is the source of truth over the Agent in the same way the JSON file was the source of truth in the example above. That means any time Nomad sets a new service definition, any externally configured tags are going to be replaced. When does this happen? Only on major lifecycle events, for example when a task is modified because of an updated job spec from the 'nomad job run <existing>' command. Otherwise, Nomad's periodic re-sync's with Consul will now no longer try to restore the externally modified tag values (as long as enable_tag_override=true). Fixes #2057
1 parent 4757f87 commit 6bfd86b

16 files changed

+654
-183
lines changed

api/services.go

+12-11
Original file line numberDiff line numberDiff line change
@@ -97,17 +97,18 @@ type ServiceCheck struct {
9797
// Service represents a Consul service definition.
9898
type Service struct {
9999
//FIXME Id is unused. Remove?
100-
Id string
101-
Name string
102-
Tags []string
103-
CanaryTags []string `mapstructure:"canary_tags"`
104-
PortLabel string `mapstructure:"port"`
105-
AddressMode string `mapstructure:"address_mode"`
106-
Checks []ServiceCheck
107-
CheckRestart *CheckRestart `mapstructure:"check_restart"`
108-
Connect *ConsulConnect
109-
Meta map[string]string
110-
CanaryMeta map[string]string
100+
Id string
101+
Name string
102+
Tags []string
103+
CanaryTags []string `mapstructure:"canary_tags"`
104+
EnableTagOverride bool `mapstructure:"enable_tag_override"`
105+
PortLabel string `mapstructure:"port"`
106+
AddressMode string `mapstructure:"address_mode"`
107+
Checks []ServiceCheck
108+
CheckRestart *CheckRestart `mapstructure:"check_restart"`
109+
Connect *ConsulConnect
110+
Meta map[string]string
111+
CanaryMeta map[string]string
111112
}
112113

113114
// Canonicalize the Service by ensuring its name and address mode are set. Task

api/services_test.go

+25
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@ import (
55
"time"
66

77
"github.com/stretchr/testify/assert"
8+
"github.com/stretchr/testify/require"
89
)
910

1011
// TestService_CheckRestart asserts Service.CheckRestart settings are properly
1112
// inherited by Checks.
1213
func TestService_CheckRestart(t *testing.T) {
14+
t.Parallel()
15+
1316
job := &Job{Name: stringToPtr("job")}
1417
tg := &TaskGroup{Name: stringToPtr("group")}
1518
task := &Task{Name: "task"}
@@ -58,6 +61,8 @@ func TestService_CheckRestart(t *testing.T) {
5861
// TestService_Connect asserts Service.Connect settings are properly
5962
// inherited by Checks.
6063
func TestService_Connect(t *testing.T) {
64+
t.Parallel()
65+
6166
job := &Job{Name: stringToPtr("job")}
6267
tg := &TaskGroup{Name: stringToPtr("group")}
6368
task := &Task{Name: "task"}
@@ -83,3 +88,23 @@ func TestService_Connect(t *testing.T) {
8388
assert.Equal(t, proxy.Upstreams[0].DestinationName, "upstream")
8489
assert.Equal(t, proxy.LocalServicePort, 8000)
8590
}
91+
92+
func TestService_Tags(t *testing.T) {
93+
t.Parallel()
94+
r := require.New(t)
95+
96+
// canonicalize does not modify eto or tags
97+
job := &Job{Name: stringToPtr("job")}
98+
tg := &TaskGroup{Name: stringToPtr("group")}
99+
task := &Task{Name: "task"}
100+
service := &Service{
101+
Tags: []string{"a", "b"},
102+
CanaryTags: []string{"c", "d"},
103+
EnableTagOverride: true,
104+
}
105+
106+
service.Canonicalize(task, tg, job)
107+
r.True(service.EnableTagOverride)
108+
r.Equal([]string{"a", "b"}, service.Tags)
109+
r.Equal([]string{"c", "d"}, service.CanaryTags)
110+
}

client/allocrunner/groupservice_hook.go

+1
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ func (h *groupServiceHook) Prerun() error {
9797
func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
9898
h.mu.Lock()
9999
defer h.mu.Unlock()
100+
100101
oldWorkloadServices := h.getWorkloadServices()
101102

102103
// Store new updated values out of request

command/agent/consul/catalog_testing.go

+14
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,17 @@ func (c *MockAgent) UpdateTTL(id string, output string, status string) error {
204204
c.checkTTLs[id]++
205205
return nil
206206
}
207+
208+
// a convenience method for looking up a registered service by name
209+
func (c *MockAgent) lookupService(name string) []*api.AgentServiceRegistration {
210+
c.mu.Lock()
211+
defer c.mu.Unlock()
212+
213+
var services []*api.AgentServiceRegistration
214+
for _, service := range c.services {
215+
if service.Name == name {
216+
services = append(services, service)
217+
}
218+
}
219+
return services
220+
}

command/agent/consul/client.go

+84-22
Original file line numberDiff line numberDiff line change
@@ -106,14 +106,56 @@ type ACLsAPI interface {
106106
TokenList(q *api.QueryOptions) ([]*api.ACLTokenListEntry, *api.QueryMeta, error)
107107
}
108108

109-
func agentServiceUpdateRequired(reg *api.AgentServiceRegistration, svc *api.AgentService) bool {
110-
return !(reg.Kind == svc.Kind &&
111-
reg.ID == svc.ID &&
112-
reg.Port == svc.Port &&
113-
reg.Address == svc.Address &&
114-
reg.Name == svc.Service &&
115-
reflect.DeepEqual(reg.Tags, svc.Tags) &&
116-
reflect.DeepEqual(reg.Meta, svc.Meta))
109+
// agentServiceUpdateRequired checks if any critical fields in Nomad's version
110+
// of a service definition are different from the existing service definition as
111+
// known by Consul.
112+
func agentServiceUpdateRequired(reason syncReason, wanted *api.AgentServiceRegistration, existing *api.AgentService) bool {
113+
switch reason {
114+
case syncPeriodic:
115+
// In a periodic sync with Consul, we need to respect the value of
116+
// the enable_tag_override field so that we maintain the illusion that the
117+
// user is in control of the Consul tags, as they may be externally edited
118+
// via the Consul catalog API (e.g. a user manually sets them).
119+
//
120+
// As Consul does by disabling anti-entropy for the tags field, Nomad will
121+
// ignore differences in the tags field during the periodic syncs with
122+
// the Consul agent API.
123+
//
124+
// We do so by over-writing the nomad service registration by the value
125+
// of the tags that Consul contains, if enable_tag_override = true.
126+
maybeTweakTags(wanted, existing)
127+
return different(wanted, existing)
128+
129+
default:
130+
// A non-periodic sync with Consul indicates an operation has been set
131+
// on the queue. This happens when service has been added / removed / modified
132+
// and implies the Consul agent should be sync'd with nomad, because
133+
// nomad is the ultimate source of truth for the service definition.
134+
return different(wanted, existing)
135+
}
136+
}
137+
138+
// maybeTweakTags will override wanted.Tags with a copy of existing.Tags only if
139+
// EnableTagOverride is true. Otherwise the wanted service registration is left
140+
// unchanged.
141+
func maybeTweakTags(wanted *api.AgentServiceRegistration, existing *api.AgentService) {
142+
if wanted.EnableTagOverride {
143+
wanted.Tags = helper.CopySliceString(existing.Tags)
144+
}
145+
}
146+
147+
// different compares the wanted state of the service registration with the actual
148+
// (cached) state of the service registration reported by Consul. If any of the
149+
// critical fields are not deeply equal, they considered different.
150+
func different(wanted *api.AgentServiceRegistration, existing *api.AgentService) bool {
151+
return !(wanted.Kind == existing.Kind &&
152+
wanted.ID == existing.ID &&
153+
wanted.Port == existing.Port &&
154+
wanted.Address == existing.Address &&
155+
wanted.Name == existing.Service &&
156+
wanted.EnableTagOverride == existing.EnableTagOverride &&
157+
reflect.DeepEqual(wanted.Meta, existing.Meta) &&
158+
reflect.DeepEqual(wanted.Tags, existing.Tags))
117159
}
118160

119161
// operations are submitted to the main loop via commit() for synchronizing
@@ -320,6 +362,18 @@ func (c *ServiceClient) hasSeen() bool {
320362
return atomic.LoadInt32(&c.seen) == seen
321363
}
322364

365+
// syncReason indicates why a sync operation with consul is about to happen.
366+
//
367+
// The trigger for a sync may have implications on the behavior of the sync itself.
368+
// In particular, if a service is defined with enable_tag_override=true
369+
type syncReason byte
370+
371+
const (
372+
syncPeriodic = iota
373+
syncShutdown
374+
syncNewOps
375+
)
376+
323377
// Run the Consul main loop which retries operations against Consul. It should
324378
// be called exactly once.
325379
func (c *ServiceClient) Run() {
@@ -357,16 +411,23 @@ INIT:
357411

358412
failures := 0
359413
for {
414+
// On every iteration take note of what the trigger for the next sync
415+
// was, so that it may be referenced during the sync itself.
416+
var reasonForSync syncReason
417+
360418
select {
361419
case <-retryTimer.C:
420+
reasonForSync = syncPeriodic
362421
case <-c.shutdownCh:
422+
reasonForSync = syncShutdown
363423
// Cancel check watcher but sync one last time
364424
cancel()
365425
case ops := <-c.opCh:
426+
reasonForSync = syncNewOps
366427
c.merge(ops)
367428
}
368429

369-
if err := c.sync(); err != nil {
430+
if err := c.sync(reasonForSync); err != nil {
370431
if failures == 0 {
371432
// Log on the first failure
372433
c.logger.Warn("failed to update services in Consul", "error", err)
@@ -460,7 +521,7 @@ func (c *ServiceClient) merge(ops *operations) {
460521
}
461522

462523
// sync enqueued operations.
463-
func (c *ServiceClient) sync() error {
524+
func (c *ServiceClient) sync(reason syncReason) error {
464525
sreg, creg, sdereg, cdereg := 0, 0, 0, 0
465526

466527
consulServices, err := c.client.Services()
@@ -518,20 +579,20 @@ func (c *ServiceClient) sync() error {
518579
}
519580

520581
// Add Nomad services missing from Consul, or where the service has been updated.
521-
for id, locals := range c.services {
582+
for id, local := range c.services {
522583
existingSvc, ok := consulServices[id]
523584

524585
if ok {
525586
// There is an existing registration of this service in Consul, so here
526587
// we validate to see if the service has been invalidated to see if it
527588
// should be updated.
528-
if !agentServiceUpdateRequired(locals, existingSvc) {
589+
if !agentServiceUpdateRequired(reason, local, existingSvc) {
529590
// No Need to update services that have not changed
530591
continue
531592
}
532593
}
533594

534-
if err = c.client.ServiceRegister(locals); err != nil {
595+
if err = c.client.ServiceRegister(local); err != nil {
535596
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
536597
return err
537598
}
@@ -746,13 +807,14 @@ func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, w
746807

747808
// Build the Consul Service registration request
748809
serviceReg := &api.AgentServiceRegistration{
749-
ID: id,
750-
Name: service.Name,
751-
Tags: tags,
752-
Address: ip,
753-
Port: port,
754-
Meta: meta,
755-
Connect: connect, // will be nil if no Connect stanza
810+
ID: id,
811+
Name: service.Name,
812+
Tags: tags,
813+
EnableTagOverride: service.EnableTagOverride,
814+
Address: ip,
815+
Port: port,
816+
Meta: meta,
817+
Connect: connect, // will be nil if no Connect stanza
756818
}
757819
ops.regServices = append(ops.regServices, serviceReg)
758820

@@ -868,8 +930,7 @@ func (c *ServiceClient) RegisterWorkload(workload *WorkloadServices) error {
868930
//
869931
// DriverNetwork must not change between invocations for the same allocation.
870932
func (c *ServiceClient) UpdateWorkload(old, newWorkload *WorkloadServices) error {
871-
ops := &operations{}
872-
933+
ops := new(operations)
873934
regs := new(ServiceRegistrations)
874935
regs.Services = make(map[string]*ServiceRegistration, len(newWorkload.Services))
875936

@@ -984,6 +1045,7 @@ func (c *ServiceClient) UpdateWorkload(old, newWorkload *WorkloadServices) error
9841045
}
9851046
}
9861047
}
1048+
9871049
return nil
9881050
}
9891051

0 commit comments

Comments
 (0)