Skip to content

Commit 65bfbe5

Browse files
committed
Hash fields used in task service IDs
Fixes #3620 Previously we concatenated tags into task service IDs. This could break deregistration of tag names that contained double //s like some Fabio tags. This change breaks service ID backward compatibility so on upgrade all users services and checks will be removed and re-added with new IDs. This change has the side effect of including all service fields in the ID's hash, so we no longer have to track PortLabel and AddressMode changes independently.
1 parent 7bf5cec commit 65bfbe5

File tree

5 files changed

+116
-90
lines changed

5 files changed

+116
-90
lines changed

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
## 0.7.1 (Unreleased)
22

33
__BACKWARDS INCOMPATIBILITIES:__
4+
* client: The format of service IDs in Consul has changed. If you rely upon
5+
Nomad's service IDs (*not* service names; those are stable), you will need
6+
to update your code. [GH-3632]
47
* config: Nomad no longer parses Atlas configuration stanzas. Atlas has been
58
deprecated since earlier this year. If you have an Atlas stanza in your
69
config file it will have to be removed.
@@ -57,6 +60,8 @@ BUG FIXES:
5760
explicitly [GH-3520]
5861
* cli: Fix passing Consul address via flags [GH-3504]
5962
* cli: Fix panic when running `keyring` commands [GH-3509]
63+
* client: Fix advertising services with tags that require URL escaping
64+
[GH-3632]
6065
* client: Fix a panic when restoring an allocation with a dead leader task
6166
[GH-3502]
6267
* client: Fix crash when following logs from a Windows node [GH-3608]

command/agent/consul/client.go

+90-70
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ package consul
22

33
import (
44
"context"
5+
"crypto/sha1"
6+
"encoding/base32"
57
"fmt"
8+
"io"
69
"log"
710
"net"
811
"net/url"
@@ -21,10 +24,14 @@ import (
2124
)
2225

2326
const (
24-
// nomadServicePrefix is the first prefix that scopes all Nomad registered
25-
// services
27+
// nomadServicePrefix is the prefix that scopes all Nomad registered
28+
// services (both agent and task entries).
2629
nomadServicePrefix = "_nomad"
2730

31+
// nomadTaskPrefix is the prefix that scopes Nomad registered services
32+
// for tasks.
33+
nomadTaskPrefix = nomadServicePrefix + "-task-"
34+
2835
// defaultRetryInterval is how quickly to retry syncing services and
2936
// checks to Consul when an error occurs. Will backoff up to a max.
3037
defaultRetryInterval = time.Second
@@ -288,8 +295,13 @@ func (c *ServiceClient) Run() {
288295

289296
if err := c.sync(); err != nil {
290297
if failures == 0 {
298+
// Log on the first failure
291299
c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err)
300+
} else if failures%10 == 0 {
301+
// Log every 10th consecutive failure
302+
c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err)
292303
}
304+
293305
failures++
294306
if !retryTimer.Stop() {
295307
// Timer already expired, since the timer may
@@ -389,38 +401,31 @@ func (c *ServiceClient) sync() error {
389401
// Not managed by Nomad, skip
390402
continue
391403
}
404+
392405
// Unknown Nomad managed service; kill
393406
if err := c.client.ServiceDeregister(id); err != nil {
407+
if isOldNomadService(id) {
408+
// Don't hard-fail on old entries. See #3620
409+
continue
410+
}
411+
394412
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
395413
return err
396414
}
397415
sdereg++
398416
metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
399417
}
400418

401-
// Track services whose ports have changed as their checks may also
402-
// need updating
403-
portsChanged := make(map[string]struct{}, len(c.services))
404-
405419
// Add Nomad services missing from Consul
406420
for id, locals := range c.services {
407-
if remotes, ok := consulServices[id]; ok {
408-
// Make sure Port and Address are stable since
409-
// PortLabel and AddressMode aren't included in the
410-
// service ID.
411-
if locals.Port == remotes.Port && locals.Address == remotes.Address {
412-
// Already exists in Consul; skip
413-
continue
421+
if _, ok := consulServices[id]; !ok {
422+
if err = c.client.ServiceRegister(locals); err != nil {
423+
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
424+
return err
414425
}
415-
// Port changed, reregister it and its checks
416-
portsChanged[id] = struct{}{}
417-
}
418-
if err = c.client.ServiceRegister(locals); err != nil {
419-
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
420-
return err
426+
sreg++
427+
metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
421428
}
422-
sreg++
423-
metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
424429
}
425430

426431
// Remove Nomad checks in Consul but unknown locally
@@ -433,8 +438,14 @@ func (c *ServiceClient) sync() error {
433438
// Service not managed by Nomad, skip
434439
continue
435440
}
436-
// Unknown Nomad managed check; kill
441+
442+
// Unknown Nomad managed check; remove
437443
if err := c.client.CheckDeregister(id); err != nil {
444+
if isOldNomadService(check.ServiceID) {
445+
// Don't hard-fail on old entries.
446+
continue
447+
}
448+
438449
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
439450
return err
440451
}
@@ -444,12 +455,11 @@ func (c *ServiceClient) sync() error {
444455

445456
// Add Nomad checks missing from Consul
446457
for id, check := range c.checks {
447-
if check, ok := consulChecks[id]; ok {
448-
if _, changed := portsChanged[check.ServiceID]; !changed {
449-
// Already in Consul and ports didn't change; skipping
450-
continue
451-
}
458+
if _, ok := consulChecks[id]; ok {
459+
// Already in Consul; skipping
460+
continue
452461
}
462+
453463
if err := c.client.CheckRegister(check); err != nil {
454464
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
455465
return err
@@ -751,22 +761,17 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta
751761
continue
752762
}
753763

764+
// Service exists and hasn't changed, don't re-add it later
765+
delete(newIDs, existingID)
766+
754767
// Service still exists so add it to the task's registration
755768
sreg := &ServiceRegistration{
756769
serviceID: existingID,
757770
checkIDs: make(map[string]struct{}, len(newSvc.Checks)),
758771
}
759772
taskReg.Services[existingID] = sreg
760773

761-
// PortLabel and AddressMode aren't included in the ID, so we
762-
// have to compare manually.
763-
serviceUnchanged := newSvc.PortLabel == existingSvc.PortLabel && newSvc.AddressMode == existingSvc.AddressMode
764-
if serviceUnchanged {
765-
// Service exists and hasn't changed, don't add it later
766-
delete(newIDs, existingID)
767-
}
768-
769-
// See what checks were updated
774+
// See if any checks were updated
770775
existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
771776
for _, check := range existingSvc.Checks {
772777
existingChecks[makeCheckID(existingID, check)] = check
@@ -779,17 +784,16 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta
779784
// Check exists, so don't remove it
780785
delete(existingChecks, checkID)
781786
sreg.checkIDs[checkID] = struct{}{}
782-
} else if serviceUnchanged {
783-
// New check on an unchanged service; add them now
784-
newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net)
785-
if err != nil {
786-
return err
787-
}
787+
}
788788

789-
for _, checkID := range newCheckIDs {
790-
sreg.checkIDs[checkID] = struct{}{}
789+
// New check on an unchanged service; add them now
790+
newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net)
791+
if err != nil {
792+
return err
793+
}
791794

792-
}
795+
for _, checkID := range newCheckIDs {
796+
sreg.checkIDs[checkID] = struct{}{}
793797

794798
}
795799

@@ -999,36 +1003,40 @@ func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
9991003
//
10001004
// Agent service IDs are of the form:
10011005
//
1002-
// {nomadServicePrefix}-{ROLE}-{Service.Name}-{Service.Tags...}
1003-
// Example Server ID: _nomad-server-nomad-serf
1004-
// Example Client ID: _nomad-client-nomad-client-http
1006+
// {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
1007+
// Example Server ID: _nomad-server-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4
1008+
// Example Client ID: _nomad-client-GGNJPGL7YN7RGMVXZILMPVRZZVRSZC7L
10051009
//
10061010
func makeAgentServiceID(role string, service *structs.Service) string {
1007-
parts := make([]string, len(service.Tags)+3)
1008-
parts[0] = nomadServicePrefix
1009-
parts[1] = role
1010-
parts[2] = service.Name
1011-
copy(parts[3:], service.Tags)
1012-
return strings.Join(parts, "-")
1011+
h := sha1.New()
1012+
io.WriteString(h, service.Name)
1013+
for _, tag := range service.Tags {
1014+
io.WriteString(h, tag)
1015+
}
1016+
b32 := base32.StdEncoding.EncodeToString(h.Sum(nil))
1017+
return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, b32)
10131018
}
10141019

10151020
// makeTaskServiceID creates a unique ID for identifying a task service in
1016-
// Consul.
1017-
//
1018-
// Task service IDs are of the form:
1019-
//
1020-
// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
1021-
// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
1021+
// Consul. All structs.Service fields are included in the ID's hash except
1022+
// Checks. This allows updates to merely compare IDs.
10221023
//
1024+
// Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
10231025
func makeTaskServiceID(allocID, taskName string, service *structs.Service) string {
1024-
parts := make([]string, len(service.Tags)+5)
1025-
parts[0] = nomadServicePrefix
1026-
parts[1] = "executor"
1027-
parts[2] = allocID
1028-
parts[3] = taskName
1029-
parts[4] = service.Name
1030-
copy(parts[5:], service.Tags)
1031-
return strings.Join(parts, "-")
1026+
h := sha1.New()
1027+
io.WriteString(h, allocID)
1028+
io.WriteString(h, taskName)
1029+
io.WriteString(h, service.Name)
1030+
io.WriteString(h, service.PortLabel)
1031+
io.WriteString(h, service.AddressMode)
1032+
for _, tag := range service.Tags {
1033+
io.WriteString(h, tag)
1034+
}
1035+
1036+
// Base32 is used for encoding the hash as sha1 hashes can always be
1037+
// encoded without padding, only 4 bytes larger than base64, and saves
1038+
// 8 bytes vs hex.
1039+
return nomadTaskPrefix + base32.StdEncoding.EncodeToString(h.Sum(nil))
10321040
}
10331041

10341042
// makeCheckID creates a unique ID for a check.
@@ -1084,9 +1092,21 @@ func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host
10841092
}
10851093

10861094
// isNomadService returns true if the ID matches the pattern of a Nomad managed
1087-
// service. Agent services return false as independent client and server agents
1088-
// may be running on the same machine. #2827
1095+
// service (new or old formats). Agent services return false as independent
1096+
// client and server agents may be running on the same machine. #2827
10891097
func isNomadService(id string) bool {
1098+
return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
1099+
}
1100+
1101+
// isOldNomadService returns true if the ID matches an old pattern managed by
1102+
// Nomad.
1103+
//
1104+
// Pre-0.7.1 task service IDs are of the form:
1105+
//
1106+
// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
1107+
// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
1108+
//
1109+
func isOldNomadService(id string) bool {
10901110
const prefix = nomadServicePrefix + "-executor"
10911111
return strings.HasPrefix(id, prefix)
10921112
}

command/agent/consul/int_test.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,12 @@ func TestConsul_Integration(t *testing.T) {
116116
{
117117
Name: "httpd2",
118118
PortLabel: "http",
119-
Tags: []string{"test", "http2"},
119+
Tags: []string{
120+
"test",
121+
// Use URL-unfriendly tags to test #3620
122+
"public-test.ettaviation.com:80/ redirect=302,https://test.ettaviation.com",
123+
"public-test.ettaviation.com:443/",
124+
},
120125
},
121126
}
122127

command/agent/consul/unit_test.go

+15-8
Original file line numberDiff line numberDiff line change
@@ -220,8 +220,8 @@ func TestConsul_ChangeTags(t *testing.T) {
220220
}
221221

222222
// TestConsul_ChangePorts asserts that changing the ports on a service updates
223-
// it in Consul. Since ports are not part of the service ID this is a slightly
224-
// different code path than changing tags.
223+
// it in Consul. Pre-0.7.1 ports were not part of the service ID and this was a
224+
// slightly different code path than changing tags.
225225
func TestConsul_ChangePorts(t *testing.T) {
226226
ctx := setupFake()
227227
ctx.Task.Services[0].Checks = []*structs.ServiceCheck{
@@ -349,8 +349,8 @@ func TestConsul_ChangePorts(t *testing.T) {
349349
}
350350

351351
for k, v := range ctx.FakeConsul.services {
352-
if k != origServiceKey {
353-
t.Errorf("unexpected key change; was: %q -- but found %q", origServiceKey, k)
352+
if k == origServiceKey {
353+
t.Errorf("expected key change; still: %q", k)
354354
}
355355
if v.Name != ctx.Task.Services[0].Name {
356356
t.Errorf("expected Name=%q != %q", ctx.Task.Services[0].Name, v.Name)
@@ -370,15 +370,15 @@ func TestConsul_ChangePorts(t *testing.T) {
370370
for k, v := range ctx.FakeConsul.checks {
371371
switch v.Name {
372372
case "c1":
373-
if k != origTCPKey {
374-
t.Errorf("unexpected key change for %s from %q to %q", v.Name, origTCPKey, k)
373+
if k == origTCPKey {
374+
t.Errorf("expected key change for %s from %q", v.Name, origTCPKey)
375375
}
376376
if expected := fmt.Sprintf(":%d", xPort); v.TCP != expected {
377377
t.Errorf("expected Port x=%v but found: %v", expected, v.TCP)
378378
}
379379
case "c2":
380-
if k != origScriptKey {
381-
t.Errorf("unexpected key change for %s from %q to %q", v.Name, origScriptKey, k)
380+
if k == origScriptKey {
381+
t.Errorf("expected key change for %s from %q", v.Name, origScriptKey)
382382
}
383383
select {
384384
case <-ctx.execs:
@@ -1383,9 +1383,16 @@ func TestIsNomadService(t *testing.T) {
13831383
}{
13841384
{"_nomad-client-nomad-client-http", false},
13851385
{"_nomad-server-nomad-serf", false},
1386+
1387+
// Pre-0.7.1 style IDs still match
13861388
{"_nomad-executor-abc", true},
13871389
{"_nomad-executor", true},
1390+
1391+
// Post-0.7.1 style IDs match
1392+
{"_nomad-task-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4", true},
1393+
13881394
{"not-nomad", false},
1395+
{"_nomad", false},
13891396
}
13901397

13911398
for _, test := range tests {

nomad/structs/structs.go

-11
Original file line numberDiff line numberDiff line change
@@ -3140,17 +3140,6 @@ func (s *Service) ValidateName(name string) error {
31403140
return nil
31413141
}
31423142

3143-
// Hash calculates the hash of the check based on it's content and the service
3144-
// which owns it
3145-
func (s *Service) Hash() string {
3146-
h := sha1.New()
3147-
io.WriteString(h, s.Name)
3148-
io.WriteString(h, strings.Join(s.Tags, ""))
3149-
io.WriteString(h, s.PortLabel)
3150-
io.WriteString(h, s.AddressMode)
3151-
return fmt.Sprintf("%x", h.Sum(nil))
3152-
}
3153-
31543143
const (
31553144
// DefaultKillTimeout is the default timeout between signaling a task it
31563145
// will be killed and killing it.

0 commit comments

Comments
 (0)