Skip to content

Commit cd3dab9

Browse files
authored
Merge pull request #2009 from hashicorp/f-use-embedded-consul
Add a chaos test for consul syncer and fix some races it found
2 parents ec4a0d2 + 19fd195 commit cd3dab9

File tree

3 files changed

+307
-26
lines changed

3 files changed

+307
-26
lines changed

command/agent/consul/chaos_test.go

+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
// +build chaos
2+
3+
package consul
4+
5+
import (
6+
"fmt"
7+
"io/ioutil"
8+
"sort"
9+
"strings"
10+
"sync"
11+
"testing"
12+
"time"
13+
14+
"github.com/hashicorp/consul/testutil"
15+
"github.com/hashicorp/nomad/nomad/structs"
16+
"github.com/hashicorp/nomad/nomad/structs/config"
17+
)
18+
19+
func TestSyncerChaos(t *testing.T) {
20+
// Create an embedded Consul server
21+
testconsul := testutil.NewTestServerConfig(t, func(c *testutil.TestServerConfig) {
22+
// If -v wasn't specified squelch consul logging
23+
if !testing.Verbose() {
24+
c.Stdout = ioutil.Discard
25+
c.Stderr = ioutil.Discard
26+
}
27+
})
28+
defer testconsul.Stop()
29+
30+
// Configure Syncer to talk to the test server
31+
cconf := config.DefaultConsulConfig()
32+
cconf.Addr = testconsul.HTTPAddr
33+
34+
clientSyncer, err := NewSyncer(cconf, nil, logger)
35+
if err != nil {
36+
t.Fatalf("Error creating Syncer: %v", err)
37+
}
38+
defer clientSyncer.Shutdown()
39+
40+
execSyncer, err := NewSyncer(cconf, nil, logger)
41+
if err != nil {
42+
t.Fatalf("Error creating Syncer: %v", err)
43+
}
44+
defer execSyncer.Shutdown()
45+
46+
clientService := &structs.Service{Name: "nomad-client"}
47+
services := map[ServiceKey]*structs.Service{
48+
GenerateServiceKey(clientService): clientService,
49+
}
50+
if err := clientSyncer.SetServices("client", services); err != nil {
51+
t.Fatalf("error setting client service: %v", err)
52+
}
53+
54+
const execn = 100
55+
const reapern = 2
56+
errors := make(chan error, 100)
57+
wg := sync.WaitGroup{}
58+
59+
// Start goroutines to concurrently SetServices
60+
for i := 0; i < execn; i++ {
61+
wg.Add(1)
62+
go func(i int) {
63+
defer wg.Done()
64+
domain := ServiceDomain(fmt.Sprintf("exec-%d", i))
65+
services := map[ServiceKey]*structs.Service{}
66+
for ii := 0; ii < 10; ii++ {
67+
s := &structs.Service{Name: fmt.Sprintf("exec-%d-%d", i, ii)}
68+
services[GenerateServiceKey(s)] = s
69+
if err := execSyncer.SetServices(domain, services); err != nil {
70+
select {
71+
case errors <- err:
72+
default:
73+
}
74+
return
75+
}
76+
time.Sleep(1)
77+
}
78+
}(i)
79+
}
80+
81+
// SyncServices runs a timer started by Syncer.Run which we don't use
82+
// in this test, so run SyncServices concurrently
83+
wg.Add(1)
84+
go func() {
85+
defer wg.Done()
86+
for i := 0; i < execn; i++ {
87+
if err := execSyncer.SyncServices(); err != nil {
88+
select {
89+
case errors <- err:
90+
default:
91+
}
92+
return
93+
}
94+
time.Sleep(100)
95+
}
96+
}()
97+
98+
wg.Add(1)
99+
go func() {
100+
defer wg.Done()
101+
if err := clientSyncer.ReapUnmatched([]ServiceDomain{"nomad-client"}); err != nil {
102+
select {
103+
case errors <- err:
104+
default:
105+
}
106+
return
107+
}
108+
}()
109+
110+
// Reap all but exec-0-*
111+
wg.Add(1)
112+
go func() {
113+
defer wg.Done()
114+
for i := 0; i < execn; i++ {
115+
if err := execSyncer.ReapUnmatched([]ServiceDomain{"exec-0", ServiceDomain(fmt.Sprintf("exec-%d", i))}); err != nil {
116+
select {
117+
case errors <- err:
118+
default:
119+
}
120+
}
121+
time.Sleep(100)
122+
}
123+
}()
124+
125+
go func() {
126+
wg.Wait()
127+
close(errors)
128+
}()
129+
130+
for err := range errors {
131+
if err != nil {
132+
t.Errorf("error setting service from executor goroutine: %v", err)
133+
}
134+
}
135+
136+
// Do a final ReapUnmatched to get consul back into a deterministic state
137+
if err := execSyncer.ReapUnmatched([]ServiceDomain{"exec-0"}); err != nil {
138+
t.Fatalf("error doing final reap: %v", err)
139+
}
140+
141+
// flattenedServices should be fully populated as ReapUnmatched doesn't
142+
// touch Syncer's internal state
143+
expected := map[string]struct{}{}
144+
for i := 0; i < execn; i++ {
145+
for ii := 0; ii < 10; ii++ {
146+
expected[fmt.Sprintf("exec-%d-%d", i, ii)] = struct{}{}
147+
}
148+
}
149+
150+
for _, s := range execSyncer.flattenedServices() {
151+
_, ok := expected[s.Name]
152+
if !ok {
153+
t.Errorf("%s unexpected", s.Name)
154+
}
155+
delete(expected, s.Name)
156+
}
157+
if len(expected) > 0 {
158+
left := []string{}
159+
for s := range expected {
160+
left = append(left, s)
161+
}
162+
sort.Strings(left)
163+
t.Errorf("Couldn't find %d names in flattened services:\n%s", len(expected), strings.Join(left, "\n"))
164+
}
165+
166+
// All but exec-0 and possibly some of exec-99 should have been reaped
167+
{
168+
services, err := execSyncer.client.Agent().Services()
169+
if err != nil {
170+
t.Fatalf("Error getting services: %v", err)
171+
}
172+
expected := []int{}
173+
for k, service := range services {
174+
if service.Service == "consul" {
175+
continue
176+
}
177+
i := -1
178+
ii := -1
179+
fmt.Sscanf(service.Service, "exec-%d-%d", &i, &ii)
180+
switch {
181+
case i == -1 || ii == -1:
182+
t.Errorf("invalid service: %s -> %s", k, service.Service)
183+
case i != 0 || ii > 9:
184+
t.Errorf("unexpected service: %s -> %s", k, service.Service)
185+
default:
186+
expected = append(expected, ii)
187+
}
188+
}
189+
if len(expected) != 10 {
190+
t.Errorf("expected 0-9 but found: %#q", expected)
191+
}
192+
}
193+
}

command/agent/consul/syncer.go

+20-11
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ import (
3535
"time"
3636

3737
consul "github.com/hashicorp/consul/api"
38-
"github.com/hashicorp/consul/lib"
3938
"github.com/hashicorp/go-multierror"
4039

4140
"github.com/hashicorp/nomad/nomad/structs"
@@ -56,11 +55,11 @@ const (
5655
nomadServicePrefix = "_nomad"
5756

5857
// The periodic time interval for syncing services and checks with Consul
59-
syncInterval = 5 * time.Second
58+
defaultSyncInterval = 6 * time.Second
6059

61-
// syncJitter provides a little variance in the frequency at which
60+
// defaultSyncJitter provides a little variance in the frequency at which
6261
// Syncer polls Consul.
63-
syncJitter = 8
62+
defaultSyncJitter = time.Second
6463

6564
// ttlCheckBuffer is the time interval that Nomad can take to report Consul
6665
// the check result
@@ -144,6 +143,13 @@ type Syncer struct {
144143
periodicCallbacks map[string]types.PeriodicCallback
145144
notifySyncCh chan struct{}
146145
periodicLock sync.RWMutex
146+
147+
// The periodic time interval for syncing services and checks with Consul
148+
syncInterval time.Duration
149+
150+
// syncJitter provides a little variance in the frequency at which
151+
// Syncer polls Consul.
152+
syncJitter time.Duration
147153
}
148154

149155
// NewSyncer returns a new consul.Syncer
@@ -168,8 +174,11 @@ func NewSyncer(consulConfig *config.ConsulConfig, shutdownCh chan struct{}, logg
168174
checkGroups: make(map[ServiceDomain]map[ServiceKey][]*consul.AgentCheckRegistration),
169175
checkRunners: make(map[consulCheckID]*CheckRunner),
170176
periodicCallbacks: make(map[string]types.PeriodicCallback),
177+
notifySyncCh: make(chan struct{}, 1),
171178
// default noop implementation of addrFinder
172-
addrFinder: func(string) (string, int) { return "", 0 },
179+
addrFinder: func(string) (string, int) { return "", 0 },
180+
syncInterval: defaultSyncInterval,
181+
syncJitter: defaultSyncJitter,
173182
}
174183

175184
return &consulSyncer, nil
@@ -809,7 +818,7 @@ func (c *Syncer) Run() {
809818
for {
810819
select {
811820
case <-sync.C:
812-
d := syncInterval - lib.RandomStagger(syncInterval/syncJitter)
821+
d := c.syncInterval - c.syncJitter
813822
sync.Reset(d)
814823

815824
if err := c.SyncServices(); err != nil {
@@ -824,7 +833,7 @@ func (c *Syncer) Run() {
824833
c.consulAvailable = true
825834
}
826835
case <-c.notifySyncCh:
827-
sync.Reset(syncInterval)
836+
sync.Reset(0)
828837
case <-c.shutdownCh:
829838
c.Shutdown()
830839
case <-c.notifyShutdownCh:
@@ -872,8 +881,8 @@ func (c *Syncer) SyncServices() error {
872881
// the syncer
873882
func (c *Syncer) filterConsulServices(consulServices map[string]*consul.AgentService) map[consulServiceID]*consul.AgentService {
874883
localServices := make(map[consulServiceID]*consul.AgentService, len(consulServices))
875-
c.registryLock.RLock()
876-
defer c.registryLock.RUnlock()
884+
c.groupsLock.RLock()
885+
defer c.groupsLock.RUnlock()
877886
for serviceID, service := range consulServices {
878887
for domain := range c.servicesGroups {
879888
if strings.HasPrefix(service.ID, fmt.Sprintf("%s-%s", nomadServicePrefix, domain)) {
@@ -889,8 +898,8 @@ func (c *Syncer) filterConsulServices(consulServices map[string]*consul.AgentSer
889898
// services with Syncer's idPrefix.
890899
func (c *Syncer) filterConsulChecks(consulChecks map[string]*consul.AgentCheck) map[consulCheckID]*consul.AgentCheck {
891900
localChecks := make(map[consulCheckID]*consul.AgentCheck, len(consulChecks))
892-
c.registryLock.RLock()
893-
defer c.registryLock.RUnlock()
901+
c.groupsLock.RLock()
902+
defer c.groupsLock.RUnlock()
894903
for checkID, check := range consulChecks {
895904
for domain := range c.checkGroups {
896905
if strings.HasPrefix(check.ServiceID, fmt.Sprintf("%s-%s", nomadServicePrefix, domain)) {

0 commit comments

Comments
 (0)