
Commit 6dc8f20

client: don't use Status RPC for Consul discovery (#16490) (#16567)
In #16217 we switched clients using Consul discovery to the `Status.Members` endpoint for getting the list of servers so that we're using the correct address. This endpoint has an authorization gate, so this fails if the anonymous policy doesn't have `node:read`. We also can't check the `AuthToken` for the request for the client secret, because the client hasn't yet registered so the server doesn't have anything to compare against.

Instead of hitting the `Status.Peers` or `Status.Members` RPC endpoint, use the Consul response directly. Update the `registerNode` method to handle the list of servers we get back in the response; if we get a "no servers" or "no path to region" response we'll kick off discovery again and retry immediately rather than waiting 15s.

Co-authored-by: Tim Gross <[email protected]>
1 parent e1174da commit 6dc8f20
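The core of the change is that the client no longer asks a Nomad server for its member list over the authorization-gated `Status.Members` RPC; it builds its server list straight from the addresses Consul already returned. The snippet below is a minimal, self-contained sketch of that idea, not the committed code: the `serversFromCatalog` helper and the standalone `main` are illustrative only, while the `CatalogService` fields used (`ServiceAddress`, `Address`, `ServicePort`) come from the Consul API client library.

```go
package main

import (
	"fmt"
	"net"
	"strconv"

	consulapi "github.com/hashicorp/consul/api"
)

// serversFromCatalog turns Consul catalog entries for the Nomad server
// service directly into resolved RPC addresses, with no Status.Members
// round trip (and therefore no ACL requirement on the anonymous policy).
// Hypothetical helper for illustration; not the function from the commit.
func serversFromCatalog(services []*consulapi.CatalogService) ([]*net.TCPAddr, error) {
	var addrs []*net.TCPAddr
	for _, s := range services {
		// Prefer the address registered for the service; fall back to
		// the Consul node address when it is empty.
		host := s.ServiceAddress
		if host == "" {
			host = s.Address
		}
		addr, err := net.ResolveTCPAddr("tcp",
			net.JoinHostPort(host, strconv.Itoa(s.ServicePort)))
		if err != nil {
			return nil, fmt.Errorf("resolving %q: %w", host, err)
		}
		addrs = append(addrs, addr)
	}
	return addrs, nil
}

func main() {
	// Fabricated catalog entries standing in for a real Consul response.
	services := []*consulapi.CatalogService{
		{Address: "10.0.0.10", ServiceAddress: "10.0.0.10", ServicePort: 4647},
		{Address: "10.0.0.11", ServicePort: 4647}, // no ServiceAddress set
	}
	addrs, err := serversFromCatalog(services)
	if err != nil {
		panic(err)
	}
	for _, a := range addrs {
		fmt.Println(a)
	}
}
```

Skipping the extra RPC matters here precisely because this is the unauthenticated bootstrap path: the client has not registered yet, so there is no secret the server could verify.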

File tree

2 files changed: +33 −45 lines


.changelog/16490.txt

+3 lines

@@ -0,0 +1,3 @@
+```release-note:bug
+client: Fixed a bug where clients using Consul discovery to join the cluster would get permission denied errors
+```

client/client.go

+30 −45 lines

@@ -1876,7 +1876,7 @@ func (c *Client) retryRegisterNode() {
 		}
 
 		retryIntv := registerRetryIntv
-		if err == noServersErr {
+		if err == noServersErr || structs.IsErrNoRegionPath(err) {
 			c.logger.Debug("registration waiting on servers")
 			c.triggerDiscovery()
 			retryIntv = noServerRetryIntv
@@ -1903,6 +1903,11 @@ func (c *Client) registerNode() error {
 		return err
 	}
 
+	err := c.handleNodeUpdateResponse(resp)
+	if err != nil {
+		return err
+	}
+
 	// Update the node status to ready after we register.
 	c.UpdateConfig(func(c *config.Config) {
 		c.Node.Status = structs.NodeStatusReady
@@ -1917,6 +1922,7 @@ func (c *Client) registerNode() error {
 	defer c.heartbeatLock.Unlock()
 	c.heartbeatStop.setLastOk(time.Now())
 	c.heartbeatTTL = resp.HeartbeatTTL
+
 	return nil
 }
 
@@ -1968,6 +1974,22 @@ func (c *Client) updateNodeStatus() error {
 		}
 	})
 
+	err := c.handleNodeUpdateResponse(resp)
+	if err != nil {
+		return fmt.Errorf("heartbeat response returned no valid servers")
+	}
+
+	// If there's no Leader in the response we may be talking to a partitioned
+	// server. Redo discovery to ensure our server list is up to date.
+	if resp.LeaderRPCAddr == "" {
+		c.triggerDiscovery()
+	}
+
+	c.EnterpriseClient.SetFeatures(resp.Features)
+	return nil
+}
+
+func (c *Client) handleNodeUpdateResponse(resp structs.NodeUpdateResponse) error {
 	// Update the number of nodes in the cluster so we can adjust our server
 	// rebalance rate.
 	c.servers.SetNumNodes(resp.NumNodes)
@@ -1984,20 +2006,9 @@ func (c *Client) updateNodeStatus() error {
 		nomadServers = append(nomadServers, e)
 	}
 	if len(nomadServers) == 0 {
-		return fmt.Errorf("heartbeat response returned no valid servers")
+		return noServersErr
 	}
 	c.servers.SetServers(nomadServers)
-
-	// Begin polling Consul if there is no Nomad leader. We could be
-	// heartbeating to a Nomad server that is in the minority of a
-	// partition of the Nomad server quorum, but this Nomad Agent still
-	// has connectivity to the existing majority of Nomad Servers, but
-	// only if it queries Consul.
-	if resp.LeaderRPCAddr == "" {
-		c.triggerDiscovery()
-	}
-
-	c.EnterpriseClient.SetFeatures(resp.Features)
 	return nil
 }
 
@@ -2839,14 +2850,6 @@ func (c *Client) consulDiscoveryImpl() error {
 		dcs = dcs[0:helper.Min(len(dcs), datacenterQueryLimit)]
 	}
 
-	// Query for servers in this client's region only
-	region := c.Region()
-	rpcargs := structs.GenericRequest{
-		QueryOptions: structs.QueryOptions{
-			Region: region,
-		},
-	}
-
 	serviceName := c.GetConfig().ConsulConfig.ServerServiceName
 	var mErr multierror.Error
 	var nomadServers servers.Servers
@@ -2877,32 +2880,14 @@ DISCOLOOP:
 				continue
 			}
 
-			// Query the members from the region that Consul gave us, and
-			// extract the client-advertise RPC address from each member
-			var membersResp structs.ServerMembersResponse
-			if err := c.connPool.RPC(region, addr, "Status.Members", rpcargs, &membersResp); err != nil {
-				mErr.Errors = append(mErr.Errors, err)
-				continue
-			}
-			for _, member := range membersResp.Members {
-				if addrTag, ok := member.Tags["rpc_addr"]; ok {
-					if portTag, ok := member.Tags["port"]; ok {
-						addr, err := net.ResolveTCPAddr("tcp",
-							fmt.Sprintf("%s:%s", addrTag, portTag))
-						if err != nil {
-							mErr.Errors = append(mErr.Errors, err)
-							continue
-						}
-						srv := &servers.Server{Addr: addr}
-						nomadServers = append(nomadServers, srv)
-					}
-				}
-			}
+			srv := &servers.Server{Addr: addr}
+			nomadServers = append(nomadServers, srv)
+		}
 
-			if len(nomadServers) > 0 {
-				break DISCOLOOP
-			}
+		if len(nomadServers) > 0 {
+			break DISCOLOOP
 		}
+
 	}
 	if len(nomadServers) == 0 {
 		if len(mErr.Errors) > 0 {
