Skip to content

Commit 08d3cfc

Browse files
committed
client: don't use Status RPC for Consul discovery (#16490)
In #16217 we switched clients using Consul discovery to the `Status.Members` endpoint for getting the list of servers so that we're using the correct address. This endpoint has an authorization gate, so this fails if the anonymous policy doesn't have `node:read`. We also can't check the `AuthToken` for the request for the client secret, because the client hasn't yet registered so the server doesn't have anything to compare against. Instead of hitting the `Status.Peers` or `Status.Members` RPC endpoint, use the Consul response directly. Update the `registerNode` method to handle the list of servers we get back in the response; if we get a "no servers" or "no path to region" response we'll kick off discovery again and retry immediately rather than waiting 15s.
1 parent 372ca8b commit 08d3cfc

File tree

2 files changed

+33
-45
lines changed

2 files changed

+33
-45
lines changed

.changelog/16490.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:bug
2+
client: Fixed a bug where clients using Consul discovery to join the cluster would get permission denied errors
3+
```

client/client.go

+30-45
Original file line numberDiff line numberDiff line change
@@ -1862,7 +1862,7 @@ func (c *Client) retryRegisterNode() {
18621862
}
18631863

18641864
retryIntv := registerRetryIntv
1865-
if err == noServersErr {
1865+
if err == noServersErr || structs.IsErrNoRegionPath(err) {
18661866
c.logger.Debug("registration waiting on servers")
18671867
c.triggerDiscovery()
18681868
retryIntv = noServerRetryIntv
@@ -1889,6 +1889,11 @@ func (c *Client) registerNode() error {
18891889
return err
18901890
}
18911891

1892+
err := c.handleNodeUpdateResponse(resp)
1893+
if err != nil {
1894+
return err
1895+
}
1896+
18921897
// Update the node status to ready after we register.
18931898
c.UpdateConfig(func(c *config.Config) {
18941899
c.Node.Status = structs.NodeStatusReady
@@ -1903,6 +1908,7 @@ func (c *Client) registerNode() error {
19031908
defer c.heartbeatLock.Unlock()
19041909
c.heartbeatStop.setLastOk(time.Now())
19051910
c.heartbeatTTL = resp.HeartbeatTTL
1911+
19061912
return nil
19071913
}
19081914

@@ -1954,6 +1960,22 @@ func (c *Client) updateNodeStatus() error {
19541960
}
19551961
})
19561962

1963+
err := c.handleNodeUpdateResponse(resp)
1964+
if err != nil {
1965+
return fmt.Errorf("heartbeat response returned no valid servers")
1966+
}
1967+
1968+
// If there's no Leader in the response we may be talking to a partitioned
1969+
// server. Redo discovery to ensure our server list is up to date.
1970+
if resp.LeaderRPCAddr == "" {
1971+
c.triggerDiscovery()
1972+
}
1973+
1974+
c.EnterpriseClient.SetFeatures(resp.Features)
1975+
return nil
1976+
}
1977+
1978+
func (c *Client) handleNodeUpdateResponse(resp structs.NodeUpdateResponse) error {
19571979
// Update the number of nodes in the cluster so we can adjust our server
19581980
// rebalance rate.
19591981
c.servers.SetNumNodes(resp.NumNodes)
@@ -1970,20 +1992,9 @@ func (c *Client) updateNodeStatus() error {
19701992
nomadServers = append(nomadServers, e)
19711993
}
19721994
if len(nomadServers) == 0 {
1973-
return fmt.Errorf("heartbeat response returned no valid servers")
1995+
return noServersErr
19741996
}
19751997
c.servers.SetServers(nomadServers)
1976-
1977-
// Begin polling Consul if there is no Nomad leader. We could be
1978-
// heartbeating to a Nomad server that is in the minority of a
1979-
// partition of the Nomad server quorum, but this Nomad Agent still
1980-
// has connectivity to the existing majority of Nomad Servers, but
1981-
// only if it queries Consul.
1982-
if resp.LeaderRPCAddr == "" {
1983-
c.triggerDiscovery()
1984-
}
1985-
1986-
c.EnterpriseClient.SetFeatures(resp.Features)
19871998
return nil
19881999
}
19892000

@@ -2821,14 +2832,6 @@ func (c *Client) consulDiscoveryImpl() error {
28212832
dcs = dcs[0:helper.Min(len(dcs), datacenterQueryLimit)]
28222833
}
28232834

2824-
// Query for servers in this client's region only
2825-
region := c.Region()
2826-
rpcargs := structs.GenericRequest{
2827-
QueryOptions: structs.QueryOptions{
2828-
Region: region,
2829-
},
2830-
}
2831-
28322835
serviceName := c.GetConfig().ConsulConfig.ServerServiceName
28332836
var mErr multierror.Error
28342837
var nomadServers servers.Servers
@@ -2859,32 +2862,14 @@ DISCOLOOP:
28592862
continue
28602863
}
28612864

2862-
// Query the members from the region that Consul gave us, and
2863-
// extract the client-advertise RPC address from each member
2864-
var membersResp structs.ServerMembersResponse
2865-
if err := c.connPool.RPC(region, addr, "Status.Members", rpcargs, &membersResp); err != nil {
2866-
mErr.Errors = append(mErr.Errors, err)
2867-
continue
2868-
}
2869-
for _, member := range membersResp.Members {
2870-
if addrTag, ok := member.Tags["rpc_addr"]; ok {
2871-
if portTag, ok := member.Tags["port"]; ok {
2872-
addr, err := net.ResolveTCPAddr("tcp",
2873-
fmt.Sprintf("%s:%s", addrTag, portTag))
2874-
if err != nil {
2875-
mErr.Errors = append(mErr.Errors, err)
2876-
continue
2877-
}
2878-
srv := &servers.Server{Addr: addr}
2879-
nomadServers = append(nomadServers, srv)
2880-
}
2881-
}
2882-
}
2865+
srv := &servers.Server{Addr: addr}
2866+
nomadServers = append(nomadServers, srv)
2867+
}
28832868

2884-
if len(nomadServers) > 0 {
2885-
break DISCOLOOP
2886-
}
2869+
if len(nomadServers) > 0 {
2870+
break DISCOLOOP
28872871
}
2872+
28882873
}
28892874
if len(nomadServers) == 0 {
28902875
if len(mErr.Errors) > 0 {

0 commit comments

Comments
 (0)