Skip to content

Commit

Permalink
gossip: reintroduce GossipClientsKey gossip
Browse files Browse the repository at this point in the history
In 2114678, we stopped gossiping the connectivity graph via
`GossipClientsKey`. However, this broke backwards compatibility with
`cockroach node status` which relied on this key to determine `is_live`
liveness.

This patch partially reverts that change, by reintroducing gossip of
these keys in mixed 22.1/22.2 clusters, but otherwise not using them for
anything. We stop gossiping them in finalized 22.2 clusters. This breaks
backwards compatibility with <22.2.3, we'll just have to live with that.

Epic: none
Release note (bug fix): Fixed a bug where `cockroach node status` could
incorrectly report nodes as `is_live = false` in mixed 22.1/22.2
clusters. The bug still exists between 22.2 patch versions before and
after 22.2.3.
  • Loading branch information
erikgrinaker committed May 24, 2023
1 parent 62a0f7d commit 5e935c7
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
47 changes: 47 additions & 0 deletions pkg/gossip/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ the system with minimal total hops. The algorithm is as follows:
package gossip

import (
"bytes"
"context"
"fmt"
"math"
Expand All @@ -59,6 +60,7 @@ import (

circuit "github.com/cockroachdb/circuitbreaker"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
Expand Down Expand Up @@ -105,6 +107,12 @@ const (
// efficiently targeted connection to the most distant node.
defaultCullInterval = 60 * time.Second

// defaultClientsInterval is the default interval for updating the gossip
// clients key which allows every node in the cluster to create a map of
// gossip connectivity. This value is intentionally small as we want to
// detect gossip partitions faster that the node liveness timeout (9s).
defaultClientsInterval = 2 * time.Second

// NodeDescriptorInterval is the interval for gossiping the node descriptor.
// Note that increasing this duration may increase the likelihood of gossip
// thrashing, since node descriptors are used to determine the number of gossip
Expand Down Expand Up @@ -821,6 +829,31 @@ func (g *Gossip) updateStoreMap(key string, content roachpb.Value) {
g.storeMap[desc.StoreID] = desc.Node.NodeID
}

func (g *Gossip) updateClients() {
nodeID := g.NodeID.Get()
if nodeID == 0 {
return
}

var buf bytes.Buffer
var sep string

g.mu.RLock()
g.clientsMu.Lock()
for _, c := range g.clientsMu.clients {
if c.peerID != 0 {
fmt.Fprintf(&buf, "%s%d", sep, c.peerID)
sep = ","
}
}
g.clientsMu.Unlock()
g.mu.RUnlock()

if err := g.AddInfo(MakeGossipClientsKey(nodeID), buf.Bytes(), 2*defaultClientsInterval); err != nil {
log.Errorf(g.AnnotateCtx(context.Background()), "%v", err)
}
}

// recomputeMaxPeersLocked recomputes max peers based on size of
// network and set the max sizes for incoming and outgoing node sets.
//
Expand Down Expand Up @@ -1282,11 +1315,13 @@ func (g *Gossip) bootstrap() {
func (g *Gossip) manage() {
ctx := g.AnnotateCtx(context.Background())
_ = g.server.stopper.RunAsyncTask(ctx, "gossip-manage", func(ctx context.Context) {
clientsTimer := timeutil.NewTimer()
cullTimer := timeutil.NewTimer()
stallTimer := timeutil.NewTimer()
defer cullTimer.Stop()
defer stallTimer.Stop()

clientsTimer.Reset(defaultClientsInterval)
cullTimer.Reset(jitteredInterval(g.cullInterval))
stallTimer.Reset(jitteredInterval(g.stallInterval))
for {
Expand All @@ -1297,6 +1332,18 @@ func (g *Gossip) manage() {
g.doDisconnected(c)
case <-g.tighten:
g.tightenNetwork(ctx)
case <-clientsTimer.C:
clientsTimer.Read = true
if g.rpcContext.Settings.Version.IsActive(ctx, clusterversion.V22_2) {
// 22.2.3 and later no longer uses GossipClientsKey, and gossiping it
// has a significant cost. We continue to gossip it in mixed 22.1/22.2
// clusters, but stop in finalized 22.2 clusters. This breaks
// crdb_internal.gossip_network and is_live in 'cockroach node status'
// with 22.2 before 22.2.3, we'll just have to live with that.
continue
}
g.updateClients()
clientsTimer.Reset(defaultClientsInterval)
case <-cullTimer.C:
cullTimer.Read = true
cullTimer.Reset(jitteredInterval(g.cullInterval))
Expand Down
10 changes: 10 additions & 0 deletions pkg/gossip/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ const (
// KeyDistSQLDrainingPrefix is the key prefix for each node's DistSQL
// draining state.
KeyDistSQLDrainingPrefix = "distsql-draining"

// KeyGossipClientsPrefix is the prefix for keys that indicate which gossip
// client connections a node has open. This is used by other nodes in the
// cluster to build a map of the gossip network.
KeyGossipClientsPrefix = "gossip-clients"
)

// MakeKey creates a canonical key under which to gossip a piece of
Expand Down Expand Up @@ -125,6 +130,11 @@ func DecodeNodeDescKey(key string, prefix string) (roachpb.NodeID, error) {
return roachpb.NodeID(nodeID), nil
}

// MakeGossipClientsKey returns the gossip client key for the given node.
func MakeGossipClientsKey(nodeID roachpb.NodeID) string {
return MakeKey(KeyGossipClientsPrefix, nodeID.String())
}

// MakeNodeHealthAlertKey returns the gossip key under which the given node can
// gossip health alerts.
func MakeNodeHealthAlertKey(nodeID roachpb.NodeID) string {
Expand Down

0 comments on commit 5e935c7

Please sign in to comment.