Skip to content

Commit

Permalink
gossip: reintroduce GossipClientsKey gossip
Browse files Browse the repository at this point in the history
In 2114678, we stopped gossiping the connectivity graph via
`GossipClientsKey`. However, this broke backwards compatibility with
`cockroach node status` which relied on this key to determine `is_live`
liveness.

This patch partially reverts that change, by reintroducing gossip of
these keys in mixed 22.1/22.2 clusters, but otherwise not using them for
anything. We stop gossiping them in finalized 22.2 clusters, since the
cost of gossiping these is significant in large clusters (which
motivated their removal in the first place). This breaks backwards
compatibility with <22.2.3, we'll just have to live with that.

Epic: none
Release note (bug fix): Fixed a bug where `cockroach node status` could
incorrectly report nodes as `is_live = false` in mixed 22.1/22.2
clusters. The bug still exists between 22.2 patch versions before and
after 22.2.3.
  • Loading branch information
erikgrinaker committed May 25, 2023
1 parent 62a0f7d commit a6ee652
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/gossip/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/base",
"//pkg/clusterversion",
"//pkg/config",
"//pkg/config/zonepb",
"//pkg/roachpb",
Expand Down
47 changes: 47 additions & 0 deletions pkg/gossip/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ the system with minimal total hops. The algorithm is as follows:
package gossip

import (
"bytes"
"context"
"fmt"
"math"
Expand All @@ -59,6 +60,7 @@ import (

circuit "github.com/cockroachdb/circuitbreaker"
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
Expand Down Expand Up @@ -105,6 +107,12 @@ const (
// efficiently targeted connection to the most distant node.
defaultCullInterval = 60 * time.Second

// defaultClientsInterval is the default interval for updating the gossip
// clients key which allows every node in the cluster to create a map of
// gossip connectivity. This value is intentionally small as we want to
// detect gossip partitions faster that the node liveness timeout (9s).
defaultClientsInterval = 2 * time.Second

// NodeDescriptorInterval is the interval for gossiping the node descriptor.
// Note that increasing this duration may increase the likelihood of gossip
// thrashing, since node descriptors are used to determine the number of gossip
Expand Down Expand Up @@ -821,6 +829,31 @@ func (g *Gossip) updateStoreMap(key string, content roachpb.Value) {
g.storeMap[desc.StoreID] = desc.Node.NodeID
}

func (g *Gossip) updateClients() {
nodeID := g.NodeID.Get()
if nodeID == 0 {
return
}

var buf bytes.Buffer
var sep string

g.mu.RLock()
g.clientsMu.Lock()
for _, c := range g.clientsMu.clients {
if c.peerID != 0 {
fmt.Fprintf(&buf, "%s%d", sep, c.peerID)
sep = ","
}
}
g.clientsMu.Unlock()
g.mu.RUnlock()

if err := g.AddInfo(MakeGossipClientsKey(nodeID), buf.Bytes(), 2*defaultClientsInterval); err != nil {
log.Errorf(g.AnnotateCtx(context.Background()), "%v", err)
}
}

// recomputeMaxPeersLocked recomputes max peers based on size of
// network and set the max sizes for incoming and outgoing node sets.
//
Expand Down Expand Up @@ -1282,11 +1315,13 @@ func (g *Gossip) bootstrap() {
func (g *Gossip) manage() {
ctx := g.AnnotateCtx(context.Background())
_ = g.server.stopper.RunAsyncTask(ctx, "gossip-manage", func(ctx context.Context) {
clientsTimer := timeutil.NewTimer()
cullTimer := timeutil.NewTimer()
stallTimer := timeutil.NewTimer()
defer cullTimer.Stop()
defer stallTimer.Stop()

clientsTimer.Reset(defaultClientsInterval)
cullTimer.Reset(jitteredInterval(g.cullInterval))
stallTimer.Reset(jitteredInterval(g.stallInterval))
for {
Expand All @@ -1297,6 +1332,18 @@ func (g *Gossip) manage() {
g.doDisconnected(c)
case <-g.tighten:
g.tightenNetwork(ctx)
case <-clientsTimer.C:
clientsTimer.Read = true
if g.rpcContext.Settings.Version.IsActive(ctx, clusterversion.V22_2) {
// 22.2.3 and later no longer uses GossipClientsKey, and gossiping it
// has a significant cost. We continue to gossip it in mixed 22.1/22.2
// clusters, but stop in finalized 22.2 clusters. This breaks
// crdb_internal.gossip_network and is_live in 'cockroach node status'
// with 22.2 before 22.2.3, we'll just have to live with that.
continue
}
g.updateClients()
clientsTimer.Reset(defaultClientsInterval)
case <-cullTimer.C:
cullTimer.Read = true
cullTimer.Reset(jitteredInterval(g.cullInterval))
Expand Down
10 changes: 10 additions & 0 deletions pkg/gossip/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ const (
// KeyDistSQLDrainingPrefix is the key prefix for each node's DistSQL
// draining state.
KeyDistSQLDrainingPrefix = "distsql-draining"

// KeyGossipClientsPrefix is the prefix for keys that indicate which gossip
// client connections a node has open. This is used by other nodes in the
// cluster to build a map of the gossip network.
KeyGossipClientsPrefix = "gossip-clients"
)

// MakeKey creates a canonical key under which to gossip a piece of
Expand Down Expand Up @@ -125,6 +130,11 @@ func DecodeNodeDescKey(key string, prefix string) (roachpb.NodeID, error) {
return roachpb.NodeID(nodeID), nil
}

// MakeGossipClientsKey returns the gossip client key for the given node.
func MakeGossipClientsKey(nodeID roachpb.NodeID) string {
return MakeKey(KeyGossipClientsPrefix, nodeID.String())
}

// MakeNodeHealthAlertKey returns the gossip key under which the given node can
// gossip health alerts.
func MakeNodeHealthAlertKey(nodeID roachpb.NodeID) string {
Expand Down

0 comments on commit a6ee652

Please sign in to comment.