From 2114678be6d1ac279ec1fed3420fb635cc255476 Mon Sep 17 00:00:00 2001 From: Alex Robinson Date: Mon, 26 Sep 2022 22:45:44 -0500 Subject: [PATCH] gossip: remove frequent gossiping of gossip client connections These gossip-clients keys make up two thirds or more of the gossip traffic in various large clusters I've inspected, consuming almost an entire CPU core in the worst case I've seen. They don't provide enough value to justify that sort of ongoing cost, so this commit removes them entirely as well as the periodic logging of the gossip network and the crdb_internal.gossip_network table, both of which relied on them. Release note (backward-incompatible change): We've stopped supporting/populating the crdb_internal.gossip_network table. It was an internal table with no API guarantees (so perhaps no meriting a release note?). Release note (performance improvement): Significantly reduced CPU usage of the underlying gossip network in large clusters. --- pkg/cli/debug.go | 2 - pkg/cli/testdata/zip/partial1 | 5 - pkg/cli/testdata/zip/partial1_excluded | 2 - pkg/cli/testdata/zip/partial2 | 2 - pkg/cli/testdata/zip/testzip | 1 - pkg/cli/testdata/zip/testzip_concurrent | 9 -- pkg/cli/testdata/zip/testzip_tenant | 3 - pkg/cli/testdata/zip/unavailable | 1 - pkg/cli/zip_table_registry.go | 6 -- pkg/cli/zip_test.go | 1 + pkg/cmd/roachtest/tests/cli.go | 2 +- pkg/cmd/roachtest/tests/gossip.go | 64 +++++------- pkg/gossip/client.go | 1 - pkg/gossip/gossip.go | 123 +----------------------- pkg/gossip/gossip_test.go | 2 +- pkg/gossip/keys.go | 10 -- pkg/sql/crdb_internal.go | 30 ++---- 17 files changed, 40 insertions(+), 224 deletions(-) diff --git a/pkg/cli/debug.go b/pkg/cli/debug.go index 060e02934ff9..a39b54ae70d9 100644 --- a/pkg/cli/debug.go +++ b/pkg/cli/debug.go @@ -1069,8 +1069,6 @@ func parseGossipValues(gossipInfo *gossip.InfoStatus) (string, error) { return "", errors.Wrapf(err, "failed to parse value for key %q", key) } output = append(output, fmt.Sprintf("%q: %+v", key, drainingInfo)) - } else if strings.HasPrefix(key, gossip.KeyGossipClientsPrefix) { - output = append(output, fmt.Sprintf("%q: %v", key, string(bytes))) } } diff --git a/pkg/cli/testdata/zip/partial1 b/pkg/cli/testdata/zip/partial1 index f93018abf1c5..b18bf3648319 100644 --- a/pkg/cli/testdata/zip/partial1 +++ b/pkg/cli/testdata/zip/partial1 @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done @@ -179,9 +178,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [node 2] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt... [node 2] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: dial tcp ... [node 2] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/2/crdb_internal.gossip_liveness.txt.err.txt... done -[node 2] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/2/crdb_internal.gossip_network.txt... -[node 2] retrieving SQL data for crdb_internal.gossip_network: last request failed: dial tcp ... -[node 2] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/2/crdb_internal.gossip_network.txt.err.txt... done [node 2] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt... [node 2] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: dial tcp ... [node 2] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/2/crdb_internal.gossip_nodes.txt.err.txt... done @@ -263,7 +259,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done -[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done [node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done [node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/partial1_excluded b/pkg/cli/testdata/zip/partial1_excluded index d882b90e92a2..0654e9d7d724 100644 --- a/pkg/cli/testdata/zip/partial1_excluded +++ b/pkg/cli/testdata/zip/partial1_excluded @@ -78,7 +78,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done @@ -172,7 +171,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 [node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done -[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done [node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done [node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/partial2 b/pkg/cli/testdata/zip/partial2 index 38c20451217f..421a4dd75551 100644 --- a/pkg/cli/testdata/zip/partial2 +++ b/pkg/cli/testdata/zip/partial2 @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done @@ -171,7 +170,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null [node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done -[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done [node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done [node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/testzip b/pkg/cli/testdata/zip/testzip index 7a8d72cb1ca5..a38ab6229695 100644 --- a/pkg/cli/testdata/zip/testzip +++ b/pkg/cli/testdata/zip/testzip @@ -81,7 +81,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/testzip_concurrent b/pkg/cli/testdata/zip/testzip_concurrent index dbf587a74982..fb49a3865560 100644 --- a/pkg/cli/testdata/zip/testzip_concurrent +++ b/pkg/cli/testdata/zip/testzip_concurrent @@ -292,9 +292,6 @@ zip [node 1] retrieving SQL data for crdb_internal.gossip_liveness... [node 1] retrieving SQL data for crdb_internal.gossip_liveness: done [node 1] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... -[node 1] retrieving SQL data for crdb_internal.gossip_network... -[node 1] retrieving SQL data for crdb_internal.gossip_network: done -[node 1] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/1/crdb_internal.gossip_network.txt... [node 1] retrieving SQL data for crdb_internal.gossip_nodes... [node 1] retrieving SQL data for crdb_internal.gossip_nodes: done [node 1] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... @@ -622,9 +619,6 @@ zip [node 2] retrieving SQL data for crdb_internal.gossip_liveness... [node 2] retrieving SQL data for crdb_internal.gossip_liveness: done [node 2] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt... -[node 2] retrieving SQL data for crdb_internal.gossip_network... -[node 2] retrieving SQL data for crdb_internal.gossip_network: done -[node 2] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/2/crdb_internal.gossip_network.txt... [node 2] retrieving SQL data for crdb_internal.gossip_nodes... [node 2] retrieving SQL data for crdb_internal.gossip_nodes: done [node 2] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt... @@ -952,9 +946,6 @@ zip [node 3] retrieving SQL data for crdb_internal.gossip_liveness... [node 3] retrieving SQL data for crdb_internal.gossip_liveness: done [node 3] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... -[node 3] retrieving SQL data for crdb_internal.gossip_network... -[node 3] retrieving SQL data for crdb_internal.gossip_network: done -[node 3] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/3/crdb_internal.gossip_network.txt... [node 3] retrieving SQL data for crdb_internal.gossip_nodes... [node 3] retrieving SQL data for crdb_internal.gossip_nodes: done [node 3] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... diff --git a/pkg/cli/testdata/zip/testzip_tenant b/pkg/cli/testdata/zip/testzip_tenant index ceab965fade4..7ca3c80c608d 100644 --- a/pkg/cli/testdata/zip/testzip_tenant +++ b/pkg/cli/testdata/zip/testzip_tenant @@ -110,9 +110,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... [node 1] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000) [node 1] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/1/crdb_internal.gossip_liveness.txt.err.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... -[node 1] retrieving SQL data for crdb_internal.gossip_network: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000) -[node 1] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/1/crdb_internal.gossip_network.txt.err.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... [node 1] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000) [node 1] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/1/crdb_internal.gossip_nodes.txt.err.txt... done diff --git a/pkg/cli/testdata/zip/unavailable b/pkg/cli/testdata/zip/unavailable index f48e3c425dab..a25c97f0c941 100644 --- a/pkg/cli/testdata/zip/unavailable +++ b/pkg/cli/testdata/zip/unavailable @@ -90,7 +90,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null --timeout=.5s [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/zip_table_registry.go b/pkg/cli/zip_table_registry.go index 631402fff76c..8bb5c171f224 100644 --- a/pkg/cli/zip_table_registry.go +++ b/pkg/cli/zip_table_registry.go @@ -533,12 +533,6 @@ var zipInternalTablesPerNode = DebugZipTableRegistry{ "updated_at", }, }, - "crdb_internal.gossip_network": { - nonSensitiveCols: NonSensitiveColumns{ - "source_id", - "target_id", - }, - }, "crdb_internal.gossip_nodes": { // `cluster_name` is hashed as we only care to see whether values are // identical across nodes. diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go index eb743b1098ec..513ac0edd491 100644 --- a/pkg/cli/zip_test.go +++ b/pkg/cli/zip_test.go @@ -76,6 +76,7 @@ table_name NOT IN ( 'cross_db_references', 'databases', 'forward_dependencies', + 'gossip_network', 'index_columns', 'kv_catalog_comments', 'kv_catalog_descriptor', diff --git a/pkg/cmd/roachtest/tests/cli.go b/pkg/cmd/roachtest/tests/cli.go index a76ebdd54eb3..7bd12b7b5ba6 100644 --- a/pkg/cmd/roachtest/tests/cli.go +++ b/pkg/cmd/roachtest/tests/cli.go @@ -109,7 +109,7 @@ func runCLINodeStatus(ctx context.Context, t test.Test, c cluster.Cluster) { c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(3)) waitUntil([]string{ "is_available is_live", - "false true", + "false false", "false false", "false false", }) diff --git a/pkg/cmd/roachtest/tests/gossip.go b/pkg/cmd/roachtest/tests/gossip.go index eccf93bce211..cf8bc936a1f6 100644 --- a/pkg/cmd/roachtest/tests/gossip.go +++ b/pkg/cmd/roachtest/tests/gossip.go @@ -21,7 +21,6 @@ import ( "strconv" "strings" "time" - "unicode" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" @@ -47,52 +46,36 @@ func registerGossip(r registry.Registry) { err := WaitFor3XReplication(ctx, t, c.Conn(ctx, t.L(), 1)) require.NoError(t, err) - // TODO(irfansharif): We could also look at gossip_liveness to determine - // cluster membership as seen by each gossip module, and ensure each - // node's gossip excludes the dead node and includes all other live - // ones. - gossipNetworkAccordingTo := func(node int) (network string) { + gossipNetworkAccordingTo := func(node int) (nodes []int) { + // Expiration timestamp format: .,. We'll + // capture just the portion. const query = ` -SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',') - FROM (SELECT * FROM crdb_internal.gossip_network ORDER BY source_id, target_id) +SELECT node_id + FROM (SELECT node_id, to_timestamp(split_part(split_part(expiration, ',', 1), '.', 1)::FLOAT8) AS expiration + FROM crdb_internal.gossip_liveness) + WHERE expiration > now(); ` db := c.Conn(ctx, t.L(), node) defer db.Close() - var s gosql.NullString - if err = db.QueryRow(query).Scan(&s); err != nil { - t.Fatal(err) - } - if s.Valid { - return s.String - } - return "" - } - nodesInNetworkAccordingTo := func(node int) (nodes []int, network string) { - split := func(c rune) bool { - return !unicode.IsNumber(c) + rows, err := db.Query(query) + if err != nil { + t.Fatal(err) } - uniqueNodes := make(map[int]struct{}) - network = gossipNetworkAccordingTo(node) - for _, idStr := range strings.FieldsFunc(network, split) { - nodeID, err := strconv.Atoi(idStr) - if err != nil { - t.Fatal(err) - } - uniqueNodes[nodeID] = struct{}{} - } - for node := range uniqueNodes { - nodes = append(nodes, node) + for rows.Next() { + var nodeID int + require.NoError(t, rows.Scan(&nodeID)) + require.NotZero(t, nodeID) + nodes = append(nodes, nodeID) } sort.Ints(nodes) - return nodes, network + return nodes } gossipOK := func(start time.Time, deadNode int) bool { var expLiveNodes []int - var expGossipNetwork string for i := 1; i <= c.Spec().NodeCount; i++ { if elapsed := timeutil.Since(start); elapsed >= 20*time.Second { @@ -104,36 +87,35 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',') } t.L().Printf("%d: checking gossip\n", i) - liveNodes, gossipNetwork := nodesInNetworkAccordingTo(i) + liveNodes := gossipNetworkAccordingTo(i) for _, id := range liveNodes { if id == deadNode { - t.L().Printf("%d: gossip not ok (dead node %d present): %s (%.0fs)\n", - i, deadNode, gossipNetwork, timeutil.Since(start).Seconds()) + t.L().Printf("%d: gossip not ok (dead node %d present) (%.0fs)\n", + i, deadNode, timeutil.Since(start).Seconds()) return false } } if len(expLiveNodes) == 0 { expLiveNodes = liveNodes - expGossipNetwork = gossipNetwork continue } if len(liveNodes) != len(expLiveNodes) { - t.L().Printf("%d: gossip not ok (mismatched size of network: %s); expected %d, got %d (%.0fs)\n", - i, gossipNetwork, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds()) + t.L().Printf("%d: gossip not ok (mismatched size of network); expected %d, got %d (%.0fs)\n", + i, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds()) return false } for i := range liveNodes { if liveNodes[i] != expLiveNodes[i] { t.L().Printf("%d: gossip not ok (mismatched view of live nodes); expected %s, got %s (%.0fs)\n", - i, gossipNetwork, expLiveNodes, liveNodes, timeutil.Since(start).Seconds()) + i, expLiveNodes, liveNodes, timeutil.Since(start).Seconds()) return false } } } - t.L().Printf("gossip ok: %s (size: %d) (%0.0fs)\n", expGossipNetwork, len(expLiveNodes), timeutil.Since(start).Seconds()) + t.L().Printf("gossip ok (size: %d) (%0.0fs)\n", len(expLiveNodes), timeutil.Since(start).Seconds()) return true } diff --git a/pkg/gossip/client.go b/pkg/gossip/client.go index 6ffb4c5acf2b..a279c00a0860 100644 --- a/pkg/gossip/client.go +++ b/pkg/gossip/client.go @@ -333,7 +333,6 @@ func (c *client) gossip( } if peerID == 0 && c.peerID != 0 { peerID = c.peerID - g.updateClients() } } }() diff --git a/pkg/gossip/gossip.go b/pkg/gossip/gossip.go index 624808418c90..20bc448f3e43 100644 --- a/pkg/gossip/gossip.go +++ b/pkg/gossip/gossip.go @@ -47,14 +47,11 @@ the system with minimal total hops. The algorithm is as follows: package gossip import ( - "bytes" "context" "fmt" "math" "math/rand" "net" - "sort" - "strconv" "strings" "sync" "time" @@ -108,12 +105,6 @@ const ( // efficiently targeted connection to the most distant node. defaultCullInterval = 60 * time.Second - // defaultClientsInterval is the default interval for updating the gossip - // clients key which allows every node in the cluster to create a map of - // gossip connectivity. This value is intentionally small as we want to - // detect gossip partitions faster that the node liveness timeout (9s). - defaultClientsInterval = 2 * time.Second - // NodeDescriptorInterval is the interval for gossiping the node descriptor. // Note that increasing this duration may increase the likelihood of gossip // thrashing, since node descriptors are used to determine the number of gossip @@ -280,8 +271,6 @@ type Gossip struct { locality roachpb.Locality - lastConnectivity redact.RedactableString - defaultZoneConfig *zonepb.ZoneConfig } @@ -340,7 +329,6 @@ func New( // Add ourselves as a node descriptor watcher. g.mu.is.registerCallback(MakePrefixPattern(KeyNodeDescPrefix), g.updateNodeAddress) g.mu.is.registerCallback(MakePrefixPattern(KeyStoreDescPrefix), g.updateStoreMap) - // Log gossip connectivity whenever we receive an update. g.mu.Unlock() if grpcServer != nil { @@ -415,7 +403,6 @@ func (g *Gossip) SetNodeDescriptor(desc *roachpb.NodeDescriptor) error { if err := g.AddInfoProto(MakeNodeIDKey(desc.NodeID), desc, NodeDescriptorTTL); err != nil { return errors.Wrapf(err, "n%d: couldn't gossip descriptor", desc.NodeID) } - g.updateClients() return nil } @@ -596,17 +583,10 @@ func (g *Gossip) LogStatus() { } g.mu.RUnlock() - var connectivity redact.RedactableString - if s := redact.Sprint(g.Connectivity()); s != g.lastConnectivity { - g.lastConnectivity = s - connectivity = s - } - ctx := g.AnnotateCtx(context.TODO()) - log.Health.Infof(ctx, "gossip status (%s, %d node%s)\n%s%s%s", + log.Health.Infof(ctx, "gossip status (%s, %d node%s)\n%s%s", status, n, util.Pluralize(int64(n)), - g.clientStatus(), g.server.status(), - connectivity) + g.clientStatus(), g.server.status()) } func (g *Gossip) clientStatus() ClientStatus { @@ -632,63 +612,6 @@ func (g *Gossip) clientStatus() ClientStatus { return status } -// Connectivity returns the current view of the gossip network as seen by this -// node. -func (g *Gossip) Connectivity() Connectivity { - ctx := g.AnnotateCtx(context.TODO()) - var c Connectivity - - g.mu.RLock() - - if i := g.mu.is.getInfo(KeySentinel); i != nil { - c.SentinelNodeID = i.NodeID - } - - g.nodeDescs.Range(func(nodeID int64, _ unsafe.Pointer) bool { - key := MakeGossipClientsKey(roachpb.NodeID(nodeID)) - i := g.mu.is.getInfo(key) - if i == nil { - return true - } - - v, err := i.Value.GetBytes() - if err != nil { - log.Errorf(ctx, "unable to retrieve gossip value for %s: %v", key, err) - return true - } - if len(v) == 0 { - return true - } - - for _, part := range strings.Split(string(v), ",") { - id, err := strconv.ParseInt(part, 10 /* base */, 64 /* bitSize */) - if err != nil { - log.Errorf(ctx, "unable to parse node ID: %v", err) - } - c.ClientConns = append(c.ClientConns, Connectivity_Conn{ - SourceID: roachpb.NodeID(nodeID), - TargetID: roachpb.NodeID(id), - }) - } - return true - }) - - g.mu.RUnlock() - - sort.Slice(c.ClientConns, func(i, j int) bool { - a, b := &c.ClientConns[i], &c.ClientConns[j] - if a.SourceID < b.SourceID { - return true - } - if a.SourceID > b.SourceID { - return false - } - return a.TargetID < b.TargetID - }) - - return c -} - // EnableSimulationCycler is for TESTING PURPOSES ONLY. It sets a // condition variable which is signaled at each cycle of the // simulation via SimulationCycle(). The gossip server makes each @@ -919,31 +842,6 @@ func (g *Gossip) updateStoreMap(key string, content roachpb.Value) { g.storeDescs.Store(int64(desc.StoreID), unsafe.Pointer(&desc)) } -func (g *Gossip) updateClients() { - nodeID := g.NodeID.Get() - if nodeID == 0 { - return - } - - var buf bytes.Buffer - var sep string - - g.mu.RLock() - g.clientsMu.Lock() - for _, c := range g.clientsMu.clients { - if c.peerID != 0 { - fmt.Fprintf(&buf, "%s%d", sep, c.peerID) - sep = "," - } - } - g.clientsMu.Unlock() - g.mu.RUnlock() - - if err := g.AddInfo(MakeGossipClientsKey(nodeID), buf.Bytes(), 2*defaultClientsInterval); err != nil { - log.Errorf(g.AnnotateCtx(context.Background()), "%v", err) - } -} - // recomputeMaxPeersLocked recomputes max peers based on size of // network and set the max sizes for incoming and outgoing node sets. // @@ -1149,15 +1047,13 @@ func (g *Gossip) InfoOriginatedHere(key string) bool { func (g *Gossip) GetInfoStatus() InfoStatus { clientStatus := g.clientStatus() serverStatus := g.server.status() - connectivity := g.Connectivity() g.mu.RLock() defer g.mu.RUnlock() is := InfoStatus{ - Infos: make(map[string]Info), - Client: clientStatus, - Server: serverStatus, - Connectivity: connectivity, + Infos: make(map[string]Info), + Client: clientStatus, + Server: serverStatus, } for k, v := range g.mu.is.Infos { is.Infos[k] = *protoutil.Clone(v).(*Info) @@ -1407,14 +1303,11 @@ func (g *Gossip) bootstrap() { func (g *Gossip) manage() { ctx := g.AnnotateCtx(context.Background()) _ = g.server.stopper.RunAsyncTask(ctx, "gossip-manage", func(ctx context.Context) { - clientsTimer := timeutil.NewTimer() cullTimer := timeutil.NewTimer() stallTimer := timeutil.NewTimer() - defer clientsTimer.Stop() defer cullTimer.Stop() defer stallTimer.Stop() - clientsTimer.Reset(defaultClientsInterval) cullTimer.Reset(jitteredInterval(g.cullInterval)) stallTimer.Reset(jitteredInterval(g.stallInterval)) for { @@ -1425,10 +1318,6 @@ func (g *Gossip) manage() { g.doDisconnected(c) case <-g.tighten: g.tightenNetwork(ctx) - case <-clientsTimer.C: - clientsTimer.Read = true - g.updateClients() - clientsTimer.Reset(defaultClientsInterval) case <-cullTimer.C: cullTimer.Read = true cullTimer.Reset(jitteredInterval(g.cullInterval)) @@ -1512,8 +1401,6 @@ func (g *Gossip) tightenNetwork(ctx context.Context) { } func (g *Gossip) doDisconnected(c *client) { - defer g.updateClients() - g.mu.Lock() defer g.mu.Unlock() g.removeClientLocked(c) diff --git a/pkg/gossip/gossip_test.go b/pkg/gossip/gossip_test.go index 51734f5458ce..92a6ebd6ad0e 100644 --- a/pkg/gossip/gossip_test.go +++ b/pkg/gossip/gossip_test.go @@ -817,7 +817,7 @@ func TestGossipPropagation(t *testing.T) { getInfo := func(g *Gossip, key string) *Info { g.mu.RLock() defer g.mu.RUnlock() - return g.mu.is.Infos[key] + return g.mu.is.getInfo(key) } var localInfo *Info diff --git a/pkg/gossip/keys.go b/pkg/gossip/keys.go index 34a2a5a53300..6b80da82b28d 100644 --- a/pkg/gossip/keys.go +++ b/pkg/gossip/keys.go @@ -83,11 +83,6 @@ const ( // KeyDistSQLDrainingPrefix is the key prefix for each node's DistSQL // draining state. KeyDistSQLDrainingPrefix = "distsql-draining" - - // KeyGossipClientsPrefix is the prefix for keys that indicate which gossip - // client connections a node has open. This is used by other nodes in the - // cluster to build a map of the gossip network. - KeyGossipClientsPrefix = "gossip-clients" ) // MakeKey creates a canonical key under which to gossip a piece of @@ -130,11 +125,6 @@ func DecodeNodeDescKey(key string, prefix string) (roachpb.NodeID, error) { return roachpb.NodeID(nodeID), nil } -// MakeGossipClientsKey returns the gossip client key for the given node. -func MakeGossipClientsKey(nodeID roachpb.NodeID) string { - return MakeKey(KeyGossipClientsPrefix, nodeID.String()) -} - // MakeNodeHealthAlertKey returns the gossip key under which the given node can // gossip health alerts. func MakeNodeHealthAlertKey(nodeID roachpb.NodeID) string { diff --git a/pkg/sql/crdb_internal.go b/pkg/sql/crdb_internal.go index 54b9d02ddf67..0d1abf4a4bb4 100644 --- a/pkg/sql/crdb_internal.go +++ b/pkg/sql/crdb_internal.go @@ -57,6 +57,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/parser" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/protoreflect" "github.com/cockroachdb/cockroach/pkg/sql/roleoption" @@ -4099,9 +4100,13 @@ CREATE TABLE crdb_internal.gossip_nodes ( } alive := make(map[roachpb.NodeID]tree.DBool) + now := timeutil.Now() for _, d := range descriptors { - if _, err := g.GetInfo(gossip.MakeGossipClientsKey(d.NodeID)); err == nil { - alive[d.NodeID] = true + var gossipLiveness livenesspb.Liveness + if err := g.GetInfoProto(gossip.MakeNodeLivenessKey(d.NodeID), &gossipLiveness); err == nil { + if now.Before(gossipLiveness.Expiration.ToTimestamp().GoTime()) { + alive[d.NodeID] = true + } } } @@ -4400,26 +4405,9 @@ CREATE TABLE crdb_internal.gossip_network ( source_id INT NOT NULL, -- source node of a gossip connection target_id INT NOT NULL -- target node of a gossip connection ) - `, +`, populate: func(ctx context.Context, p *planner, _ catalog.DatabaseDescriptor, addRow func(...tree.Datum) error) error { - if err := p.RequireAdminRole(ctx, "read crdb_internal.gossip_network"); err != nil { - return err - } - - g, err := p.ExecCfg().Gossip.OptionalErr(47899) - if err != nil { - return err - } - - c := g.Connectivity() - for _, conn := range c.ClientConns { - if err := addRow( - tree.NewDInt(tree.DInt(conn.SourceID)), - tree.NewDInt(tree.DInt(conn.TargetID)), - ); err != nil { - return err - } - } + p.BufferClientNotice(ctx, pgnotice.Newf("This table is no longer supported/populated, and will be removed in a future version.")) return nil }, }