From 65c00a82ffa567665bda5384ee52ef35818d9833 Mon Sep 17 00:00:00 2001
From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com>
Date: Fri, 24 Jan 2025 11:17:12 +0100
Subject: [PATCH 1/7] Retry waiting for tablets during vtgate initialization if
they timeout.
Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com>
---
go/vt/vtgate/tabletgateway.go | 3 +--
go/vt/vtgate/vtgate.go | 14 ++++++++++++--
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/go/vt/vtgate/tabletgateway.go b/go/vt/vtgate/tabletgateway.go
index c36c6981fa2..48cd37b28fe 100644
--- a/go/vt/vtgate/tabletgateway.go
+++ b/go/vt/vtgate/tabletgateway.go
@@ -207,9 +207,8 @@ func (gw *TabletGateway) WaitForTablets(ctx context.Context, tabletTypesToWait [
case context.DeadlineExceeded:
// In this scenario, we were able to reach the
// topology service, but some tablets may not be
- // ready. We just warn and keep going.
+ // ready.
log.Warningf("Timeout waiting for all keyspaces / shards to have healthy tablets of types %v, may be in degraded mode", tabletTypesToWait)
- err = nil
}
}()
diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go
index 8b8302d77d4..298f683be16 100644
--- a/go/vt/vtgate/vtgate.go
+++ b/go/vt/vtgate/vtgate.go
@@ -295,8 +295,18 @@ func Init(
// TabletGateway can create it's own healthcheck
gw := NewTabletGateway(ctx, hc, serv, cell)
gw.RegisterStats()
- if err := gw.WaitForTablets(ctx, tabletTypesToWait); err != nil {
- log.Fatalf("tabletGateway.WaitForTablets failed: %v", err)
+
+ // Retry loop for potential time-outs waiting for all tablets.
+ for {
+ err := gw.WaitForTablets(ctx, tabletTypesToWait)
+ switch {
+ case errors.Is(err, context.DeadlineExceeded):
+ log.Warning("TabletGateway timed out waiting for tablets to become available - retrying.")
+
+ continue
+ default:
+ log.Fatalf("tabletGateway.WaitForTablets failed: %v", err)
+ }
}
dynamicConfig := NewDynamicViperConfig()
From abb53cb4610af9f129cf16ef3cf410a3ea37d1d9 Mon Sep 17 00:00:00 2001
From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com>
Date: Fri, 24 Jan 2025 11:17:34 +0100
Subject: [PATCH 2/7] Retry waiting for tablets during vtgate initialization if
they timeout.
Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com>
---
go/vt/vtgate/vtgate.go | 3 +++
1 file changed, 3 insertions(+)
diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go
index 298f683be16..6d297d38fa8 100644
--- a/go/vt/vtgate/vtgate.go
+++ b/go/vt/vtgate/vtgate.go
@@ -297,9 +297,12 @@ func Init(
gw.RegisterStats()
// Retry loop for potential time-outs waiting for all tablets.
+OuterLoop:
for {
err := gw.WaitForTablets(ctx, tabletTypesToWait)
switch {
+ case err == nil:
+ break OuterLoop
case errors.Is(err, context.DeadlineExceeded):
log.Warning("TabletGateway timed out waiting for tablets to become available - retrying.")
From c59a6426f68032dcf86e347b534399d4706d551e Mon Sep 17 00:00:00 2001
From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com>
Date: Mon, 3 Feb 2025 14:00:18 +0100
Subject: [PATCH 3/7] Fix region_example test
The test needed an extra tablet because the vtgate start-up script is waiting
for primary and replica, but we were only provisioning primary.
Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com>
---
examples/region_sharding/101_initial_cluster.sh | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/examples/region_sharding/101_initial_cluster.sh b/examples/region_sharding/101_initial_cluster.sh
index 6dd8989a32f..aec4f546d0e 100755
--- a/examples/region_sharding/101_initial_cluster.sh
+++ b/examples/region_sharding/101_initial_cluster.sh
@@ -30,8 +30,10 @@ fi
CELL=zone1 ../common/scripts/vtctld-up.sh
# start unsharded keyspace and tablet
-CELL=zone1 TABLET_UID=100 ../common/scripts/mysqlctl-up.sh
-SHARD=0 CELL=zone1 KEYSPACE=main TABLET_UID=100 ../common/scripts/vttablet-up.sh
+for T_UID in 100 101; do
+ CELL=zone1 TABLET_UID="${T_UID}" ../common/scripts/mysqlctl-up.sh
+ SHARD=0 CELL=zone1 KEYSPACE=main TABLET_UID="${T_UID}" ../common/scripts/vttablet-up.sh
+done
# set the correct durability policy for the keyspace
vtctldclient --server localhost:15999 SetKeyspaceDurabilityPolicy --durability-policy=none main || fail "Failed to set keyspace durability policy on the main keyspace"
From bc1d135da2fce6391c4a617266b5d7d6042837d9 Mon Sep 17 00:00:00 2001
From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com>
Date: Mon, 3 Feb 2025 15:46:14 +0100
Subject: [PATCH 4/7] Add logging to debug CI failures
Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com>
---
go/test/endtoend/cluster/cluster_process.go | 2 ++
go/vt/discovery/healthcheck.go | 3 ++-
go/vt/discovery/topology_watcher.go | 7 +++++++
3 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go
index ed61f7fd0c9..61d7145db86 100644
--- a/go/test/endtoend/cluster/cluster_process.go
+++ b/go/test/endtoend/cluster/cluster_process.go
@@ -513,6 +513,8 @@ func (cluster *LocalProcessCluster) AddShard(keyspaceName string, shardName stri
for _, proc := range mysqlctlProcessList {
if err := proc.Wait(); err != nil {
log.Errorf("unable to start mysql process %v: %v", proc, err)
+ log.Errorf("Stdout: %s", proc.Stdout)
+ log.Errorf("Stderr: %s", proc.Stderr)
return nil, err
}
}
diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go
index 2f270bd7518..d8e16f03d81 100644
--- a/go/vt/discovery/healthcheck.go
+++ b/go/vt/discovery/healthcheck.go
@@ -395,6 +395,7 @@ func NewHealthCheck(ctx context.Context, retryDelay, healthCheckTimeout time.Dur
func (hc *HealthCheckImpl) AddTablet(tablet *topodata.Tablet) {
// check whether grpc port is present on tablet, if not return
if tablet.PortMap["grpc"] == 0 {
+ log.Infof("grpc port missing for tablet, not adding: %v", tablet.PortMap)
return
}
@@ -831,7 +832,7 @@ func (hc *HealthCheckImpl) waitForTablets(ctx context.Context, targets []*query.
timer.Stop()
for _, target := range targets {
if target != nil {
- log.Infof("couldn't find tablets for target: %v", target)
+ log.Infof("couldn't find tablets for target: %v - Require serving is %+v", target, requireServing)
}
}
return ctx.Err()
diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go
index d1e358e1aa5..8df02d4fc27 100644
--- a/go/vt/discovery/topology_watcher.go
+++ b/go/vt/discovery/topology_watcher.go
@@ -163,6 +163,7 @@ func (tw *TopologyWatcher) loadTablets() {
defer tw.mu.Unlock()
for _, tInfo := range tabletInfos {
+ log.Infof("Got tabletInfo for %+v, isInServingGraph is %v", tInfo, tInfo.IsInServingGraph())
aliasStr := topoproto.TabletAliasString(tInfo.Alias)
tabletAliasStrs = append(tabletAliasStrs, aliasStr)
@@ -170,6 +171,7 @@ func (tw *TopologyWatcher) loadTablets() {
// We already have a tabletInfo for this and the flag tells us to not refresh.
if val, ok := tw.tablets[aliasStr]; ok {
newTablets[aliasStr] = val
+ log.Infof("Tablet Info already known and not refreshing for %+v", *tInfo)
continue
}
}
@@ -192,17 +194,21 @@ func (tw *TopologyWatcher) loadTablets() {
}
}
+ log.Infof("New tablets %+v", newTablets)
for alias, newVal := range newTablets {
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) {
+ log.Infof("Tablet %s filtered out", alias)
continue
}
// Trust the alias from topo and add it if it doesn't exist.
if val, ok := tw.tablets[alias]; ok {
+ log.Infof("Tablet %s already in tw.tablets", alias)
// check if the host and port have changed. If yes, replace tablet.
oldKey := TabletToMapKey(val.tablet)
newKey := TabletToMapKey(newVal.tablet)
if oldKey != newKey {
+ log.Infof("Tablet key mismatch: old %s new %s, replacing in healthcheck", oldKey, newKey)
// This is the case where the same tablet alias is now reporting
// a different address (host:port) key.
tw.healthcheck.ReplaceTablet(val.tablet, newVal.tablet)
@@ -210,6 +216,7 @@ func (tw *TopologyWatcher) loadTablets() {
}
} else {
// This is a new tablet record, let's add it to the HealthCheck.
+ log.Infof("Tablet %s not in tw.tablets, adding to healthcheck", alias)
tw.healthcheck.AddTablet(newVal.tablet)
topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1)
}
From 97e8f3e9f5078e753a40c4afa988c2ade21cd80b Mon Sep 17 00:00:00 2001
From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com>
Date: Wed, 5 Feb 2025 10:08:01 +0100
Subject: [PATCH 5/7] Revert "Add logging to debug CI failures"
This reverts commit bc1d135da2fce6391c4a617266b5d7d6042837d9.
Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com>
---
go/test/endtoend/cluster/cluster_process.go | 2 --
go/vt/discovery/healthcheck.go | 3 +--
go/vt/discovery/topology_watcher.go | 7 -------
3 files changed, 1 insertion(+), 11 deletions(-)
diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go
index 61d7145db86..ed61f7fd0c9 100644
--- a/go/test/endtoend/cluster/cluster_process.go
+++ b/go/test/endtoend/cluster/cluster_process.go
@@ -513,8 +513,6 @@ func (cluster *LocalProcessCluster) AddShard(keyspaceName string, shardName stri
for _, proc := range mysqlctlProcessList {
if err := proc.Wait(); err != nil {
log.Errorf("unable to start mysql process %v: %v", proc, err)
- log.Errorf("Stdout: %s", proc.Stdout)
- log.Errorf("Stderr: %s", proc.Stderr)
return nil, err
}
}
diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go
index d8e16f03d81..2f270bd7518 100644
--- a/go/vt/discovery/healthcheck.go
+++ b/go/vt/discovery/healthcheck.go
@@ -395,7 +395,6 @@ func NewHealthCheck(ctx context.Context, retryDelay, healthCheckTimeout time.Dur
func (hc *HealthCheckImpl) AddTablet(tablet *topodata.Tablet) {
// check whether grpc port is present on tablet, if not return
if tablet.PortMap["grpc"] == 0 {
- log.Infof("grpc port missing for tablet, not adding: %v", tablet.PortMap)
return
}
@@ -832,7 +831,7 @@ func (hc *HealthCheckImpl) waitForTablets(ctx context.Context, targets []*query.
timer.Stop()
for _, target := range targets {
if target != nil {
- log.Infof("couldn't find tablets for target: %v - Require serving is %+v", target, requireServing)
+ log.Infof("couldn't find tablets for target: %v", target)
}
}
return ctx.Err()
diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go
index 8df02d4fc27..d1e358e1aa5 100644
--- a/go/vt/discovery/topology_watcher.go
+++ b/go/vt/discovery/topology_watcher.go
@@ -163,7 +163,6 @@ func (tw *TopologyWatcher) loadTablets() {
defer tw.mu.Unlock()
for _, tInfo := range tabletInfos {
- log.Infof("Got tabletInfo for %+v, isInServingGraph is %v", tInfo, tInfo.IsInServingGraph())
aliasStr := topoproto.TabletAliasString(tInfo.Alias)
tabletAliasStrs = append(tabletAliasStrs, aliasStr)
@@ -171,7 +170,6 @@ func (tw *TopologyWatcher) loadTablets() {
// We already have a tabletInfo for this and the flag tells us to not refresh.
if val, ok := tw.tablets[aliasStr]; ok {
newTablets[aliasStr] = val
- log.Infof("Tablet Info already known and not refreshing for %+v", *tInfo)
continue
}
}
@@ -194,21 +192,17 @@ func (tw *TopologyWatcher) loadTablets() {
}
}
- log.Infof("New tablets %+v", newTablets)
for alias, newVal := range newTablets {
if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) {
- log.Infof("Tablet %s filtered out", alias)
continue
}
// Trust the alias from topo and add it if it doesn't exist.
if val, ok := tw.tablets[alias]; ok {
- log.Infof("Tablet %s already in tw.tablets", alias)
// check if the host and port have changed. If yes, replace tablet.
oldKey := TabletToMapKey(val.tablet)
newKey := TabletToMapKey(newVal.tablet)
if oldKey != newKey {
- log.Infof("Tablet key mismatch: old %s new %s, replacing in healthcheck", oldKey, newKey)
// This is the case where the same tablet alias is now reporting
// a different address (host:port) key.
tw.healthcheck.ReplaceTablet(val.tablet, newVal.tablet)
@@ -216,7 +210,6 @@ func (tw *TopologyWatcher) loadTablets() {
}
} else {
// This is a new tablet record, let's add it to the HealthCheck.
- log.Infof("Tablet %s not in tw.tablets, adding to healthcheck", alias)
tw.healthcheck.AddTablet(newVal.tablet)
topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1)
}
From f96ba9fc2589acbd903a394d6f91da6839213ed9 Mon Sep 17 00:00:00 2001
From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com>
Date: Wed, 5 Feb 2025 10:10:23 +0100
Subject: [PATCH 6/7] Remove schematracker test TestLoadKeyspaceWithNoTablet
This test is no longer applicable as the vtgate won't start without tablets of the typs specified in --tablet_types_to_wait
Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com>
---
.../loadkeyspace/schema_load_keyspace_test.go | 66 -------------------
1 file changed, 66 deletions(-)
diff --git a/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go b/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go
index ec201487887..1b816c2af3e 100644
--- a/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go
+++ b/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go
@@ -19,85 +19,19 @@ package loadkeyspace
import (
"os"
"path"
- "strings"
"testing"
- "time"
"github.com/stretchr/testify/require"
- "vitess.io/vitess/go/test/endtoend/utils"
-
"vitess.io/vitess/go/test/endtoend/cluster"
)
var (
clusterInstance *cluster.LocalProcessCluster
hostname = "localhost"
- keyspaceName = "ks"
cell = "zone1"
- sqlSchema = `
- create table vt_user (
- id bigint,
- name varchar(64),
- primary key (id)
- ) Engine=InnoDB;
-
- create table main (
- id bigint,
- val varchar(128),
- primary key(id)
- ) Engine=InnoDB;
-
- create table test_table (
- id bigint,
- val varchar(128),
- primary key(id)
- ) Engine=InnoDB;
-`
)
-func TestLoadKeyspaceWithNoTablet(t *testing.T) {
- var err error
-
- clusterInstance = cluster.NewCluster(cell, hostname)
- defer clusterInstance.Teardown()
-
- // Start topo server
- err = clusterInstance.StartTopo()
- require.NoError(t, err)
-
- // create keyspace
- keyspace := &cluster.Keyspace{
- Name: keyspaceName,
- SchemaSQL: sqlSchema,
- }
- clusterInstance.VtTabletExtraArgs = append(clusterInstance.VtTabletExtraArgs, "--queryserver-config-schema-change-signal")
- err = clusterInstance.StartUnshardedKeyspace(*keyspace, 0, false)
- require.NoError(t, err)
-
- // teardown vttablets
- for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets {
- err = vttablet.VttabletProcess.TearDown()
- require.NoError(t, err)
- utils.TimeoutAction(t, 1*time.Minute, "timeout - teardown of VTTablet", func() bool {
- return vttablet.VttabletProcess.GetStatus() == ""
- })
- }
-
- // Start vtgate with the schema_change_signal flag
- clusterInstance.VtGateExtraArgs = append(clusterInstance.VtGateExtraArgs, "--schema_change_signal")
- err = clusterInstance.StartVtgate()
- require.NoError(t, err)
-
- // After starting VTGate we need to leave enough time for resolveAndLoadKeyspace to reach
- // the schema tracking timeout (5 seconds).
- utils.TimeoutAction(t, 5*time.Minute, "timeout - could not find 'Unable to get initial schema reload' in 'vtgate-stderr.txt'", func() bool {
- logDir := clusterInstance.VtgateProcess.LogDir
- all, _ := os.ReadFile(path.Join(logDir, "vtgate-stderr.txt"))
- return strings.Contains(string(all), "Unable to get initial schema reload")
- })
-}
-
func TestNoInitialKeyspace(t *testing.T) {
var err error
From dbbf8792aae667e4721e8def22a3a32c01fee46c Mon Sep 17 00:00:00 2001
From: Eduardo Ortega <5791035+ejortegau@users.noreply.github.com>
Date: Wed, 12 Feb 2025 12:26:50 +0100
Subject: [PATCH 7/7] Add change to release notes.
Signed-off-by: Eduardo Ortega <5791035+ejortegau@users.noreply.github.com>
---
changelog/22.0/22.0.0/summary.md | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/changelog/22.0/22.0.0/summary.md b/changelog/22.0/22.0.0/summary.md
index f773df2b10c..8137b8e3769 100644
--- a/changelog/22.0/22.0.0/summary.md
+++ b/changelog/22.0/22.0.0/summary.md
@@ -22,6 +22,8 @@
- **[Topology read concurrency behaviour changes](#topo-read-concurrency-changes)**
- **[VTAdmin](#vtadmin)**
- [Updated to node v22.13.1](#updated-node)
+ - **[VTGate Flags](#vtgate-flags)**
+ - [vtgate behavior for `--tablet_types_to_wait`](#vtgate-init-tabletgw-wait-tablet-type)
## Major Changes
@@ -167,3 +169,12 @@ All topology read calls _(`Get`, `GetVersion`, `List` and `ListDir`)_ now respec
Building `vtadmin-web` now requires node >= v22.13.0 (LTS). Breaking changes from v20 to v22 can be found at https://nodejs.org/en/blog/release/v22.13.0 -- with no known issues that apply to VTAdmin.
Full details on the node v20.12.2 release can be found at https://nodejs.org/en/blog/release/v22.13.1.
+
+## VTGate Flags
+
+### vtgate behavior for `--tablet_types_to_wait`
+Previously, if waiting for them took longer than the time specified in `--gateway_initial_tablet_timeout`, the `vtgate`
+would log a warning and start serving. If queries were received by the `vtgate` for the tablet types that were not
+received on time, they would fail. Now, if waiting times out, a warning is logged, and the `vtgate` retries until it
+succeeds. This means it does not enter service until it has gotten health checks for all tablet types, preventing those
+errors - at the expense of potentially taking longer to start serving.
\ No newline at end of file