From 65c00a82ffa567665bda5384ee52ef35818d9833 Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com> Date: Fri, 24 Jan 2025 11:17:12 +0100 Subject: [PATCH 1/7] Retry waiting for tablets during vtgate initialization if they timeout. Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --- go/vt/vtgate/tabletgateway.go | 3 +-- go/vt/vtgate/vtgate.go | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/go/vt/vtgate/tabletgateway.go b/go/vt/vtgate/tabletgateway.go index c36c6981fa2..48cd37b28fe 100644 --- a/go/vt/vtgate/tabletgateway.go +++ b/go/vt/vtgate/tabletgateway.go @@ -207,9 +207,8 @@ func (gw *TabletGateway) WaitForTablets(ctx context.Context, tabletTypesToWait [ case context.DeadlineExceeded: // In this scenario, we were able to reach the // topology service, but some tablets may not be - // ready. We just warn and keep going. + // ready. log.Warningf("Timeout waiting for all keyspaces / shards to have healthy tablets of types %v, may be in degraded mode", tabletTypesToWait) - err = nil } }() diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go index 8b8302d77d4..298f683be16 100644 --- a/go/vt/vtgate/vtgate.go +++ b/go/vt/vtgate/vtgate.go @@ -295,8 +295,18 @@ func Init( // TabletGateway can create it's own healthcheck gw := NewTabletGateway(ctx, hc, serv, cell) gw.RegisterStats() - if err := gw.WaitForTablets(ctx, tabletTypesToWait); err != nil { - log.Fatalf("tabletGateway.WaitForTablets failed: %v", err) + + // Retry loop for potential time-outs waiting for all tablets. + for { + err := gw.WaitForTablets(ctx, tabletTypesToWait) + switch { + case errors.Is(err, context.DeadlineExceeded): + log.Warning("TabletGateway timed out waiting for tablets to become available - retrying.") + + continue + default: + log.Fatalf("tabletGateway.WaitForTablets failed: %v", err) + } } dynamicConfig := NewDynamicViperConfig() From abb53cb4610af9f129cf16ef3cf410a3ea37d1d9 Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com> Date: Fri, 24 Jan 2025 11:17:34 +0100 Subject: [PATCH 2/7] Retry waiting for tablets during vtgate initialization if they timeout. Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --- go/vt/vtgate/vtgate.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/go/vt/vtgate/vtgate.go b/go/vt/vtgate/vtgate.go index 298f683be16..6d297d38fa8 100644 --- a/go/vt/vtgate/vtgate.go +++ b/go/vt/vtgate/vtgate.go @@ -297,9 +297,12 @@ func Init( gw.RegisterStats() // Retry loop for potential time-outs waiting for all tablets. +OuterLoop: for { err := gw.WaitForTablets(ctx, tabletTypesToWait) switch { + case err == nil: + break OuterLoop case errors.Is(err, context.DeadlineExceeded): log.Warning("TabletGateway timed out waiting for tablets to become available - retrying.") From c59a6426f68032dcf86e347b534399d4706d551e Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com> Date: Mon, 3 Feb 2025 14:00:18 +0100 Subject: [PATCH 3/7] Fix region_example test The test needed an extra tablet because the vtgate start-up script is waiting for primary and replica, but we were only provisioning primary. Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --- examples/region_sharding/101_initial_cluster.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/region_sharding/101_initial_cluster.sh b/examples/region_sharding/101_initial_cluster.sh index 6dd8989a32f..aec4f546d0e 100755 --- a/examples/region_sharding/101_initial_cluster.sh +++ b/examples/region_sharding/101_initial_cluster.sh @@ -30,8 +30,10 @@ fi CELL=zone1 ../common/scripts/vtctld-up.sh # start unsharded keyspace and tablet -CELL=zone1 TABLET_UID=100 ../common/scripts/mysqlctl-up.sh -SHARD=0 CELL=zone1 KEYSPACE=main TABLET_UID=100 ../common/scripts/vttablet-up.sh +for T_UID in 100 101; do + CELL=zone1 TABLET_UID="${T_UID}" ../common/scripts/mysqlctl-up.sh + SHARD=0 CELL=zone1 KEYSPACE=main TABLET_UID="${T_UID}" ../common/scripts/vttablet-up.sh +done # set the correct durability policy for the keyspace vtctldclient --server localhost:15999 SetKeyspaceDurabilityPolicy --durability-policy=none main || fail "Failed to set keyspace durability policy on the main keyspace" From bc1d135da2fce6391c4a617266b5d7d6042837d9 Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com> Date: Mon, 3 Feb 2025 15:46:14 +0100 Subject: [PATCH 4/7] Add logging to debug CI failures Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --- go/test/endtoend/cluster/cluster_process.go | 2 ++ go/vt/discovery/healthcheck.go | 3 ++- go/vt/discovery/topology_watcher.go | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index ed61f7fd0c9..61d7145db86 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -513,6 +513,8 @@ func (cluster *LocalProcessCluster) AddShard(keyspaceName string, shardName stri for _, proc := range mysqlctlProcessList { if err := proc.Wait(); err != nil { log.Errorf("unable to start mysql process %v: %v", proc, err) + log.Errorf("Stdout: %s", proc.Stdout) + log.Errorf("Stderr: %s", proc.Stderr) return nil, err } } diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index 2f270bd7518..d8e16f03d81 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -395,6 +395,7 @@ func NewHealthCheck(ctx context.Context, retryDelay, healthCheckTimeout time.Dur func (hc *HealthCheckImpl) AddTablet(tablet *topodata.Tablet) { // check whether grpc port is present on tablet, if not return if tablet.PortMap["grpc"] == 0 { + log.Infof("grpc port missing for tablet, not adding: %v", tablet.PortMap) return } @@ -831,7 +832,7 @@ func (hc *HealthCheckImpl) waitForTablets(ctx context.Context, targets []*query. timer.Stop() for _, target := range targets { if target != nil { - log.Infof("couldn't find tablets for target: %v", target) + log.Infof("couldn't find tablets for target: %v - Require serving is %+v", target, requireServing) } } return ctx.Err() diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go index d1e358e1aa5..8df02d4fc27 100644 --- a/go/vt/discovery/topology_watcher.go +++ b/go/vt/discovery/topology_watcher.go @@ -163,6 +163,7 @@ func (tw *TopologyWatcher) loadTablets() { defer tw.mu.Unlock() for _, tInfo := range tabletInfos { + log.Infof("Got tabletInfo for %+v, isInServingGraph is %v", tInfo, tInfo.IsInServingGraph()) aliasStr := topoproto.TabletAliasString(tInfo.Alias) tabletAliasStrs = append(tabletAliasStrs, aliasStr) @@ -170,6 +171,7 @@ func (tw *TopologyWatcher) loadTablets() { // We already have a tabletInfo for this and the flag tells us to not refresh. if val, ok := tw.tablets[aliasStr]; ok { newTablets[aliasStr] = val + log.Infof("Tablet Info already known and not refreshing for %+v", *tInfo) continue } } @@ -192,17 +194,21 @@ func (tw *TopologyWatcher) loadTablets() { } } + log.Infof("New tablets %+v", newTablets) for alias, newVal := range newTablets { if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) { + log.Infof("Tablet %s filtered out", alias) continue } // Trust the alias from topo and add it if it doesn't exist. if val, ok := tw.tablets[alias]; ok { + log.Infof("Tablet %s already in tw.tablets", alias) // check if the host and port have changed. If yes, replace tablet. oldKey := TabletToMapKey(val.tablet) newKey := TabletToMapKey(newVal.tablet) if oldKey != newKey { + log.Infof("Tablet key mismatch: old %s new %s, replacing in healthcheck", oldKey, newKey) // This is the case where the same tablet alias is now reporting // a different address (host:port) key. tw.healthcheck.ReplaceTablet(val.tablet, newVal.tablet) @@ -210,6 +216,7 @@ func (tw *TopologyWatcher) loadTablets() { } } else { // This is a new tablet record, let's add it to the HealthCheck. + log.Infof("Tablet %s not in tw.tablets, adding to healthcheck", alias) tw.healthcheck.AddTablet(newVal.tablet) topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1) } From 97e8f3e9f5078e753a40c4afa988c2ade21cd80b Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com> Date: Wed, 5 Feb 2025 10:08:01 +0100 Subject: [PATCH 5/7] Revert "Add logging to debug CI failures" This reverts commit bc1d135da2fce6391c4a617266b5d7d6042837d9. Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --- go/test/endtoend/cluster/cluster_process.go | 2 -- go/vt/discovery/healthcheck.go | 3 +-- go/vt/discovery/topology_watcher.go | 7 ------- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index 61d7145db86..ed61f7fd0c9 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -513,8 +513,6 @@ func (cluster *LocalProcessCluster) AddShard(keyspaceName string, shardName stri for _, proc := range mysqlctlProcessList { if err := proc.Wait(); err != nil { log.Errorf("unable to start mysql process %v: %v", proc, err) - log.Errorf("Stdout: %s", proc.Stdout) - log.Errorf("Stderr: %s", proc.Stderr) return nil, err } } diff --git a/go/vt/discovery/healthcheck.go b/go/vt/discovery/healthcheck.go index d8e16f03d81..2f270bd7518 100644 --- a/go/vt/discovery/healthcheck.go +++ b/go/vt/discovery/healthcheck.go @@ -395,7 +395,6 @@ func NewHealthCheck(ctx context.Context, retryDelay, healthCheckTimeout time.Dur func (hc *HealthCheckImpl) AddTablet(tablet *topodata.Tablet) { // check whether grpc port is present on tablet, if not return if tablet.PortMap["grpc"] == 0 { - log.Infof("grpc port missing for tablet, not adding: %v", tablet.PortMap) return } @@ -832,7 +831,7 @@ func (hc *HealthCheckImpl) waitForTablets(ctx context.Context, targets []*query. timer.Stop() for _, target := range targets { if target != nil { - log.Infof("couldn't find tablets for target: %v - Require serving is %+v", target, requireServing) + log.Infof("couldn't find tablets for target: %v", target) } } return ctx.Err() diff --git a/go/vt/discovery/topology_watcher.go b/go/vt/discovery/topology_watcher.go index 8df02d4fc27..d1e358e1aa5 100644 --- a/go/vt/discovery/topology_watcher.go +++ b/go/vt/discovery/topology_watcher.go @@ -163,7 +163,6 @@ func (tw *TopologyWatcher) loadTablets() { defer tw.mu.Unlock() for _, tInfo := range tabletInfos { - log.Infof("Got tabletInfo for %+v, isInServingGraph is %v", tInfo, tInfo.IsInServingGraph()) aliasStr := topoproto.TabletAliasString(tInfo.Alias) tabletAliasStrs = append(tabletAliasStrs, aliasStr) @@ -171,7 +170,6 @@ func (tw *TopologyWatcher) loadTablets() { // We already have a tabletInfo for this and the flag tells us to not refresh. if val, ok := tw.tablets[aliasStr]; ok { newTablets[aliasStr] = val - log.Infof("Tablet Info already known and not refreshing for %+v", *tInfo) continue } } @@ -194,21 +192,17 @@ func (tw *TopologyWatcher) loadTablets() { } } - log.Infof("New tablets %+v", newTablets) for alias, newVal := range newTablets { if tw.tabletFilter != nil && !tw.tabletFilter.IsIncluded(newVal.tablet) { - log.Infof("Tablet %s filtered out", alias) continue } // Trust the alias from topo and add it if it doesn't exist. if val, ok := tw.tablets[alias]; ok { - log.Infof("Tablet %s already in tw.tablets", alias) // check if the host and port have changed. If yes, replace tablet. oldKey := TabletToMapKey(val.tablet) newKey := TabletToMapKey(newVal.tablet) if oldKey != newKey { - log.Infof("Tablet key mismatch: old %s new %s, replacing in healthcheck", oldKey, newKey) // This is the case where the same tablet alias is now reporting // a different address (host:port) key. tw.healthcheck.ReplaceTablet(val.tablet, newVal.tablet) @@ -216,7 +210,6 @@ func (tw *TopologyWatcher) loadTablets() { } } else { // This is a new tablet record, let's add it to the HealthCheck. - log.Infof("Tablet %s not in tw.tablets, adding to healthcheck", alias) tw.healthcheck.AddTablet(newVal.tablet) topologyWatcherOperations.Add(topologyWatcherOpAddTablet, 1) } From f96ba9fc2589acbd903a394d6f91da6839213ed9 Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U." <5791035+ejortegau@users.noreply.github.com> Date: Wed, 5 Feb 2025 10:10:23 +0100 Subject: [PATCH 6/7] Remove schematracker test TestLoadKeyspaceWithNoTablet This test is no longer applicable as the vtgate won't start without tablets of the typs specified in --tablet_types_to_wait Signed-off-by: Eduardo J. Ortega U. <5791035+ejortegau@users.noreply.github.com> --- .../loadkeyspace/schema_load_keyspace_test.go | 66 ------------------- 1 file changed, 66 deletions(-) diff --git a/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go b/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go index ec201487887..1b816c2af3e 100644 --- a/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go +++ b/go/test/endtoend/vtgate/schematracker/loadkeyspace/schema_load_keyspace_test.go @@ -19,85 +19,19 @@ package loadkeyspace import ( "os" "path" - "strings" "testing" - "time" "github.com/stretchr/testify/require" - "vitess.io/vitess/go/test/endtoend/utils" - "vitess.io/vitess/go/test/endtoend/cluster" ) var ( clusterInstance *cluster.LocalProcessCluster hostname = "localhost" - keyspaceName = "ks" cell = "zone1" - sqlSchema = ` - create table vt_user ( - id bigint, - name varchar(64), - primary key (id) - ) Engine=InnoDB; - - create table main ( - id bigint, - val varchar(128), - primary key(id) - ) Engine=InnoDB; - - create table test_table ( - id bigint, - val varchar(128), - primary key(id) - ) Engine=InnoDB; -` ) -func TestLoadKeyspaceWithNoTablet(t *testing.T) { - var err error - - clusterInstance = cluster.NewCluster(cell, hostname) - defer clusterInstance.Teardown() - - // Start topo server - err = clusterInstance.StartTopo() - require.NoError(t, err) - - // create keyspace - keyspace := &cluster.Keyspace{ - Name: keyspaceName, - SchemaSQL: sqlSchema, - } - clusterInstance.VtTabletExtraArgs = append(clusterInstance.VtTabletExtraArgs, "--queryserver-config-schema-change-signal") - err = clusterInstance.StartUnshardedKeyspace(*keyspace, 0, false) - require.NoError(t, err) - - // teardown vttablets - for _, vttablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets { - err = vttablet.VttabletProcess.TearDown() - require.NoError(t, err) - utils.TimeoutAction(t, 1*time.Minute, "timeout - teardown of VTTablet", func() bool { - return vttablet.VttabletProcess.GetStatus() == "" - }) - } - - // Start vtgate with the schema_change_signal flag - clusterInstance.VtGateExtraArgs = append(clusterInstance.VtGateExtraArgs, "--schema_change_signal") - err = clusterInstance.StartVtgate() - require.NoError(t, err) - - // After starting VTGate we need to leave enough time for resolveAndLoadKeyspace to reach - // the schema tracking timeout (5 seconds). - utils.TimeoutAction(t, 5*time.Minute, "timeout - could not find 'Unable to get initial schema reload' in 'vtgate-stderr.txt'", func() bool { - logDir := clusterInstance.VtgateProcess.LogDir - all, _ := os.ReadFile(path.Join(logDir, "vtgate-stderr.txt")) - return strings.Contains(string(all), "Unable to get initial schema reload") - }) -} - func TestNoInitialKeyspace(t *testing.T) { var err error From dbbf8792aae667e4721e8def22a3a32c01fee46c Mon Sep 17 00:00:00 2001 From: Eduardo Ortega <5791035+ejortegau@users.noreply.github.com> Date: Wed, 12 Feb 2025 12:26:50 +0100 Subject: [PATCH 7/7] Add change to release notes. Signed-off-by: Eduardo Ortega <5791035+ejortegau@users.noreply.github.com> --- changelog/22.0/22.0.0/summary.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/changelog/22.0/22.0.0/summary.md b/changelog/22.0/22.0.0/summary.md index f773df2b10c..8137b8e3769 100644 --- a/changelog/22.0/22.0.0/summary.md +++ b/changelog/22.0/22.0.0/summary.md @@ -22,6 +22,8 @@ - **[Topology read concurrency behaviour changes](#topo-read-concurrency-changes)** - **[VTAdmin](#vtadmin)** - [Updated to node v22.13.1](#updated-node) + - **[VTGate Flags](#vtgate-flags)** + - [vtgate behavior for `--tablet_types_to_wait`](#vtgate-init-tabletgw-wait-tablet-type) ## Major Changes @@ -167,3 +169,12 @@ All topology read calls _(`Get`, `GetVersion`, `List` and `ListDir`)_ now respec Building `vtadmin-web` now requires node >= v22.13.0 (LTS). Breaking changes from v20 to v22 can be found at https://nodejs.org/en/blog/release/v22.13.0 -- with no known issues that apply to VTAdmin. Full details on the node v20.12.2 release can be found at https://nodejs.org/en/blog/release/v22.13.1. + +## VTGate Flags + +### vtgate behavior for `--tablet_types_to_wait` +Previously, if waiting for them took longer than the time specified in `--gateway_initial_tablet_timeout`, the `vtgate` +would log a warning and start serving. If queries were received by the `vtgate` for the tablet types that were not +received on time, they would fail. Now, if waiting times out, a warning is logged, and the `vtgate` retries until it +succeeds. This means it does not enter service until it has gotten health checks for all tablet types, preventing those +errors - at the expense of potentially taking longer to start serving. \ No newline at end of file