Skip to content

Commit

Permalink
roachprod: set COCKROACH_CONNECT_TIMEOUT to 1200s during start
Browse files Browse the repository at this point in the history
Previously the sql cmds invoked within roachprod start had an infinite timeout.
This patch reduces the timeout to 10 minutes -- if a node can't start within 10
minutes because a roachprod client can't connect to the new crdb node, there's
likely a problem.

We decided to make this change after a roachtest with a 10 hr time out hung for
10 hours due to an internal start cmd. See #99280.

Informs #99280

Release note: none
  • Loading branch information
msbutler committed Mar 30, 2023
1 parent 56bb002 commit 20053d9
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions pkg/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ type StartOpts struct {
KVCluster *SyncedCluster
}

// startSQLTimeout identifies the COCKROACH_CONNECT_TIMEOUT to use (in seconds)
// for sql cmds within syncedCluster.Start().
const startSQLTimeout = 1200

// StartTarget identifies what flavor of cockroach we are starting.
type StartTarget int

Expand Down Expand Up @@ -693,12 +697,12 @@ func (c *SyncedCluster) generateClusterSettingCmd(l *logger.Logger, node Node) s
// removed in v21.2.
clusterSettingCmd += fmt.Sprintf(`
if ! test -e %s ; then
COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e "SET CLUSTER SETTING server.remote_debugging.mode = 'any'" || true;
COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e "
COCKROACH_CONNECT_TIMEOUT=%d %s sql --url %s -e "SET CLUSTER SETTING server.remote_debugging.mode = 'any'" || true;
COCKROACH_CONNECT_TIMEOUT=%d %s sql --url %s -e "
SET CLUSTER SETTING cluster.organization = 'Cockroach Labs - Production Testing';
SET CLUSTER SETTING enterprise.license = '%s';" \
&& touch %s
fi`, path, binary, url, binary, url, config.CockroachDevLicense, path)
fi`, path, startSQLTimeout, binary, url, startSQLTimeout, binary, url, config.CockroachDevLicense, path)
return clusterSettingCmd
}

Expand All @@ -713,8 +717,8 @@ func (c *SyncedCluster) generateInitCmd(node Node) string {
binary := cockroachNodeBinary(c, node)
initCmd += fmt.Sprintf(`
if ! test -e %[1]s ; then
COCKROACH_CONNECT_TIMEOUT=0 %[2]s init --url %[3]s && touch %[1]s
fi`, path, binary, url)
COCKROACH_CONNECT_TIMEOUT=%[4]d %[2]s init --url %[3]s && touch %[1]s
fi`, path, binary, url, startSQLTimeout)
return initCmd
}

Expand Down Expand Up @@ -829,10 +833,10 @@ func (c *SyncedCluster) createFixedBackupSchedule(
node := c.Nodes[0]
binary := cockroachNodeBinary(c, node)
url := c.NodeURL("localhost", c.NodePort(node), "" /* tenantName */)
fullCmd := fmt.Sprintf(`COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e %q`,
binary, url, createScheduleCmd)
fullCmd := fmt.Sprintf(`COCKROACH_CONNECT_TIMEOUT=%d %s sql --url %s -e %q`,
startSQLTimeout, binary, url, createScheduleCmd)
// Instead of using `c.ExecSQL()`, use the more flexible c.newSession(), which allows us to
// 1) prefix the schedule backup cmd with COCKROACH_CONNECT_TIMEOUT=0.
// 1) prefix the schedule backup cmd with COCKROACH_CONNECT_TIMEOUT.
// 2) run the command against the first node in the cluster target.
sess := c.newSession(l, node, fullCmd, withDebugName("init-backup-schedule"))
defer sess.Close()
Expand Down

0 comments on commit 20053d9

Please sign in to comment.