diff --git a/pkg/cmd/roachprod/config/config.go b/pkg/cmd/roachprod/config/config.go index 74c73b0bc146..f2b698614aa2 100644 --- a/pkg/cmd/roachprod/config/config.go +++ b/pkg/cmd/roachprod/config/config.go @@ -43,4 +43,8 @@ const ( // SharedUser is the linux username for shared use on all vms. SharedUser = "ubuntu" + + // MemoryMax is passed to systemd-run; the cockroach process is killed if it + // uses more than this percentage of the host's memory. + MemoryMax = "95%" ) diff --git a/pkg/cmd/roachprod/install/cluster_synced.go b/pkg/cmd/roachprod/install/cluster_synced.go index 22c76d926d04..9f251e4ab0b2 100644 --- a/pkg/cmd/roachprod/install/cluster_synced.go +++ b/pkg/cmd/roachprod/install/cluster_synced.go @@ -156,7 +156,13 @@ func (c *SyncedCluster) newSession(i int) (session, error) { return newRemoteSession(c.user(i), c.host(i), c.DebugDir) } -// Stop TODO(peter): document +// Stop is used to stop cockroach on all nodes in the cluster. +// +// It sends a signal to all processes that have been started with ROACHPROD env +// var and optionally waits until the processes stop. +// +// When running roachprod stop without other flags, the signal is 9 (SIGKILL) +// and wait is true. func (c *SyncedCluster) Stop(sig int, wait bool) { display := fmt.Sprintf("%s: stopping", c.Name) if wait { @@ -181,14 +187,15 @@ func (c *SyncedCluster) Stop(sig int, wait bool) { sleep 1 done echo "${pid}: dead" >> %[1]s/roachprod.log - done -`, c.Impl.LogDir(c, c.Nodes[i])) + done`, + c.Impl.LogDir(c, c.Nodes[i]), // [1] + ) } // NB: the awkward-looking `awk` invocation serves to avoid having the // awk process match its own output from `ps`. cmd := fmt.Sprintf(` -mkdir -p logs +mkdir -p %[1]s echo ">>> roachprod stop: $(date)" >> %[1]s/roachprod.log ps axeww -o pid -o command >> %[1]s/roachprod.log pids=$(ps axeww -o pid -o command | \ @@ -197,8 +204,13 @@ pids=$(ps axeww -o pid -o command | \ if [ -n "${pids}" ]; then kill -%[4]d ${pids} %[5]s -fi -`, c.Impl.LogDir(c, c.Nodes[i]), c.Nodes[i], c.escapedTag(), sig, waitCmd) +fi`, + c.Impl.LogDir(c, c.Nodes[i]), // [1] + c.Nodes[i], // [2] + c.escapedTag(), // [3] + sig, // [4] + waitCmd, // [5] + ) return sess.CombinedOutput(cmd) }) } diff --git a/pkg/cmd/roachprod/install/cockroach.go b/pkg/cmd/roachprod/install/cockroach.go index b6c8276ec4cc..b267c772b825 100644 --- a/pkg/cmd/roachprod/install/cockroach.go +++ b/pkg/cmd/roachprod/install/cockroach.go @@ -19,6 +19,7 @@ import ( "regexp" "sort" "strings" + "text/template" "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config" "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ssh" @@ -43,7 +44,7 @@ func cockroachNodeBinary(c *SyncedCluster, node int) string { return config.Binary } if !c.IsLocal() { - return "./" + config.Binary + return "${HOME}/" + config.Binary } path := filepath.Join(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), node), config.Binary) @@ -377,6 +378,87 @@ func (h *crdbInstallHelper) startNode( func (h *crdbInstallHelper) generateStartCmd( nodeIdx int, extraArgs []string, vers *version.Version, ) (string, error) { + + tpl, err := template.New("start").Parse(`#!/bin/bash +set -euo pipefail + +mkdir -p {{.LogDir}} +helper="{{if .Local}}{{.LogDir}}{{else}}${HOME}{{end}}/cockroach-helper.sh" +verb="{{if .Local}}run{{else}}run-systemd{{end}}" + +# 'EOF' disables parameter substitution in the heredoc. +cat > "${helper}" << 'EOF' && chmod +x "${helper}" && "${helper}" "${verb}" +#!/bin/bash +set -euo pipefail + +if [[ "${1}" == "run" ]]; then + local="{{if .Local}}true{{end}}" + ulimit -c unlimited + mkdir -p {{.LogDir}} + echo "cockroach start: $(date), logging to {{.LogDir}}" | tee -a {{.LogDir}}/{roachprod,cockroach.std{out,err}}.log + {{.KeyCmd}} + export ROACHPROD={{.NodeNum}}{{.Tag}} {{.EnvVars}} + background="" + if [[ "${local}" ]]; then + background="--background" + fi + CODE=0 + {{.Binary}} {{.StartCmd}} {{.Args}} ${background} >> {{.LogDir}}/cockroach.stdout.log 2>> {{.LogDir}}/cockroach.stderr.log || CODE=$? + if [[ -z "${local}" || ${CODE} -ne 0 ]]; then + echo "cockroach exited with code ${CODE}: $(date)" | tee -a {{.LogDir}}/{roachprod,cockroach.{exit,std{out,err}}}.log + fi + exit ${CODE} +fi + +if [[ "${1}" != "run-systemd" ]]; then + echo "unsupported: ${1}" + exit 1 +fi + +if systemctl is-active -q cockroach; then + echo "cockroach service already active" + echo "To get more information: systemctl status cockroach" + exit 1 +fi + +# If cockroach failed, the service still exists; we need to clean it up before +# we can start it again. +sudo systemctl reset-failed cockroach 2>/dev/null || true + +# The first time we run, install a small script that shows some helpful +# information when we ssh in. +if [ ! -e ${HOME}/.profile-cockroach ]; then + cat > ${HOME}/.profile-cockroach <<'EOQ' +echo "" +if systemctl is-active -q cockroach; then + echo "cockroach is running; see: systemctl status cockroach" +elif systemctl is-failed -q cockroach; then + echo "cockroach stopped; see: systemctl status cockroach" +else + echo "cockroach not started" +fi +echo "" +EOQ + echo ". ${HOME}/.profile-cockroach" >> ${HOME}/.profile +fi + +# We run this script (with arg "run") as a service unit. We do not use --user +# because memory limiting doesn't work in that mode. Instead we pass the uid and +# gid that the process will run under. +# The "notify" service type means that systemd-run waits until cockroach +# notifies systemd that it is ready; NotifyAccess=all is needed because this +# notification doesn't come from the main PID (which is bash). +sudo systemd-run --unit cockroach \ + --same-dir --uid $(id -u) --gid $(id -g) \ + --service-type=notify -p NotifyAccess=all \ + -p MemoryMax={{.MemoryMax}} \ + bash $0 run +EOF +`) + if err != nil { + return "", err + } + args, err := h.generateStartArgs(nodeIdx, extraArgs, vers) if err != nil { return "", err @@ -390,35 +472,28 @@ func (h *crdbInstallHelper) generateStartCmd( } else { startCmd = "start" } - nodes := h.c.ServerNodes() - logDir := h.c.Impl.LogDir(h.c, nodes[nodeIdx]) - binary := cockroachNodeBinary(h.c, nodes[nodeIdx]) - keyCmd := h.generateKeyCmd(nodeIdx, extraArgs) - - // NB: this is awkward as when the process fails, the test runner will show an - // unhelpful empty error (since everything has been redirected away). This is - // unfortunately equally awkward to address. - cmd := fmt.Sprintf(` - ulimit -c unlimited; mkdir -p %[1]s; - echo ">>> roachprod start: $(date)" >> %[1]s/roachprod.log; - ps axeww -o pid -o command >> %[1]s/roachprod.log; - [ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> %[1]s/roachprod.log; %[2]s - export ROACHPROD=%[3]d%[4]s; - GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 %[5]s \ - %[6]s %[7]s %[8]s >> %[1]s/cockroach.stdout.log \ - 2>> %[1]s/cockroach.stderr.log \ - || (x=$?; cat %[1]s/cockroach.stderr.log; exit $x)`, - logDir, // [1] - keyCmd, // [2] - nodes[nodeIdx], // [3] - h.c.Tag, // [4] - h.getEnvVars(), // [5] - binary, // [6] - startCmd, // [7] - strings.Join(args, " "), // [8] - ) - return cmd, nil + var buf strings.Builder + if err := tpl.Execute(&buf, struct { + LogDir, KeyCmd, Tag, EnvVars, Binary, StartCmd, Args, MemoryMax string + NodeNum int + Local bool + }{ + LogDir: h.c.Impl.LogDir(h.c, nodes[nodeIdx]), + KeyCmd: h.generateKeyCmd(nodeIdx, extraArgs), + Tag: h.c.Tag, + EnvVars: "GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 " + h.getEnvVars(), + Binary: cockroachNodeBinary(h.c, nodes[nodeIdx]), + StartCmd: startCmd, + Args: strings.Join(args, " "), + MemoryMax: config.MemoryMax, + NodeNum: nodes[nodeIdx], + Local: h.c.IsLocal(), + }); err != nil { + return "", err + } + + return buf.String(), nil } func (h *crdbInstallHelper) generateStartArgs( @@ -427,7 +502,6 @@ func (h *crdbInstallHelper) generateStartArgs( var args []string nodes := h.c.ServerNodes() - args = append(args, "--background") if h.c.Secure { args = append(args, "--certs-dir="+h.c.Impl.CertsDir(h.c, nodes[nodeIdx])) } else {