Skip to content

Commit

Permalink
Merge #64177
Browse files Browse the repository at this point in the history
64177: roachprod: improve the way cockroach is run r=tbg a=RaduBerinde

While running some stress tests with TPCH, I observed two big problems
with roachprod:
 - the out-of-memory behavior is very bad: instead of the process
   being killed, the system enters a thrashing mode where everything
   in the VM slows to a crawl (to the point where just sshing in can
   take minutes).
 - when the cockroach process exits, the exit code is not recorded
   anywhere, making it impossible in some cases to figure out why it
   stopped. In my particular case, we were exiting with exit code 8
   (which is `exit.TimeoutAfterFatalError()`) because writing to the
   logs was unacceptably slow.

This commit attempts to improve things on both these fronts. Instead
of running with `--background`, we use `systemd-run` to run cockroach
as a service unit. This has several advantages:
 - we have much better monitoring infrastructure via
   `systemctl status cockroach`
 - we can now run code after the exit, allowing us to record it in
   various logs.
 - we can set a strict cgroups memory limit (set to `95%`) so that the
   process gets oom-killed before the system starts to thrash.

As part of the commit, we also print out information about the status
of cockroach when logging in.

Fixes #64176.

Release note: None

Co-authored-by: Radu Berinde <[email protected]>
  • Loading branch information
craig[bot] and RaduBerinde committed Apr 29, 2021
2 parents ea6bcca + b937cc8 commit 3834668
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 36 deletions.
4 changes: 4 additions & 0 deletions pkg/cmd/roachprod/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,8 @@ const (

// SharedUser is the linux username for shared use on all vms.
SharedUser = "ubuntu"

// MemoryMax is passed to systemd-run; the cockroach process is killed if it
// uses more than this percentage of the host's memory.
MemoryMax = "95%"
)
24 changes: 18 additions & 6 deletions pkg/cmd/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,13 @@ func (c *SyncedCluster) newSession(i int) (session, error) {
return newRemoteSession(c.user(i), c.host(i), c.DebugDir)
}

// Stop TODO(peter): document
// Stop is used to stop cockroach on all nodes in the cluster.
//
// It sends a signal to all processes that have been started with ROACHPROD env
// var and optionally waits until the processes stop.
//
// When running roachprod stop without other flags, the signal is 9 (SIGKILL)
// and wait is true.
func (c *SyncedCluster) Stop(sig int, wait bool) {
display := fmt.Sprintf("%s: stopping", c.Name)
if wait {
Expand All @@ -181,14 +187,15 @@ func (c *SyncedCluster) Stop(sig int, wait bool) {
sleep 1
done
echo "${pid}: dead" >> %[1]s/roachprod.log
done
`, c.Impl.LogDir(c, c.Nodes[i]))
done`,
c.Impl.LogDir(c, c.Nodes[i]), // [1]
)
}

// NB: the awkward-looking `awk` invocation serves to avoid having the
// awk process match its own output from `ps`.
cmd := fmt.Sprintf(`
mkdir -p logs
mkdir -p %[1]s
echo ">>> roachprod stop: $(date)" >> %[1]s/roachprod.log
ps axeww -o pid -o command >> %[1]s/roachprod.log
pids=$(ps axeww -o pid -o command | \
Expand All @@ -197,8 +204,13 @@ pids=$(ps axeww -o pid -o command | \
if [ -n "${pids}" ]; then
kill -%[4]d ${pids}
%[5]s
fi
`, c.Impl.LogDir(c, c.Nodes[i]), c.Nodes[i], c.escapedTag(), sig, waitCmd)
fi`,
c.Impl.LogDir(c, c.Nodes[i]), // [1]
c.Nodes[i], // [2]
c.escapedTag(), // [3]
sig, // [4]
waitCmd, // [5]
)
return sess.CombinedOutput(cmd)
})
}
Expand Down
134 changes: 104 additions & 30 deletions pkg/cmd/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"regexp"
"sort"
"strings"
"text/template"

"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ssh"
Expand All @@ -43,7 +44,7 @@ func cockroachNodeBinary(c *SyncedCluster, node int) string {
return config.Binary
}
if !c.IsLocal() {
return "./" + config.Binary
return "${HOME}/" + config.Binary
}

path := filepath.Join(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), node), config.Binary)
Expand Down Expand Up @@ -377,6 +378,87 @@ func (h *crdbInstallHelper) startNode(
func (h *crdbInstallHelper) generateStartCmd(
nodeIdx int, extraArgs []string, vers *version.Version,
) (string, error) {

tpl, err := template.New("start").Parse(`#!/bin/bash
set -euo pipefail
mkdir -p {{.LogDir}}
helper="{{if .Local}}{{.LogDir}}{{else}}${HOME}{{end}}/cockroach-helper.sh"
verb="{{if .Local}}run{{else}}run-systemd{{end}}"
# 'EOF' disables parameter substitution in the heredoc.
cat > "${helper}" << 'EOF' && chmod +x "${helper}" && "${helper}" "${verb}"
#!/bin/bash
set -euo pipefail
if [[ "${1}" == "run" ]]; then
local="{{if .Local}}true{{end}}"
ulimit -c unlimited
mkdir -p {{.LogDir}}
echo "cockroach start: $(date), logging to {{.LogDir}}" | tee -a {{.LogDir}}/{roachprod,cockroach.std{out,err}}.log
{{.KeyCmd}}
export ROACHPROD={{.NodeNum}}{{.Tag}} {{.EnvVars}}
background=""
if [[ "${local}" ]]; then
background="--background"
fi
CODE=0
{{.Binary}} {{.StartCmd}} {{.Args}} ${background} >> {{.LogDir}}/cockroach.stdout.log 2>> {{.LogDir}}/cockroach.stderr.log || CODE=$?
if [[ -z "${local}" || ${CODE} -ne 0 ]]; then
echo "cockroach exited with code ${CODE}: $(date)" | tee -a {{.LogDir}}/{roachprod,cockroach.{exit,std{out,err}}}.log
fi
exit ${CODE}
fi
if [[ "${1}" != "run-systemd" ]]; then
echo "unsupported: ${1}"
exit 1
fi
if systemctl is-active -q cockroach; then
echo "cockroach service already active"
echo "To get more information: systemctl status cockroach"
exit 1
fi
# If cockroach failed, the service still exists; we need to clean it up before
# we can start it again.
sudo systemctl reset-failed cockroach 2>/dev/null || true
# The first time we run, install a small script that shows some helpful
# information when we ssh in.
if [ ! -e ${HOME}/.profile-cockroach ]; then
cat > ${HOME}/.profile-cockroach <<'EOQ'
echo ""
if systemctl is-active -q cockroach; then
echo "cockroach is running; see: systemctl status cockroach"
elif systemctl is-failed -q cockroach; then
echo "cockroach stopped; see: systemctl status cockroach"
else
echo "cockroach not started"
fi
echo ""
EOQ
echo ". ${HOME}/.profile-cockroach" >> ${HOME}/.profile
fi
# We run this script (with arg "run") as a service unit. We do not use --user
# because memory limiting doesn't work in that mode. Instead we pass the uid and
# gid that the process will run under.
# The "notify" service type means that systemd-run waits until cockroach
# notifies systemd that it is ready; NotifyAccess=all is needed because this
# notification doesn't come from the main PID (which is bash).
sudo systemd-run --unit cockroach \
--same-dir --uid $(id -u) --gid $(id -g) \
--service-type=notify -p NotifyAccess=all \
-p MemoryMax={{.MemoryMax}} \
bash $0 run
EOF
`)
if err != nil {
return "", err
}

args, err := h.generateStartArgs(nodeIdx, extraArgs, vers)
if err != nil {
return "", err
Expand All @@ -390,35 +472,28 @@ func (h *crdbInstallHelper) generateStartCmd(
} else {
startCmd = "start"
}

nodes := h.c.ServerNodes()
logDir := h.c.Impl.LogDir(h.c, nodes[nodeIdx])
binary := cockroachNodeBinary(h.c, nodes[nodeIdx])
keyCmd := h.generateKeyCmd(nodeIdx, extraArgs)

// NB: this is awkward as when the process fails, the test runner will show an
// unhelpful empty error (since everything has been redirected away). This is
// unfortunately equally awkward to address.
cmd := fmt.Sprintf(`
ulimit -c unlimited; mkdir -p %[1]s;
echo ">>> roachprod start: $(date)" >> %[1]s/roachprod.log;
ps axeww -o pid -o command >> %[1]s/roachprod.log;
[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> %[1]s/roachprod.log; %[2]s
export ROACHPROD=%[3]d%[4]s;
GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 %[5]s \
%[6]s %[7]s %[8]s >> %[1]s/cockroach.stdout.log \
2>> %[1]s/cockroach.stderr.log \
|| (x=$?; cat %[1]s/cockroach.stderr.log; exit $x)`,
logDir, // [1]
keyCmd, // [2]
nodes[nodeIdx], // [3]
h.c.Tag, // [4]
h.getEnvVars(), // [5]
binary, // [6]
startCmd, // [7]
strings.Join(args, " "), // [8]
)
return cmd, nil
var buf strings.Builder
if err := tpl.Execute(&buf, struct {
LogDir, KeyCmd, Tag, EnvVars, Binary, StartCmd, Args, MemoryMax string
NodeNum int
Local bool
}{
LogDir: h.c.Impl.LogDir(h.c, nodes[nodeIdx]),
KeyCmd: h.generateKeyCmd(nodeIdx, extraArgs),
Tag: h.c.Tag,
EnvVars: "GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 " + h.getEnvVars(),
Binary: cockroachNodeBinary(h.c, nodes[nodeIdx]),
StartCmd: startCmd,
Args: strings.Join(args, " "),
MemoryMax: config.MemoryMax,
NodeNum: nodes[nodeIdx],
Local: h.c.IsLocal(),
}); err != nil {
return "", err
}

return buf.String(), nil
}

func (h *crdbInstallHelper) generateStartArgs(
Expand All @@ -427,7 +502,6 @@ func (h *crdbInstallHelper) generateStartArgs(
var args []string
nodes := h.c.ServerNodes()

args = append(args, "--background")
if h.c.Secure {
args = append(args, "--certs-dir="+h.c.Impl.CertsDir(h.c, nodes[nodeIdx]))
} else {
Expand Down

0 comments on commit 3834668

Please sign in to comment.