Skip to content

Commit

Permalink
roachprod: improve the way cockroach is run
Browse files Browse the repository at this point in the history
While running some stress tests with TPCH, I observed two big problems
with roachprod:
 - the out-of-memory behavior is very bad: instead of the process
   being killed, the system enters a thrashing mode where everything
   in the VM slows to a crawl (to the point where just sshing in can
   take minutes).
 - when the cockroach process exits, the exit code is not recorded
   anywhere, making it impossible in some cases to figure out why it
   stopped. In my particular case, we were exiting with exit code 8
   (which is `exit.TimeoutAfterFatalError()`) because writing to the
   logs was unacceptably slow.

This commit attempts to improve things on both these fronts. Instead
of running with `--background`, we use `systemd-run` to run cockroach
as a service unit (local clusters, where we cannot rely on sudo or
systemd, still use `--background`). This has several advantages:
 - we have much better monitoring infrastructure via
   `systemctl status cockroach`
 - we can now run code after the process exits, allowing us to record
   the exit code in various logs.
 - we can set a strict cgroup memory limit (systemd's `MemoryMax`, set
   to `95%` of the host's memory) so that the process gets OOM-killed
   before the system starts to thrash.

As part of the commit, we also print out information about the status
of cockroach when logging in.

Fixes #64176.

Release note: None
  • Loading branch information
RaduBerinde committed Apr 27, 2021
1 parent 40d6cb1 commit e6297b2
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 19 deletions.
4 changes: 4 additions & 0 deletions pkg/cmd/roachprod/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,8 @@ const (

// SharedUser is the linux username for shared use on all vms.
SharedUser = "ubuntu"

// MemoryMax is passed to systemd-run; the cockroach process is killed if it
// uses more than this percentage of the host's memory.
MemoryMax = "95%"
)
23 changes: 18 additions & 5 deletions pkg/cmd/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ func (c *SyncedCluster) Stop(sig int, wait bool) {

var waitCmd string
if wait {
var resetServiceCmd string
if !c.IsLocal() {
// Since we're waiting (the default), clean up the service.
resetServiceCmd = "sudo systemctl reset-failed cockroach"
}
waitCmd = fmt.Sprintf(`
for pid in ${pids}; do
echo "${pid}: checking" >> %[1]s/roachprod.log
Expand All @@ -181,14 +186,17 @@ func (c *SyncedCluster) Stop(sig int, wait bool) {
sleep 1
done
echo "${pid}: dead" >> %[1]s/roachprod.log
done
`, c.Impl.LogDir(c, c.Nodes[i]))
%[2]s
done`,
c.Impl.LogDir(c, c.Nodes[i]), // [1]
resetServiceCmd,
)
}

// NB: the awkward-looking `awk` invocation serves to avoid having the
// awk process match its own output from `ps`.
cmd := fmt.Sprintf(`
mkdir -p logs
mkdir -p %[1]s
echo ">>> roachprod stop: $(date)" >> %[1]s/roachprod.log
ps axeww -o pid -o command >> %[1]s/roachprod.log
pids=$(ps axeww -o pid -o command | \
Expand All @@ -197,8 +205,13 @@ pids=$(ps axeww -o pid -o command | \
if [ -n "${pids}" ]; then
kill -%[4]d ${pids}
%[5]s
fi
`, c.Impl.LogDir(c, c.Nodes[i]), c.Nodes[i], c.escapedTag(), sig, waitCmd)
fi`,
c.Impl.LogDir(c, c.Nodes[i]), // [1]
c.Nodes[i], // [2]
c.escapedTag(), // [3]
sig, // [4]
waitCmd, // [5]
)
return sess.CombinedOutput(cmd)
})
}
Expand Down
95 changes: 81 additions & 14 deletions pkg/cmd/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func cockroachNodeBinary(c *SyncedCluster, node int) string {
return config.Binary
}
if !c.IsLocal() {
return "./" + config.Binary
return "${HOME}/" + config.Binary
}

path := filepath.Join(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), node), config.Binary)
Expand Down Expand Up @@ -396,19 +396,86 @@ func (h *crdbInstallHelper) generateStartCmd(
binary := cockroachNodeBinary(h.c, nodes[nodeIdx])
keyCmd := h.generateKeyCmd(nodeIdx, extraArgs)

// NB: this is awkward as when the process fails, the test runner will show an
// unhelpful empty error (since everything has been redirected away). This is
// unfortunately equally awkward to address.
if h.c.IsLocal() {
// We cannot rely on sudo or systemd on the local machine; run using the
// background flag.
cmd := fmt.Sprintf(`
ulimit -c unlimited; mkdir -p %[1]s;
echo ">>> roachprod start: $(date)" >> %[1]s/roachprod.log;
ps axeww -o pid -o command >> %[1]s/roachprod.log;
[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> %[1]s/roachprod.log; %[2]s
export ROACHPROD=%[3]d%[4]s;
GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 %[5]s \
%[6]s %[7]s --background %[8]s >> %[1]s/cockroach.stdout.log \
2>> %[1]s/cockroach.stderr.log \
|| (x=$?; cat %[1]s/cockroach.stderr.log; exit $x)`,
logDir, // [1]
keyCmd, // [2]
nodes[nodeIdx], // [3]
h.c.Tag, // [4]
h.getEnvVars(), // [5]
binary, // [6]
startCmd, // [7]
strings.Join(args, " "), // [8]
)
return cmd, nil
}

// We run cockroach as a systemd service unit. This allows us to apply a
// strict memory limit and provides tools to monitor the process.
cmd := fmt.Sprintf(`
ulimit -c unlimited; mkdir -p %[1]s;
echo ">>> roachprod start: $(date)" >> %[1]s/roachprod.log;
ps axeww -o pid -o command >> %[1]s/roachprod.log;
[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> %[1]s/roachprod.log; %[2]s
export ROACHPROD=%[3]d%[4]s;
GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 %[5]s \
%[6]s %[7]s %[8]s >> %[1]s/cockroach.stdout.log \
2>> %[1]s/cockroach.stderr.log \
|| (x=$?; cat %[1]s/cockroach.stderr.log; exit $x)`,
sudo systemctl reset-failed cockroach
if systemctl is-active -q cockroach; then
echo "cockroach service already active"
echo "To get more information: systemctl status cockroach"
exit 1
fi
# The first time we run, install a small script that shows some helpful
# information when we ssh in.
if [ ! -e ${HOME}/.profile-cockroach ]; then
cat > ${HOME}/.profile-cockroach <<'EOF'
echo ""
if systemctl is-active -q cockroach; then
echo "cockroach is running; see: systemctl status cockroach"
elif systemctl is-failed -q cockroach; then
echo "cockroach failed; see: systemctl status cockroach"
else
echo "cockroach stopped"
fi
echo ""
EOF
echo ". ${HOME}/.profile-cockroach" >> ${HOME}/.profile
fi
uid=$(id -u)
gid=$(id -g)
cat > ${HOME}/run-cockroach.sh <<'EOF' && \
sudo systemd-run --unit cockroach --uid $uid --gid $gid -p MemoryMax=%[9]s bash ${HOME}/run-cockroach.sh
#!/bin/bash
ulimit -c unlimited
mkdir -p %[1]s
starttime=$(date)
echo ">>> roachprod start: $starttime" >> %[1]s/roachprod.log
ps axeww -o pid -o command >> %[1]s/roachprod.log
[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> %[1]s/roachprod.log
echo -e "\n\n====== cockroach start: $starttime ======\n" >> %[1]s/cockroach.stdout.log
echo -e "\n\n====== cockroach start: $starttime ======\n" >> %[1]s/cockroach.stderr.log
%[2]s
# Output a message visible when checking the status (via sudo systemctl status).
echo "Starting cockroach (output redirected to %[1]s)"
export ROACHPROD=%[3]d%[4]s;
set -x
COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 %[5]s \
%[6]s %[7]s %[8]s \
>> %[1]s/cockroach.stdout.log 2>> %[1]s/cockroach.stderr.log
code=$?
set +x
echo "cockroach exited with exit code $code" >> %[1]s/roachprod.log
echo "cockroach exited with exit code $code" >> %[1]s/cockroach.stderr.log
exit $code
EOF
`,
logDir, // [1]
keyCmd, // [2]
nodes[nodeIdx], // [3]
Expand All @@ -417,6 +484,7 @@ func (h *crdbInstallHelper) generateStartCmd(
binary, // [6]
startCmd, // [7]
strings.Join(args, " "), // [8]
config.MemoryMax, // [9]
)
return cmd, nil
}
Expand All @@ -427,7 +495,6 @@ func (h *crdbInstallHelper) generateStartArgs(
var args []string
nodes := h.c.ServerNodes()

args = append(args, "--background")
if h.c.Secure {
args = append(args, "--certs-dir="+h.c.Impl.CertsDir(h.c, nodes[nodeIdx]))
} else {
Expand Down

0 comments on commit e6297b2

Please sign in to comment.