Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachprod: improve the way cockroach is run #64177

Merged
merged 1 commit into from
Apr 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/cmd/roachprod/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,8 @@ const (

// SharedUser is the linux username for shared use on all vms.
SharedUser = "ubuntu"

// MemoryMax is passed to systemd-run; the cockroach process is killed if it
// uses more than this percentage of the host's memory.
MemoryMax = "95%"
)
24 changes: 18 additions & 6 deletions pkg/cmd/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,13 @@ func (c *SyncedCluster) newSession(i int) (session, error) {
return newRemoteSession(c.user(i), c.host(i), c.DebugDir)
}

// Stop TODO(peter): document
// Stop is used to stop cockroach on all nodes in the cluster.
//
// It sends a signal to all processes that have been started with the ROACHPROD
// env var set and optionally waits until the processes stop.
//
// When `roachprod stop` is run without other flags, the signal is 9
// (SIGKILL) and wait is true.
func (c *SyncedCluster) Stop(sig int, wait bool) {
display := fmt.Sprintf("%s: stopping", c.Name)
if wait {
Expand All @@ -181,14 +187,15 @@ func (c *SyncedCluster) Stop(sig int, wait bool) {
sleep 1
done
echo "${pid}: dead" >> %[1]s/roachprod.log
done
`, c.Impl.LogDir(c, c.Nodes[i]))
done`,
c.Impl.LogDir(c, c.Nodes[i]), // [1]
)
}

// NB: the awkward-looking `awk` invocation serves to avoid having the
// awk process match its own output from `ps`.
cmd := fmt.Sprintf(`
mkdir -p logs
mkdir -p %[1]s
echo ">>> roachprod stop: $(date)" >> %[1]s/roachprod.log
ps axeww -o pid -o command >> %[1]s/roachprod.log
pids=$(ps axeww -o pid -o command | \
Expand All @@ -197,8 +204,13 @@ pids=$(ps axeww -o pid -o command | \
if [ -n "${pids}" ]; then
kill -%[4]d ${pids}
%[5]s
fi
`, c.Impl.LogDir(c, c.Nodes[i]), c.Nodes[i], c.escapedTag(), sig, waitCmd)
fi`,
c.Impl.LogDir(c, c.Nodes[i]), // [1]
c.Nodes[i], // [2]
c.escapedTag(), // [3]
sig, // [4]
waitCmd, // [5]
)
return sess.CombinedOutput(cmd)
})
}
Expand Down
134 changes: 104 additions & 30 deletions pkg/cmd/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"regexp"
"sort"
"strings"
"text/template"

"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ssh"
Expand All @@ -43,7 +44,7 @@ func cockroachNodeBinary(c *SyncedCluster, node int) string {
return config.Binary
}
if !c.IsLocal() {
return "./" + config.Binary
return "${HOME}/" + config.Binary
}

path := filepath.Join(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), node), config.Binary)
Expand Down Expand Up @@ -377,6 +378,87 @@ func (h *crdbInstallHelper) startNode(
func (h *crdbInstallHelper) generateStartCmd(
nodeIdx int, extraArgs []string, vers *version.Version,
) (string, error) {

tpl, err := template.New("start").Parse(`#!/bin/bash
set -euo pipefail

mkdir -p {{.LogDir}}
helper="{{if .Local}}{{.LogDir}}{{else}}${HOME}{{end}}/cockroach-helper.sh"
verb="{{if .Local}}run{{else}}run-systemd{{end}}"

# 'EOF' disables parameter substitution in the heredoc.
cat > "${helper}" << 'EOF' && chmod +x "${helper}" && "${helper}" "${verb}"
#!/bin/bash
set -euo pipefail

if [[ "${1}" == "run" ]]; then
local="{{if .Local}}true{{end}}"
ulimit -c unlimited
mkdir -p {{.LogDir}}
echo "cockroach start: $(date), logging to {{.LogDir}}" | tee -a {{.LogDir}}/{roachprod,cockroach.std{out,err}}.log
{{.KeyCmd}}
export ROACHPROD={{.NodeNum}}{{.Tag}} {{.EnvVars}}
background=""
if [[ "${local}" ]]; then
background="--background"
fi
CODE=0
{{.Binary}} {{.StartCmd}} {{.Args}} ${background} >> {{.LogDir}}/cockroach.stdout.log 2>> {{.LogDir}}/cockroach.stderr.log || CODE=$?
if [[ -z "${local}" || ${CODE} -ne 0 ]]; then
echo "cockroach exited with code ${CODE}: $(date)" | tee -a {{.LogDir}}/{roachprod,cockroach.{exit,std{out,err}}}.log
fi
exit ${CODE}
fi

if [[ "${1}" != "run-systemd" ]]; then
echo "unsupported: ${1}"
exit 1
fi

if systemctl is-active -q cockroach; then
echo "cockroach service already active"
echo "To get more information: systemctl status cockroach"
exit 1
fi

# If cockroach failed, the service still exists; we need to clean it up before
# we can start it again.
sudo systemctl reset-failed cockroach 2>/dev/null || true

# The first time we run, install a small script that shows some helpful
# information when we ssh in.
if [ ! -e ${HOME}/.profile-cockroach ]; then
cat > ${HOME}/.profile-cockroach <<'EOQ'
echo ""
if systemctl is-active -q cockroach; then
echo "cockroach is running; see: systemctl status cockroach"
elif systemctl is-failed -q cockroach; then
echo "cockroach stopped; see: systemctl status cockroach"
else
echo "cockroach not started"
fi
echo ""
EOQ
echo ". ${HOME}/.profile-cockroach" >> ${HOME}/.profile
fi

# We run this script (with arg "run") as a service unit. We do not use --user
# because memory limiting doesn't work in that mode. Instead we pass the uid and
# gid that the process will run under.
# The "notify" service type means that systemd-run waits until cockroach
# notifies systemd that it is ready; NotifyAccess=all is needed because this
# notification doesn't come from the main PID (which is bash).
sudo systemd-run --unit cockroach \
--same-dir --uid $(id -u) --gid $(id -g) \
--service-type=notify -p NotifyAccess=all \
-p MemoryMax={{.MemoryMax}} \
bash $0 run
EOF
`)
if err != nil {
return "", err
}

args, err := h.generateStartArgs(nodeIdx, extraArgs, vers)
if err != nil {
return "", err
Expand All @@ -390,35 +472,28 @@ func (h *crdbInstallHelper) generateStartCmd(
} else {
startCmd = "start"
}

nodes := h.c.ServerNodes()
logDir := h.c.Impl.LogDir(h.c, nodes[nodeIdx])
binary := cockroachNodeBinary(h.c, nodes[nodeIdx])
keyCmd := h.generateKeyCmd(nodeIdx, extraArgs)

// NB: this is awkward as when the process fails, the test runner will show an
// unhelpful empty error (since everything has been redirected away). This is
// unfortunately equally awkward to address.
cmd := fmt.Sprintf(`
ulimit -c unlimited; mkdir -p %[1]s;
echo ">>> roachprod start: $(date)" >> %[1]s/roachprod.log;
ps axeww -o pid -o command >> %[1]s/roachprod.log;
[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> %[1]s/roachprod.log; %[2]s
export ROACHPROD=%[3]d%[4]s;
GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 %[5]s \
%[6]s %[7]s %[8]s >> %[1]s/cockroach.stdout.log \
2>> %[1]s/cockroach.stderr.log \
|| (x=$?; cat %[1]s/cockroach.stderr.log; exit $x)`,
logDir, // [1]
keyCmd, // [2]
nodes[nodeIdx], // [3]
h.c.Tag, // [4]
h.getEnvVars(), // [5]
binary, // [6]
startCmd, // [7]
strings.Join(args, " "), // [8]
)
return cmd, nil
var buf strings.Builder
if err := tpl.Execute(&buf, struct {
LogDir, KeyCmd, Tag, EnvVars, Binary, StartCmd, Args, MemoryMax string
NodeNum int
Local bool
}{
LogDir: h.c.Impl.LogDir(h.c, nodes[nodeIdx]),
KeyCmd: h.generateKeyCmd(nodeIdx, extraArgs),
Tag: h.c.Tag,
EnvVars: "GOTRACEBACK=crash COCKROACH_SKIP_ENABLING_DIAGNOSTIC_REPORTING=1 " + h.getEnvVars(),
Binary: cockroachNodeBinary(h.c, nodes[nodeIdx]),
StartCmd: startCmd,
Args: strings.Join(args, " "),
MemoryMax: config.MemoryMax,
NodeNum: nodes[nodeIdx],
Local: h.c.IsLocal(),
}); err != nil {
return "", err
}

return buf.String(), nil
}

func (h *crdbInstallHelper) generateStartArgs(
Expand All @@ -427,7 +502,6 @@ func (h *crdbInstallHelper) generateStartArgs(
var args []string
nodes := h.c.ServerNodes()

args = append(args, "--background")
if h.c.Secure {
args = append(args, "--certs-dir="+h.c.Impl.CertsDir(h.c, nodes[nodeIdx]))
} else {
Expand Down