Skip to content

Commit

Permalink
runc-dmz: reduce memfd binary cloning cost with small C binary
Browse files Browse the repository at this point in the history
The idea is to remove the need for cloning the entire runc binary by
replacing the final execve() call of the container process with an
execve() call to a clone of a small C binary which just does an execve()
of its arguments.

This provides similar protection against CVE-2019-5736 but without
requiring a >10MB binary copy for each "runc init". When compiled with
musl, runc-dmz is 13kB (though unfortunately with glibc, it is 1.1MB
which is still quite large).

It should be noted that there is still a window where the container
processes could get access to the host runc binary, but because we set
ourselves as non-dumpable the container would need CAP_SYS_PTRACE (which
is not enabled by default in Docker) in order to get around the
proc_fd_access_allowed() checks. In addition, since Linux 4.10[1] the
kernel blocks access entirely for user namespaced containers in this
scenario. For those cases we cannot use runc-dmz, but most containers
won't have this issue.

This new runc-dmz binary can be opted out of at compile time by setting
the "runc_nodmz" buildtag, and at runtime by setting the RUNC_DMZ=legacy
environment variable. In both cases, runc will fall back to the classic
/proc/self/exe-based cloning trick. If /proc/self/exe is already a
sealed memfd (namely if the user is using contrib/cmd/memfd-bind to
create a persistent sealed memfd for runc), neither runc-dmz nor
/proc/self/exe cloning will be used because they are not necessary.

[1]: torvalds/linux@bfedb58

Co-authored-by: lifubang <[email protected]>
Signed-off-by: lifubang <[email protected]>
[cyphar: address various review nits]
[cyphar: fix runc-dmz cross-compilation]
[cyphar: embed runc-dmz into runc binary and clone in Go code]
[cyphar: make runc-dmz optional, with fallback to /proc/self/exe cloning]
[cyphar: do not use runc-dmz when the container has certain privs]
Co-authored-by: Aleksa Sarai <[email protected]>
Signed-off-by: Aleksa Sarai <[email protected]>
  • Loading branch information
lifubang and cyphar committed Sep 22, 2023
1 parent e089db3 commit dac4171
Show file tree
Hide file tree
Showing 20 changed files with 608 additions and 25 deletions.
17 changes: 16 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,18 @@ jobs:
rootless: ["rootless", ""]
race: ["-race", ""]
criu: ["", "criu-dev"]
dmz: ["", "runc_nodmz"]
exclude:
- criu: criu-dev
rootless: rootless
- criu: criu-dev
go-version: 1.20.x
- criu: criu-dev
race: -race
- dmz: runc_nodmz
criu: criu-dev
- dmz: runc_nodmz
os: ubuntu-20.04
runs-on: ${{ matrix.os }}

steps:
Expand Down Expand Up @@ -71,6 +76,8 @@ jobs:
go-version: ${{ matrix.go-version }}

- name: build
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all

- name: install bats
Expand All @@ -80,6 +87,8 @@ jobs:

- name: unit test
if: matrix.rootless != 'rootless'
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest

- name: add rootless user
Expand Down Expand Up @@ -113,8 +122,12 @@ jobs:
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
# We are not interested in providing official support for i386.
cross-i386:
runs-on: ubuntu-22.04
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
dmz: ["", "runc_nodmz"]
runs-on: ubuntu-22.04

steps:

Expand All @@ -136,4 +149,6 @@ jobs:
go-version: 1.x # Latest stable

- name: unit test
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
1 change: 1 addition & 0 deletions .golangci-extra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
run:
build-tags:
- seccomp
- runc_nodmz

linters:
disable-all: true
Expand Down
1 change: 1 addition & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
run:
build-tags:
- seccomp
- runc_nodmz

linters:
enable:
Expand Down
35 changes: 30 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
SHELL = /bin/bash

CONTAINER_ENGINE := docker
GO ?= go

# Get CC values for cross-compilation.
include cc_platform.mk

PREFIX ?= /usr/local
BINDIR := $(PREFIX)/sbin
MANDIR := $(PREFIX)/share/man
Expand All @@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
PROJECT := github.com/opencontainers/runc
BUILDTAGS ?= seccomp urfave_cli_no_docs
BUILDTAGS += $(EXTRA_BUILDTAGS)

COMMIT ?= $(shell git describe --dirty --long --always)
VERSION := $(shell cat ./VERSION)
Expand Down Expand Up @@ -57,16 +63,23 @@ endif

.DEFAULT: runc

runc:
# Build the runc binary. runc-dmz is a prerequisite, so the helper binary is
# regenerated before the Go build runs. Afterwards the verify-dmz-arch target
# is run to check the freshly built runc against the helper.
runc: runc-dmz
	$(GO_BUILD) -o runc .
	@# Recurse via $(MAKE), not a literal "make": this reuses the same make
	@# program and keeps -j (jobserver) and -n working in the sub-make.
	$(MAKE) verify-dmz-arch

all: runc recvtty sd-helper seccompagent fs-idmap

recvtty sd-helper seccompagent fs-idmap:
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@

static:
# Build a statically linked runc binary. runc-dmz is a prerequisite, so the
# helper binary is regenerated first. Afterwards the verify-dmz-arch target is
# run to check the freshly built runc against the helper.
static: runc-dmz
	$(GO_BUILD_STATIC) -o runc .
	@# Recurse via $(MAKE), not a literal "make": this reuses the same make
	@# program and keeps -j (jobserver) and -n working in the sub-make.
	$(MAKE) verify-dmz-arch

# Regenerate the runc-dmz helper binary. The target is phony, so this runs on
# every invocation: any existing libcontainer/dmz/runc-dmz is deleted first and
# "go generate" is then run over ./libcontainer/dmz with the current BUILDTAGS.
.PHONY: runc-dmz
runc-dmz:
rm -f libcontainer/dmz/runc-dmz
$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz

releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
releaseall: release
Expand Down Expand Up @@ -147,12 +160,12 @@ install-man: man
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8

# Remove build outputs: the runc binaries, the generated runc-dmz helper, the
# contrib/cmd tools, the release directory and the generated man pages.
# NOTE(review): release/ is removed with sudo, presumably because it can
# contain root-owned files -- confirm against how release builds are run.
clean:
rm -f runc runc-*
rm -f runc runc-* libcontainer/dmz/runc-dmz
rm -f contrib/cmd/recvtty/recvtty
rm -f contrib/cmd/sd-helper/sd-helper
rm -f contrib/cmd/seccompagent/seccompagent
rm -f contrib/cmd/fs-idmap/fs-idmap
rm -rf release
sudo rm -rf release
rm -rf man/man8

cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
Expand Down Expand Up @@ -188,6 +201,18 @@ verify-dependencies: vendor
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
&& echo "all vendor files are up to date."
# Check that the runc binary and the runc-dmz helper were built for the same
# target, by comparing the "Machine" and "Flags" lines of their ELF headers
# as reported by readelf.
#
# - If libcontainer/dmz/runc-dmz is missing or empty the check is skipped and
#   the recipe succeeds.
# - Otherwise both header excerpts are printed, then compared with diff; any
#   difference makes the recipe fail.
# - LC_ALL=C forces the C locale, presumably so the field labels that grep
#   matches are not localized.
# - The <(...) process substitutions require bash as the recipe shell.
verify-dmz-arch:
@test -s libcontainer/dmz/runc-dmz || exit 0; \
set -Eeuo pipefail; \
export LC_ALL=C; \
echo "readelf -h runc"; \
readelf -h runc | grep -E "(Machine|Flags):"; \
echo "readelf -h libcontainer/dmz/runc-dmz"; \
readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
diff -u \
<(readelf -h runc | grep -E "(Machine|Flags):") \
<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
&& echo "runc-dmz architecture matches runc binary."

validate-keyring:
script/keyring_validate.sh
Expand All @@ -197,4 +222,4 @@ validate-keyring:
test localtest unittest localunittest integration localintegration \
rootlessintegration localrootlessintegration shell install install-bash \
install-man clean cfmt shfmt localshfmt shellcheck \
vendor verify-changelog verify-dependencies validate-keyring
vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,10 @@ e.g. to disable seccomp:
make BUILDTAGS=""
```

| Build Tag | Feature | Enabled by default | Dependency |
|-----------|------------------------------------|--------------------|------------|
| seccomp | Syscall filtering | yes | libseccomp |
| Build Tag | Feature | Enabled by Default | Dependencies |
|---------------|---------------------------------------|--------------------|---------------------|
| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||

The following build tags were used earlier, but are now obsoleted:
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
Expand Down
61 changes: 61 additions & 0 deletions cc_platform.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# NOTE: Make sure you keep this file in sync with scripts/lib.sh.
#
# Purpose: pick the C toolchain matching the requested GOARCH. The file
# computes a toolchain prefix (HOST), may prepend flags to CFLAGS, and then
# sets and exports CC and STRIP based on that prefix.

# GO and GOARCH can be supplied by the caller; GOARCH otherwise falls back to
# whatever "go env GOARCH" reports.
GO ?= go
GOARCH ?= $(shell $(GO) env GOARCH)

# Detect SUSE-like distributions by looking for "suse" in the ID_LIKE field of
# /etc/os-release. PLATFORM is the middle part of the toolchain prefix.
ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
# openSUSE has a custom PLATFORM
PLATFORM ?= suse-linux
IS_SUSE := 1
else
PLATFORM ?= linux-gnu
endif

# The comparison value is obtained with GOARCH cleared in the sub-shell, so
# that "go env" reports the toolchain's default architecture rather than the
# requested one. If the two match, this is not a cross build.
ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
# use the native CC and STRIP
HOST :=
else ifeq ($(GOARCH),386)
# Always use the 64-bit compiler to build the 386 binary, which works for
# the more common cross-build method for x86 (namely, the equivalent of
# dpkg --add-architecture).
ifdef IS_SUSE
# There is no x86_64-suse-linux-gcc, so use the native one.
HOST :=
CPU_TYPE := i586
else
HOST := x86_64-$(PLATFORM)-
CPU_TYPE := i686
endif
# Prepend the 32-bit flags while keeping any CFLAGS already set.
CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
else ifeq ($(GOARCH),amd64)
ifdef IS_SUSE
# There is no x86_64-suse-linux-gcc, so use the native one.
HOST :=
else
HOST := x86_64-$(PLATFORM)-
endif
else ifeq ($(GOARCH),arm64)
HOST := aarch64-$(PLATFORM)-
else ifeq ($(GOARCH),arm)
# HOST already configured by release_build.sh in this case.
else ifeq ($(GOARCH),armel)
HOST := arm-$(PLATFORM)eabi-
else ifeq ($(GOARCH),armhf)
HOST := arm-$(PLATFORM)eabihf-
else ifeq ($(GOARCH),ppc64le)
HOST := powerpc64le-$(PLATFORM)-
else ifeq ($(GOARCH),riscv64)
HOST := riscv64-$(PLATFORM)-
else ifeq ($(GOARCH),s390x)
HOST := s390x-$(PLATFORM)-
else
# Any other cross-build GOARCH aborts makefile parsing.
$(error Unsupported GOARCH $(GOARCH))
endif

# $(origin CC) is "undefined" or "default" only when nobody (environment,
# command line, or a makefile) has set CC; an explicit CC is left untouched.
ifeq ($(origin CC),$(filter $(origin CC),undefined default))
# Override CC if it's undefined or just the default value set by Make.
CC := $(HOST)gcc
export CC
endif
# STRIP follows the same prefix unless already set; it is exported for
# sub-processes.
STRIP ?= $(HOST)strip
export STRIP
98 changes: 87 additions & 11 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/opencontainers/runc/libcontainer/dmz"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/system/kernelversion"
"github.com/opencontainers/runc/libcontainer/utils"
)

Expand Down Expand Up @@ -444,6 +445,48 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
return nil
}

// slicesContains reports whether needle is equal to at least one element of
// slice. A nil or empty slice never contains anything.
//
// No longer needed in Go 1.21, whose standard library provides
// slices.Contains.
func slicesContains[S ~[]E, E comparable](slice S, needle E) bool {
	for idx := 0; idx < len(slice); idx++ {
		if slice[idx] == needle {
			return true
		}
	}
	return false
}

// isDmzBinarySafe reports whether the given container configuration allows
// runc-dmz to be used in place of a /proc/self/exe clone.
//
// It returns true when the configuration has no capability set, when
// CAP_SYS_PTRACE is absent from the bounding, inheritable and ambient sets,
// or when the kernel is at least Linux 4.10 and the container uses a user
// namespace. In every other case (including a failed kernel version check)
// it returns false.
func isDmzBinarySafe(c *configs.Config) bool {
	const ptraceCap = "CAP_SYS_PTRACE"

	// "runc init" is made non-dumpable in nsexec, so a container process can
	// only race against it and get past the ptrace_may_access() checks if it
	// could somehow hold CAP_SYS_PTRACE, i.e. the capability appears in the
	// bounding, inheritable, or ambient sets. Most containers do not have it.
	caps := c.Capabilities
	if caps == nil {
		return true
	}
	mayHavePtrace := slicesContains(caps.Bounding, ptraceCap) ||
		slicesContains(caps.Inheritable, ptraceCap) ||
		slicesContains(caps.Ambient, ptraceCap)
	if !mayHavePtrace {
		return true
	}

	// Since Linux 4.10 (see bfedb589252c0) a user namespaced container cannot
	// access /proc/$pid/exe of runc once runc has joined the namespace (until
	// it execs), whatever its capability set. Distribution kernels may carry
	// a backport of this, but that cannot be detected cheaply, so only the
	// version number is trusted. Anything else is assumed to be unsafe.
	minVersion := kernelversion.KernelVersion{Kernel: 4, Major: 10}
	atLeast410, err := kernelversion.GreaterEqualThan(minVersion)
	if err != nil || !atLeast410 {
		return false
	}
	return c.Namespaces.Contains(configs.NEWUSER)
}

func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
if err != nil {
Expand All @@ -457,27 +500,54 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
}
logFilePair := filePair{parentLogPipe, childLogPipe}

// Make sure we use a new safe copy of /proc/self/exe each time this is
// called, to make sure that if a container manages to overwrite the file
// it cannot affect other containers on the system. For runc, this code
// will only ever be called once, but libcontainer users might call this
// more than once.
// Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
// binary each time this is called, to make sure that if a container
// manages to overwrite the file it cannot affect other containers on the
// system. For runc, this code will only ever be called once, but
// libcontainer users might call this more than once.
p.closeClonedExes()
var (
exePath string
safeExe *os.File
// only one of dmzExe or safeExe are used at a time
dmzExe, safeExe *os.File
)
if dmz.IsSelfExeCloned() {
// /proc/self/exe is already a cloned binary -- no need to do anything
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
exePath = "/proc/self/exe"
} else {
safeExe, err = dmz.CloneSelfExe(c.root)
if err != nil {
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
var err error
if isDmzBinarySafe(c.config) {
dmzExe, err = dmz.Binary(c.root)
if err == nil {
// We can use our own executable without cloning if we are using
// runc-dmz.
exePath = "/proc/self/exe"
p.clonedExes = append(p.clonedExes, dmzExe)
} else if errors.Is(err, dmz.ErrNoDmzBinary) {
logrus.Debug("runc-dmz binary not embedded in runc binary, falling back to /proc/self/exe clone")
} else if err != nil {
return nil, fmt.Errorf("failed to create runc-dmz binary clone: %w", err)
}
} else {
// If the configuration makes it unsafe to use runc-dmz, pretend we
// don't have it embedded so we do /proc/self/exe cloning.
logrus.Debug("container configuration unsafe for runc-dmz, falling back to /proc/self/exe clone")
err = dmz.ErrNoDmzBinary
}
if errors.Is(err, dmz.ErrNoDmzBinary) {
safeExe, err = dmz.CloneSelfExe(c.root)
if err != nil {
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
}
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
p.clonedExes = append(p.clonedExes, safeExe)
}
// Just to make sure we don't run without protection.
if dmzExe == nil && safeExe == nil {
// This should never happen.
return nil, fmt.Errorf("[internal error] attempted to spawn a container with no /proc/self/exe protection")
}
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
p.clonedExes = append(p.clonedExes, safeExe)
}

cmd := exec.Command(exePath, "init")
Expand All @@ -503,6 +573,12 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
"_LIBCONTAINER_STATEDIR="+c.root,
)

if dmzExe != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, dmzExe)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
}

cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
Expand Down
1 change: 1 addition & 0 deletions libcontainer/dmz/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/runc-dmz
6 changes: 6 additions & 0 deletions libcontainer/dmz/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Get CC values for cross-compilation.
include ../../cc_platform.mk

# Compile the runc-dmz helper as a static binary from its single C source
# (hence $< is the whole prerequisite list), then strip the result.
runc-dmz: _dmz.c
	$(CC) $(CFLAGS) -static $< -o $@
	$(STRIP) -gs $@
10 changes: 10 additions & 0 deletions libcontainer/dmz/_dmz.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include <unistd.h>

extern char **environ;

/*
 * Entry point of runc-dmz: re-execute the program named by argv[0], passing
 * the argument vector and the environment through unchanged.
 *
 * Returns 127 when there is no argv[0] to execute. Otherwise the result of
 * execve() is returned, which only happens if the exec itself failed.
 */
int main(int argc, char **argv)
{
	if (argc >= 1)
		return execve(argv[0], argv, environ);

	/* Empty argument vector: nothing to execute. */
	return 127;
}
Loading

0 comments on commit dac4171

Please sign in to comment.