Skip to content

Commit

Permalink
Merge pull request opencontainers#3987 from cyphar/cloned-binary-rework
Browse files Browse the repository at this point in the history
nsexec: cloned binary rework
  • Loading branch information
AkihiroSuda authored Sep 24, 2023
2 parents 1d9b158 + 90c8d36 commit f235fa6
Show file tree
Hide file tree
Showing 36 changed files with 1,431 additions and 664 deletions.
17 changes: 16 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,18 @@ jobs:
rootless: ["rootless", ""]
race: ["-race", ""]
criu: ["", "criu-dev"]
dmz: ["", "runc_nodmz"]
exclude:
- criu: criu-dev
rootless: rootless
- criu: criu-dev
go-version: 1.20.x
- criu: criu-dev
race: -race
- dmz: runc_nodmz
criu: criu-dev
- dmz: runc_nodmz
os: ubuntu-20.04
runs-on: ${{ matrix.os }}

steps:
Expand Down Expand Up @@ -71,6 +76,8 @@ jobs:
go-version: ${{ matrix.go-version }}

- name: build
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all

- name: install bats
Expand All @@ -80,6 +87,8 @@ jobs:

- name: unit test
if: matrix.rootless != 'rootless'
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest

- name: add rootless user
Expand Down Expand Up @@ -113,8 +122,12 @@ jobs:
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
# We are not interested in providing official support for i386.
cross-i386:
runs-on: ubuntu-22.04
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
dmz: ["", "runc_nodmz"]
runs-on: ubuntu-22.04

steps:

Expand All @@ -136,4 +149,6 @@ jobs:
go-version: 1.x # Latest stable

- name: unit test
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
9 changes: 5 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
vendor/pkg
/runc
/runc-*
contrib/cmd/recvtty/recvtty
contrib/cmd/sd-helper/sd-helper
contrib/cmd/seccompagent/seccompagent
contrib/cmd/fs-idmap/fs-idmap
/contrib/cmd/recvtty/recvtty
/contrib/cmd/sd-helper/sd-helper
/contrib/cmd/seccompagent/seccompagent
/contrib/cmd/fs-idmap/fs-idmap
/contrib/cmd/memfd-bind/memfd-bind
man/man8
release
Vagrantfile
Expand Down
1 change: 1 addition & 0 deletions .golangci-extra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
run:
build-tags:
- seccomp
- runc_nodmz

linters:
disable-all: true
Expand Down
1 change: 1 addition & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
run:
build-tags:
- seccomp
- runc_nodmz

linters:
enable:
Expand Down
20 changes: 12 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,15 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
&& echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
&& dpkg --add-architecture i386 \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential \
criu \
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
gcc-arm-linux-gnueabi libc-dev-armel-cross \
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
gcc-s390x-linux-gnu libc-dev-s390x-cross \
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
gcc \
gcc-multilib \
curl \
gawk \
gcc \
gperf \
iptables \
jq \
Expand All @@ -32,6 +28,14 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
sudo \
uidmap \
iproute2 \
&& apt-get install -y --no-install-recommends \
libc-dev:i386 libgcc-s1:i386 \
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
gcc-arm-linux-gnueabi libc-dev-armel-cross \
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
gcc-s390x-linux-gnu libc-dev-s390x-cross \
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
&& apt-get clean \
&& rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list

Expand All @@ -54,7 +58,7 @@ RUN cd /tmp \
ARG LIBSECCOMP_VERSION
COPY script/seccomp.sh script/lib.sh /tmp/script/
RUN mkdir -p /opt/libseccomp \
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp 386 amd64 arm64 armel armhf ppc64le riscv64 s390x
ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
Expand Down
44 changes: 35 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
SHELL = /bin/bash

CONTAINER_ENGINE := docker
GO ?= go

# Get CC values for cross-compilation.
include cc_platform.mk

PREFIX ?= /usr/local
BINDIR := $(PREFIX)/sbin
MANDIR := $(PREFIX)/share/man
Expand All @@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
PROJECT := github.com/opencontainers/runc
BUILDTAGS ?= seccomp urfave_cli_no_docs
BUILDTAGS += $(EXTRA_BUILDTAGS)

COMMIT ?= $(shell git describe --dirty --long --always)
VERSION := $(shell cat ./VERSION)
Expand Down Expand Up @@ -57,18 +63,25 @@ endif

.DEFAULT: runc

runc:
# Build the runc binary. The runc-dmz prerequisite is regenerated first; after
# the Go build, verify-dmz-arch is run as a sub-make. $(MAKE) is used rather
# than a literal "make" so the sub-make inherits the parent's flags (-n, the
# -j jobserver, command-line variable overrides) and the same make binary.
runc: runc-dmz
	$(GO_BUILD) -o runc .
	$(MAKE) verify-dmz-arch

all: runc recvtty sd-helper seccompagent fs-idmap
all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind

recvtty sd-helper seccompagent fs-idmap:
recvtty sd-helper seccompagent fs-idmap memfd-bind:
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@

static:
# Build runc with $(GO_BUILD_STATIC). The runc-dmz prerequisite is regenerated
# first; after the Go build, verify-dmz-arch is run as a sub-make. $(MAKE) is
# used rather than a literal "make" so the sub-make inherits the parent's
# flags (-n, the -j jobserver, command-line variable overrides).
static: runc-dmz
	$(GO_BUILD_STATIC) -o runc .
	$(MAKE) verify-dmz-arch

# Always-run (phony) step: drop any existing libcontainer/dmz/runc-dmz and
# invoke "go generate" on ./libcontainer/dmz with the active build tags.
# NOTE(review): presumably the generate step is what (re)creates the runc-dmz
# binary when the tags allow it -- confirm against libcontainer/dmz.
.PHONY: runc-dmz
runc-dmz:
	rm -f libcontainer/dmz/$@
	$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz

releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
releaseall: release

release: runcimage
Expand Down Expand Up @@ -147,12 +160,13 @@ install-man: man
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8

clean:
rm -f runc runc-*
rm -f runc runc-* libcontainer/dmz/runc-dmz
rm -f contrib/cmd/fs-idmap/fs-idmap
rm -f contrib/cmd/recvtty/recvtty
rm -f contrib/cmd/sd-helper/sd-helper
rm -f contrib/cmd/seccompagent/seccompagent
rm -f contrib/cmd/fs-idmap/fs-idmap
rm -rf release
rm -f contrib/cmd/memfd-bind/memfd-bind
sudo rm -rf release
rm -rf man/man8

cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
Expand Down Expand Up @@ -188,6 +202,18 @@ verify-dependencies: vendor
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
&& echo "all vendor files are up to date."
# Check that the runc binary and libcontainer/dmz/runc-dmz target the same
# architecture.
#
# The whole recipe is a single backslash-continued shell command:
#   * if libcontainer/dmz/runc-dmz is missing or empty (test -s fails), the
#     check is skipped and the recipe succeeds;
#   * otherwise the "Machine" and "Flags" lines of each binary's ELF header
#     (readelf -h) are printed, then compared with diff -u;
#   * a difference (or any failing command, due to set -Eeuo pipefail) makes
#     the recipe fail; on a match a confirmation message is printed.
# LC_ALL=C is exported before readelf runs so the grep for the English
# "Machine"/"Flags" labels is not affected by the user's locale.
# The <( ... ) process substitutions require bash as the recipe shell.
verify-dmz-arch:
@test -s libcontainer/dmz/runc-dmz || exit 0; \
set -Eeuo pipefail; \
export LC_ALL=C; \
echo "readelf -h runc"; \
readelf -h runc | grep -E "(Machine|Flags):"; \
echo "readelf -h libcontainer/dmz/runc-dmz"; \
readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
diff -u \
<(readelf -h runc | grep -E "(Machine|Flags):") \
<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
&& echo "runc-dmz architecture matches runc binary."

validate-keyring:
script/keyring_validate.sh
Expand All @@ -197,4 +223,4 @@ validate-keyring:
test localtest unittest localunittest integration localintegration \
rootlessintegration localrootlessintegration shell install install-bash \
install-man clean cfmt shfmt localshfmt shellcheck \
vendor verify-changelog verify-dependencies validate-keyring
vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,18 @@ e.g. to disable seccomp:
make BUILDTAGS=""
```

| Build Tag | Feature | Enabled by default | Dependency |
|-----------|------------------------------------|--------------------|------------|
| seccomp | Syscall filtering | yes | libseccomp |
| Build Tag | Feature | Enabled by Default | Dependencies |
|---------------|---------------------------------------|--------------------|---------------------|
| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increase memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||

The following build tags were used earlier, but are now obsoleted:
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)

[contrib-memfd-bind]: /contrib/cmd/memfd-bind/README.md

### Running the test suite

`runc` currently supports running its test suite via Docker.
Expand Down
61 changes: 61 additions & 0 deletions cc_platform.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Cross-compilation settings: derives the toolchain prefix (HOST) from GOARCH
# and uses it to choose CC and STRIP, both of which are exported to recipes.
#
# NOTE: Make sure you keep this file in sync with script/lib.sh.

GO ?= go
# Resolve GOARCH once, and only when the caller has not already provided it
# (environment, command line or an including makefile). A recursive "?="
# would re-run "go env" every time $(GOARCH) is expanded in the chain below.
ifeq ($(origin GOARCH),undefined)
GOARCH := $(shell $(GO) env GOARCH)
endif

# Detect SUSE-like hosts. stderr is discarded so that hosts without
# /etc/os-release quietly fall through to the linux-gnu default.
ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release 2>/dev/null),)
  # openSUSE has a custom PLATFORM
  PLATFORM ?= suse-linux
  IS_SUSE := 1
else
  PLATFORM ?= linux-gnu
endif

# The right-hand side queries the Go toolchain's GOARCH with the GOARCH
# environment variable cleared; a match means we are not cross-compiling.
ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
  # use the native CC and STRIP
  HOST :=
else ifeq ($(GOARCH),386)
  # Always use the 64-bit compiler to build the 386 binary, which works for
  # the more common cross-build method for x86 (namely, the equivalent of
  # dpkg --add-architecture).
  ifdef IS_SUSE
    # There is no x86_64-suse-linux-gcc, so use the native one.
    HOST :=
    CPU_TYPE := i586
  else
    HOST := x86_64-$(PLATFORM)-
    CPU_TYPE := i686
  endif
  # Prepend the 32-bit flags so any caller-supplied CFLAGS still follow them.
  CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
else ifeq ($(GOARCH),amd64)
  ifdef IS_SUSE
    # There is no x86_64-suse-linux-gcc, so use the native one.
    HOST :=
  else
    HOST := x86_64-$(PLATFORM)-
  endif
else ifeq ($(GOARCH),arm64)
  HOST := aarch64-$(PLATFORM)-
else ifeq ($(GOARCH),arm)
  # HOST already configured by release_build.sh in this case.
else ifeq ($(GOARCH),armel)
  HOST := arm-$(PLATFORM)eabi-
else ifeq ($(GOARCH),armhf)
  HOST := arm-$(PLATFORM)eabihf-
else ifeq ($(GOARCH),ppc64le)
  HOST := powerpc64le-$(PLATFORM)-
else ifeq ($(GOARCH),riscv64)
  HOST := riscv64-$(PLATFORM)-
else ifeq ($(GOARCH),s390x)
  HOST := s390x-$(PLATFORM)-
else
  $(error Unsupported GOARCH $(GOARCH))
endif

ifeq ($(origin CC),$(filter $(origin CC),undefined default))
  # Override CC if it's undefined or just the default value set by Make.
  CC := $(HOST)gcc
  export CC
endif
# STRIP keeps any value the caller supplied; otherwise it uses the same
# toolchain prefix as CC.
STRIP ?= $(HOST)strip
export STRIP
67 changes: 67 additions & 0 deletions contrib/cmd/memfd-bind/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
## memfd-bind ##

`runc` normally has to make a binary copy of itself (or of a smaller helper
binary called `runc-dmz`) when constructing a container process in order to
defend against certain container runtime attacks such as CVE-2019-5736.

This cloned binary only exists until the container process starts (this means
for `runc run` and `runc exec`, it only exists for a few hundred milliseconds
-- for `runc create` it exists until `runc start` is called). However, because
the clone is done using a memfd (or by creating files in directories that are
likely to be a `tmpfs`), this can lead to temporary increases in *host* memory
usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory
controller enabled and the (deprecated) `memory.move_charge_at_immigrate`
enabled, there is no effect on the container's memory.

However, for certain configurations this can still be undesirable. This daemon
allows you to create a sealed memfd copy of the `runc` binary, which will cause
`runc` to skip all binary copying, resulting in no additional memory usage for
each container process (instead there is a single in-memory copy of the
binary). It should be noted that (strictly speaking) this is slightly less
secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities,
but for most users the security benefit is identical.

The provided `memfd-bind@.service` file can be used to get systemd to manage
this daemon. You can supply the path like so:

```
% systemctl start memfd-bind@/usr/bin/runc
```

Thus, there are three ways of protecting against CVE-2019-5736, listed in order
of how much memory they can use:

* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about
10MB), regardless of how many containers are running.

* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and
1MB in size, and a copy is created once per process spawned inside a
container by runc (both the pid1 and every `runc exec`). There are
circumstances where using `runc-dmz` will fail in ways that runc cannot
predict ahead of time (such as restrictive LSMs applied to containers), in
which case users can disable it with the `RUNC_DMZ=legacy` setting.
`runc-dmz` also requires an additional `execve` over the other options,
though since the binary is so small the cost is probably not even noticeable.

* The classic method of making a copy of the entire `runc` binary during
container process setup takes up about 10MB per process spawned inside the
container by runc (both pid1 and `runc exec`).

### Caveats ###

There are several downsides with using `memfd-bind` on the `runc` binary:

* The `memfd-bind` process needs to continue to run indefinitely in order for
the memfd reference to stay alive. If the process is forcefully killed, the
bind-mount on top of the `runc` binary will become stale and nobody will be
able to execute it (you can use `memfd-bind --cleanup` to clean up the stale
mount).

* Only root can execute the cloned binary due to permission restrictions on
accessing other processes' files. More specifically, only users with ptrace
privileges over the memfd-bind daemon can access the file (but in practice
this is usually only root).

* When updating `runc`, the daemon needs to be stopped before the update (so
the package manager can access the underlying file) and then restarted after
the update.
Loading

0 comments on commit f235fa6

Please sign in to comment.