forked from opencontainers/runc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request opencontainers#3987 from cyphar/cloned-binary-rework
nsexec: cloned binary rework
- Loading branch information
Showing
36 changed files
with
1,431 additions
and
664 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
run: | ||
build-tags: | ||
- seccomp | ||
- runc_nodmz | ||
|
||
linters: | ||
disable-all: true | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
run: | ||
build-tags: | ||
- seccomp | ||
- runc_nodmz | ||
|
||
linters: | ||
enable: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# NOTE: Make sure you keep this file in sync with scripts/lib.sh. | ||
|
||
# Go toolchain command to invoke; only assigned here when the caller has not | ||
# already provided GO. | ||
GO ?= go | ||
# Target architecture; when not already set, it defaults to whatever | ||
# `go env GOARCH` reports for the GO command chosen above. | ||
GOARCH ?= $(shell $(GO) env GOARCH) | ||
|
||
# Detect SUSE-like distributions: a case-insensitive grep of /etc/os-release | ||
# for an ID_LIKE entry mentioning "suse" prints matching lines, so the result | ||
# is non-empty only when such an entry exists. | ||
# PLATFORM is assigned with ?= in both branches, so a value supplied by the | ||
# caller takes precedence over either default. | ||
ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),) | ||
# openSUSE has a custom PLATFORM | ||
PLATFORM ?= suse-linux | ||
# Marker tested by the `ifdef IS_SUSE` checks in the HOST selection that | ||
# follows; it is left unset on non-SUSE systems. | ||
IS_SUSE := 1 | ||
else | ||
PLATFORM ?= linux-gnu | ||
endif | ||
|
||
# Choose HOST, the prefix prepended to the gcc and strip tool names further | ||
# down in this file, according to the requested GOARCH. | ||
# The first test compares GOARCH against `go env GOARCH` run with the GOARCH | ||
# environment variable emptied -- presumably Go's native architecture -- and | ||
# in that case no prefix is used. | ||
# Branches are checked in order, so a native build never reaches the | ||
# per-architecture cases below. | ||
ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH)) | ||
# use the native CC and STRIP | ||
HOST := | ||
else ifeq ($(GOARCH),386) | ||
# Always use the 64-bit compiler to build the 386 binary, which works for | ||
# the more common cross-build method for x86 (namely, the equivalent of | ||
# dpkg --add-architecture). | ||
ifdef IS_SUSE | ||
# There is no x86_64-suse-linux-gcc, so use the native one. | ||
HOST := | ||
CPU_TYPE := i586 | ||
else | ||
HOST := x86_64-$(PLATFORM)- | ||
CPU_TYPE := i686 | ||
endif | ||
# Prepend the 32-bit code generation flags while keeping any CFLAGS value | ||
# that was already set. | ||
CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS) | ||
else ifeq ($(GOARCH),amd64) | ||
ifdef IS_SUSE | ||
# There is no x86_64-suse-linux-gcc, so use the native one. | ||
HOST := | ||
else | ||
HOST := x86_64-$(PLATFORM)- | ||
endif | ||
else ifeq ($(GOARCH),arm64) | ||
HOST := aarch64-$(PLATFORM)- | ||
else ifeq ($(GOARCH),arm) | ||
# HOST already configured by release_build.sh in this case. | ||
else ifeq ($(GOARCH),armel) | ||
HOST := arm-$(PLATFORM)eabi- | ||
else ifeq ($(GOARCH),armhf) | ||
HOST := arm-$(PLATFORM)eabihf- | ||
else ifeq ($(GOARCH),ppc64le) | ||
HOST := powerpc64le-$(PLATFORM)- | ||
else ifeq ($(GOARCH),riscv64) | ||
HOST := riscv64-$(PLATFORM)- | ||
else ifeq ($(GOARCH),s390x) | ||
HOST := s390x-$(PLATFORM)- | ||
else | ||
# Any other GOARCH value aborts parsing with an explicit error. | ||
$(error Unsupported GOARCH $(GOARCH)) | ||
endif | ||
|
||
# Select the C compiler and strip tool, both built from the HOST prefix. | ||
# $(filter $(origin CC),undefined default) returns the origin of CC only when | ||
# it is one of those two words, so the comparison below holds exactly when CC | ||
# was not set by the user (environment, command line or makefile). | ||
ifeq ($(origin CC),$(filter $(origin CC),undefined default)) | ||
# Override CC if it's undefined or just the default value set by Make. | ||
CC := $(HOST)gcc | ||
# CC is exported only on this path, i.e. when it was overridden here. | ||
export CC | ||
endif | ||
# STRIP is defaulted only when not already set, but is exported either way. | ||
STRIP ?= $(HOST)strip | ||
export STRIP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
## memfd-bind ## | ||
|
||
`runc` normally has to make a binary copy of itself (or of a smaller helper | ||
binary called `runc-dmz`) when constructing a container process in order to | ||
defend against certain container runtime attacks such as CVE-2019-5736. | ||
|
||
This cloned binary only exists until the container process starts (this means | ||
for `runc run` and `runc exec`, it only exists for a few hundred milliseconds | ||
-- for `runc create` it exists until `runc start` is called). However, because | ||
the clone is done using a memfd (or by creating files in directories that are | ||
likely to be a `tmpfs`), this can lead to temporary increases in *host* memory | ||
usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory | ||
controller enabled and the (deprecated) `memory.move_charge_at_immigrate` | ||
enabled, there is no effect on the container's memory. | ||
|
||
However, for certain configurations this can still be undesirable. This daemon | ||
allows you to create a sealed memfd copy of the `runc` binary, which will cause | ||
`runc` to skip all binary copying, resulting in no additional memory usage for | ||
each container process (instead there is a single in-memory copy of the | ||
binary). It should be noted that (strictly speaking) this is slightly less | ||
secure if you are concerned about Dirty COW-like 0-day kernel vulnerabilities, | ||
but for most users the security benefit is identical. | ||
|
||
The provided `memfd-bind@.service` file can be used to get systemd to manage | ||
this daemon. You can supply the path like so: | ||
|
||
``` | ||
% systemctl start memfd-bind@/usr/bin/runc | ||
``` | ||
|
||
Thus, there are three ways of protecting against CVE-2019-5736, in order of how | ||
much memory they can use: | ||
|
||
* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about | ||
10MB), regardless of how many containers are running. | ||
|
||
* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and | ||
1MB in size, and a copy is created once per process spawned inside a | ||
container by runc (both the pid1 and every `runc exec`). There are | ||
circumstances where using `runc-dmz` will fail in ways that runc cannot | ||
predict ahead of time (such as restrictive LSMs applied to containers), in | ||
which case users can disable it with the `RUNC_DMZ=legacy` setting. | ||
`runc-dmz` also requires an additional `execve` over the other options, | ||
though since the binary is so small the cost is probably not even noticeable. | ||
|
||
* The classic method of making a copy of the entire `runc` binary during | ||
container process setup takes up about 10MB per process spawned inside the | ||
container by runc (both pid1 and `runc exec`). | ||
|
||
### Caveats ### | ||
|
||
There are several downsides with using `memfd-bind` on the `runc` binary: | ||
|
||
* The `memfd-bind` process needs to continue to run indefinitely in order for | ||
the memfd reference to stay alive. If the process is forcefully killed, the | ||
bind-mount on top of the `runc` binary will become stale and nobody will be | ||
able to execute it (you can use `memfd-bind --cleanup` to clean up the stale | ||
mount). | ||
|
||
* Only root can execute the cloned binary due to permission restrictions on | ||
accessing other processes' files. More specifically, only users with ptrace | ||
privileges over the memfd-bind daemon can access the file (but in practice | ||
this is usually only root). | ||
|
||
* When updating `runc`, the daemon needs to be stopped before the update (so | ||
the package manager can access the underlying file) and then restarted after | ||
the update. |
Oops, something went wrong.