Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for cgroup namespace #1916

Merged
merged 2 commits into from
Nov 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libcontainer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ config := &configs.Config{
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
{Type: configs.NEWCGROUP},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Expand Down
21 changes: 11 additions & 10 deletions libcontainer/SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,17 @@ Minimum requirements:

### Namespaces

| Flag | Enabled |
| ------------ | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |

Namespaces are created for the container via the `clone` syscall.
| Flag | Enabled |
| --------------- | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |
| CLONE_NEWCGROUP | 1 |

Namespaces are created for the container via the `unshare` syscall.


### Filesystem
Expand Down
8 changes: 4 additions & 4 deletions libcontainer/cgroups/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
)

const (
cgroupNamePrefix = "name="
CgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)

Expand Down Expand Up @@ -156,8 +156,8 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
continue
}
ss[opt] = true
if strings.HasPrefix(opt, cgroupNamePrefix) {
opt = opt[len(cgroupNamePrefix):]
if strings.HasPrefix(opt, CgroupNamePrefix) {
opt = opt[len(CgroupNamePrefix):]
}
m.Subsystems = append(m.Subsystems, opt)
numFound++
Expand Down Expand Up @@ -343,7 +343,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err
return p, nil
}

if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok {
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
return p, nil
}

Expand Down
16 changes: 10 additions & 6 deletions libcontainer/configs/namespaces_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ import (
)

const (
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
)

var (
Expand All @@ -35,6 +36,8 @@ func NsName(ns NamespaceType) string {
return "user"
case NEWUTS:
return "uts"
case NEWCGROUP:
return "cgroup"
}
return ""
}
Expand Down Expand Up @@ -68,6 +71,7 @@ func NamespaceTypes() []NamespaceType {
NEWNET,
NEWPID,
NEWNS,
NEWCGROUP,
}
}

Expand Down
13 changes: 7 additions & 6 deletions libcontainer/configs/namespaces_syscall.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ func (n *Namespace) Syscall() int {
}

var namespaceInfo = map[NamespaceType]int{
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
}

// CloneFlags parses the container's Namespaces options to set the correct
Expand Down
12 changes: 12 additions & 0 deletions libcontainer/configs/validate/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.cgroupnamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
Expand Down Expand Up @@ -116,6 +119,15 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
return nil
}

func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWCGROUP) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
return fmt.Errorf("cgroup namespaces aren't enabled in the kernel")
}
}
return nil
}

// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
Expand Down
1 change: 0 additions & 1 deletion libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -1745,7 +1745,6 @@ func (c *linuxContainer) currentState() (*State, error) {
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}

for _, ns := range configs.NamespaceTypes() {

// Remove namespaces that we don't need to join.
Expand Down
57 changes: 57 additions & 0 deletions libcontainer/integration/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1776,3 +1776,60 @@ func TestTmpfsCopyUp(t *testing.T) {
t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
}
}

func TestCGROUPPrivate(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
t.Skip("cgroupns is unsupported")
}
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)

config := newTemplateConfig(rootfs)
config.Namespaces.Add(configs.NEWCGROUP, "")
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
}
}

func TestCGROUPHost(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
t.Skip("cgroupns is unsupported")
}
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)

config := newTemplateConfig(rootfs)
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
}
}
63 changes: 39 additions & 24 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ enum sync_t {
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};

/*
* Synchronisation value for cgroup namespace setup.
* The same constant is defined in process_linux.go as "createCgroupns".
*/
#define CREATECGROUPNS 0x80

/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
Expand Down Expand Up @@ -640,7 +646,6 @@ void nsexec(void)
case JUMP_PARENT:{
int len;
pid_t child, first_child = -1;
char buf[JSON_MAX];
bool ready = false;

/* For debugging. */
Expand Down Expand Up @@ -716,6 +721,18 @@ void nsexec(void)
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}

/* Send the init_func pid back to our parent.
*
* Send the init_func pid and the pid of the first child back to our parent.
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
crosbymichael marked this conversation as resolved.
Show resolved Hide resolved
bail("unable to generate JSON for child pid");
}
}
break;
case SYNC_CHILD_READY:
Expand Down Expand Up @@ -759,23 +776,6 @@ void nsexec(void)
bail("unexpected sync value: %u", s);
}
}

/*
* Send the init_func pid and the pid of the first child back to our parent.
*
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}

exit(0);
}

Expand Down Expand Up @@ -862,14 +862,17 @@ void nsexec(void)
if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace");
}

/*
* Unshare all of the namespaces. Note that we don't merge this
* with clone() because there were some old kernel versions where
* clone(CLONE_PARENT | CLONE_NEWPID) was broken, so we'll just do
* it the long way.
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare namespaces");

/*
Expand Down Expand Up @@ -958,6 +961,18 @@ void nsexec(void)
bail("setgroups failed");
}

/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
if (config.cloneflags & CLONE_NEWCGROUP) {
uint8_t value;
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
bail("read synchronisation value failed");
if (value == CREATECGROUPNS) {
if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace");
} else
bail("received unknown synchronisation value");
}

s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)");
Expand Down
Loading