Skip to content

Commit

Permalink
Add support for cgroup namespace
Browse files Browse the repository at this point in the history
Signed-off-by: Yuanhong Peng <[email protected]>
  • Loading branch information
Yuanhong Peng committed Nov 14, 2016
1 parent eb411bf commit 6c4f233
Show file tree
Hide file tree
Showing 10 changed files with 167 additions and 22 deletions.
1 change: 1 addition & 0 deletions libcontainer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ config := &configs.Config{
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
{Type: configs.NEWCGROUP},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Expand Down
21 changes: 11 additions & 10 deletions libcontainer/SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,17 @@ Minimum requirements:

### Namespaces

| Flag | Enabled |
| ------------ | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |

Namespaces are created for the container via the `clone` syscall.
| Flag | Enabled |
| --------------- | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |
| CLONE_NEWCGROUP | 1 |

Namespaces are created for the container via the `unshare` syscall.


### Filesystem
Expand Down
8 changes: 4 additions & 4 deletions libcontainer/cgroups/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
)

const (
cgroupNamePrefix = "name="
CgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)

Expand Down Expand Up @@ -161,8 +161,8 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
if !ss[opt] {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
if strings.HasPrefix(opt, CgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(CgroupNamePrefix):])
} else {
m.Subsystems = append(m.Subsystems, opt)
}
Expand Down Expand Up @@ -316,7 +316,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err
return p, nil
}

if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok {
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
return p, nil
}

Expand Down
4 changes: 2 additions & 2 deletions libcontainer/configs/namespaces_syscall.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ func (n *Namespace) Syscall() int {
}

// This is not yet in the Go stdlib.
const syscall_CLONE_NEWCGROUP = (1 << 29)
const Syscall_CLONE_NEWCGROUP = (1 << 25)

var namespaceInfo = map[NamespaceType]int{
NEWNET: syscall.CLONE_NEWNET,
Expand All @@ -18,7 +18,7 @@ var namespaceInfo = map[NamespaceType]int{
NEWIPC: syscall.CLONE_NEWIPC,
NEWUTS: syscall.CLONE_NEWUTS,
NEWPID: syscall.CLONE_NEWPID,
NEWCGROUP: syscall_CLONE_NEWCGROUP,
NEWCGROUP: Syscall_CLONE_NEWCGROUP,
}

// CloneFlags parses the container's Namespaces options to set the correct
Expand Down
1 change: 1 addition & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
configs.NEWCGROUP,
}

// Remove namespaces that we don't need to join.
Expand Down
51 changes: 51 additions & 0 deletions libcontainer/integration/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1660,3 +1660,54 @@ func TestTmpfsCopyUp(t *testing.T) {
t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs)
}
}

func TestCGROUPPrivate(t *testing.T) {
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)

config := newTemplateConfig(rootfs)
config.Namespaces.Add(configs.NEWCGROUP, "")
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
}
}

func TestCGROUPHost(t *testing.T) {
if testing.Short() {
return
}

rootfs, err := newRootfs()
ok(t, err)
defer remove(rootfs)

l, err := os.Readlink("/proc/1/ns/cgroup")
ok(t, err)

config := newTemplateConfig(rootfs)
buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup")
ok(t, err)

if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}

if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
}
}
31 changes: 30 additions & 1 deletion libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ enum sync_t {
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};

/* Synchronisation value for cgroup namespace setup */
#define SYNC_PRIVATE_CGNS 0x80

/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
Expand All @@ -46,6 +49,9 @@ enum sync_t {
/* JSON buffer. */
#define JSON_MAX 4096

/* buffer for synchronisation value */
#define SYNC_BUF_LEN 10

/* Assume the stack grows down, so arguments should be above it. */
struct clone_t {
/*
Expand Down Expand Up @@ -608,6 +614,7 @@ void nsexec(void)
case JUMP_CHILD: {
pid_t child;
enum sync_t s;
uint32_t actual_flags = config.cloneflags;

/* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0];
Expand All @@ -634,7 +641,10 @@ void nsexec(void)
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
if (actual_flags & CLONE_NEWCGROUP) {
actual_flags &= ~CLONE_NEWCGROUP;
}
if (unshare(actual_flags) < 0)
bail("failed to unshare namespaces");

/*
Expand Down Expand Up @@ -711,6 +721,9 @@ void nsexec(void)
* start_child() code after forking in the parent.
*/
int consolefd = config.consolefd;
char buf[SYNC_BUF_LEN];
int len, value;
char *endptr;

/* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0];
Expand Down Expand Up @@ -741,6 +754,22 @@ void nsexec(void)
bail("failed to dup stderr");
}

/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
if (config.cloneflags & CLONE_NEWCGROUP) {
len = read(pipenum, buf, SYNC_BUF_LEN);
if (len < 0)
bail("read synchronisation value failed");
value = strtol(buf, &endptr, 10);
if (*endptr != '\0')
bail("unable to parse synchronisation value");
if (value == SYNC_PRIVATE_CGNS) {
if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace");
}
else
bail("received unknown synchronisation value");
}

/* Close sync pipes. */
close(syncpipe[0]);
close(syncpipe[1]);
Expand Down
10 changes: 10 additions & 0 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ import (
"github.com/opencontainers/runc/libcontainer/utils"
)

// Synchronisation value for cgroup namespace setup.
const privateCgroupns int = (1 << 7)

type parentProcess interface {
// pid returns the pid for the running process.
pid() int
Expand Down Expand Up @@ -257,6 +260,13 @@ func (p *initProcess) start() error {
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
// Now it's time to set up the cgroup namespace
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
if _, err := p.parentPipe.WriteString(strconv.Itoa(privateCgroupns)); err != nil {
return newSystemErrorWithCause(err, "sending synchronization value to init process")
}

}
defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
Expand Down
47 changes: 42 additions & 5 deletions libcontainer/rootfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,15 @@ func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWrit
return newSystemErrorWithCause(err, "preparing rootfs")
}

hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP)
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return newSystemErrorWithCause(err, "running premount command")
}
}
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil {
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
}

Expand Down Expand Up @@ -125,7 +126,7 @@ func mountCmd(cmd configs.Command) error {
return nil
}

func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
var (
dest = m.Destination
)
Expand Down Expand Up @@ -257,12 +258,33 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
Data: "mode=755",
PropagationFlags: m.PropagationFlags,
}
if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil {
if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil {
return err
}
for _, b := range binds {
if err := mountToRootfs(b, rootfs, mountLabel); err != nil {
return err
if enableCgroupns {
subsystemPath := filepath.Join(rootfs, b.Destination)
if err := os.MkdirAll(subsystemPath, 0755); err != nil {
return err
}
flags := defaultMountFlags
if m.Flags&syscall.MS_RDONLY != 0 {
flags = flags | syscall.MS_RDONLY
}
cgroupmount := &configs.Mount{
Source: "cgroup",
Device: "cgroup",
Destination: subsystemPath,
Flags: flags,
Data: filepath.Base(subsystemPath),
}
if err := mountNewCgroup(cgroupmount); err != nil {
return err
}
} else {
if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil {
return err
}
}
}
for _, mc := range merged {
Expand Down Expand Up @@ -764,3 +786,18 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
}
return nil
}

// mountNewCgroup mounts the filesystem described by m by calling
// syscall.Mount with m's source, destination, device, flags and data.
//
// When m.Data is "systemd", the mount source is replaced with "systemd" and
// the data is prefixed with cgroups.CgroupNamePrefix. Any error from
// syscall.Mount is returned unchanged.
func mountNewCgroup(m *configs.Mount) error {
	src, opts := m.Source, m.Data
	if opts == "systemd" {
		src = "systemd"
		opts = cgroups.CgroupNamePrefix + opts
	}
	return syscall.Mount(src, m.Destination, m.Device, uintptr(m.Flags), opts)
}
15 changes: 15 additions & 0 deletions libcontainer/specconv/spec_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
}

cgroupPathOfHost, err := os.Readlink("/proc/1/ns/cgroup")
if err != nil {
return nil, err
}
for _, ns := range spec.Linux.Namespaces {
t, exists := namespaceMapping[ns.Type]
if !exists {
Expand All @@ -191,6 +195,17 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
if config.Namespaces.Contains(t) {
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
}
// if this container is going to have shared cgroup namespace with host,
// we just skip the following .Add()
if t == configs.NEWCGROUP && ns.Path != "" {
cgroupPathOfContainer, err := os.Readlink(ns.Path)
if err != nil {
return nil, err
}
if strings.Compare(cgroupPathOfHost, cgroupPathOfContainer) == 0 {
continue
}
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) {
Expand Down

0 comments on commit 6c4f233

Please sign in to comment.