From 759ba545ce83a58a5767eb5537de936f286416b9 Mon Sep 17 00:00:00 2001 From: "Daniel, Dao Quang Minh" Date: Mon, 1 Jun 2015 19:18:08 +0000 Subject: [PATCH] set custom namespaces for init process An init process can join other namespaces (pidns, ipc etc.). This leverages C code defined in the nsenter package to spawn a process with correct namespaces and clone if necessary. This moves all setns and cloneflags related code to the nsenter layer, which means that we don't use Go os/exec to create a process with cloneflags and set uid/gid_map or setgroups anymore. The necessary data is passed from Go to C using a simple binary encoding format. With this change, setns and init processes are almost the same, which brings some opportunity for refactoring. This removes joinExistingNamespaces in the Go layer. Signed-off-by: Daniel, Dao Quang Minh --- libcontainer/configs/namespaces_unix.go | 81 +++++-- libcontainer/container_linux.go | 213 ++++++++++++++--- libcontainer/init_linux.go | 19 -- libcontainer/integration/exec_test.go | 189 +++++++++++++++ libcontainer/integration/execin_test.go | 58 +++++ libcontainer/integration/utils_test.go | 8 +- libcontainer/nsenter/nsenter_test.go | 129 +++++++--- libcontainer/nsenter/nsexec.c | 301 ++++++++++++++++++------ libcontainer/process_linux.go | 82 +++++-- libcontainer/standard_init_linux.go | 4 - spec.go | 5 +- 11 files changed, 894 insertions(+), 195 deletions(-) diff --git a/libcontainer/configs/namespaces_unix.go b/libcontainer/configs/namespaces_unix.go index 7bc9085468d..73dba4e61bc 100644 --- a/libcontainer/configs/namespaces_unix.go +++ b/libcontainer/configs/namespaces_unix.go @@ -2,7 +2,11 @@ package configs -import "fmt" +import ( + "fmt" + "os" + "sync" +) const ( NEWNET NamespaceType = "NEWNET" @@ -13,6 +17,52 @@ const ( NEWUSER NamespaceType = "NEWUSER" ) +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// nsToFile converts the namespace type to its filename +func nsToFile(ns NamespaceType) 
string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + } + return "" +} + +// IsNamespaceSupported returns the list of current kernel's supported +// namespaces. The namespaces will be sorted in order that we can safely setns +// to (i.e., mount namespace is at the bottom of the list) +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := nsToFile(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + func NamespaceTypes() []NamespaceType { return []NamespaceType{ NEWNET, @@ -35,26 +85,7 @@ func (n *Namespace) GetPath(pid int) string { if n.Path != "" { return n.Path } - return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file()) -} - -func (n *Namespace) file() string { - file := "" - switch n.Type { - case NEWNET: - file = "net" - case NEWNS: - file = "mnt" - case NEWPID: - file = "pid" - case NEWIPC: - file = "ipc" - case NEWUSER: - file = "user" - case NEWUTS: - file = "uts" - } - return file + return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) } func (n *Namespaces) Remove(t NamespaceType) bool { @@ -87,3 +118,11 @@ func (n *Namespaces) index(t NamespaceType) int { func (n *Namespaces) Contains(t NamespaceType) bool { return n.index(t) != -1 } + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 9210ec6a3b9..24aaeae03eb 100644 --- 
a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -3,8 +3,11 @@ package libcontainer import ( + "bytes" + "encoding/binary" "encoding/json" "fmt" + "io" "io/ioutil" "os" "os/exec" @@ -135,7 +138,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces return nil, newSystemError(err) } if !doInit { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe), nil + return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } return c.newInitProcess(p, cmd, parentPipe, childPipe) } @@ -164,46 +167,48 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. } func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { - t := "_LIBCONTAINER_INITTYPE=standard" - cloneFlags := c.config.Namespaces.CloneFlags() - if cloneFlags&syscall.CLONE_NEWUSER != 0 { - if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { - // user mappings are not supported - return nil, err - } - enableSetgroups(cmd.SysProcAttr) - // Default to root user when user namespaces are enabled. 
- if cmd.SysProcAttr.Credential == nil { - cmd.SysProcAttr.Credential = &syscall.Credential{} + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=standard") + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path } } - cmd.Env = append(cmd.Env, t) - cmd.SysProcAttr.Cloneflags = cloneFlags + data, err := c.bootstrapData(cmd, c.config.Namespaces.CloneFlags(), nsMaps, "") + if err != nil { + return nil, err + } return &initProcess{ - cmd: cmd, - childPipe: childPipe, - parentPipe: parentPipe, - manager: c.cgroupManager, - config: c.newInitConfig(p), + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + config: c.newInitConfig(p), + bootstrapData: data, }, nil } -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) *setnsProcess { - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.initProcess.pid()), - "_LIBCONTAINER_INITTYPE=setns", - ) - if p.consolePath != "" { - cmd.Env = append(cmd.Env, "_LIBCONTAINER_CONSOLE_PATH="+p.consolePath) +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=setns") + state, err := c.currentState() + if err != nil { + return nil, newSystemError(err) + } + // for setns process, we dont have to set cloneflags as the process namespaces + // will only be set via setns syscall + data, err := c.bootstrapData(cmd, 0, state.NamespacePaths, p.consolePath) + if err != nil { + return nil, err } // TODO: set on container for process management return &setnsProcess{ - cmd: cmd, - cgroupPaths: c.cgroupManager.GetPaths(), - childPipe: childPipe, - parentPipe: parentPipe, - config: c.newInitConfig(p), - } + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + childPipe: childPipe, + parentPipe: parentPipe, + config: c.newInitConfig(p), + 
bootstrapData: data, + }, nil } func (c *linuxContainer) newInitConfig(process *Process) *initConfig { @@ -827,3 +832,147 @@ func (c *linuxContainer) currentState() (*State, error) { } return state, nil } + +// orderNamespacePaths sorts that namespace paths into a list of paths that we +// can safely setns to. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + nsTypes := []configs.NamespaceType{ + configs.NEWIPC, + configs.NEWUTS, + configs.NEWNET, + configs.NEWPID, + configs.NEWNS, + } + // join userns if the init process explicitly requires NEWUSER + if c.config.Namespaces.Contains(configs.NEWUSER) { + nsTypes = append(nsTypes, configs.NEWUSER) + } + for _, nsType := range nsTypes { + if p, ok := namespaces[nsType]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(nsType) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemError(err) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, p) + } + } + return paths, nil +} + +// bootstrapData encodes the necessary data in binary format as a io.Reader. +// Consumer can write the data to a bootstrap program such as one that uses +// nsenter package to bootstrap the container's init process correctly, i.e. with +// correct namespaces, uid/gid mapping etc. 
+// +// The binary format is: +// - 8 byte of uint64 total length of the key-value structure +// - for each key-value: +// - 1 byte of uint8 for the length of key +// - key content +// - 4 byte of uint32 for the length of the value +// - value +func (c *linuxContainer) bootstrapData(cmd *exec.Cmd, cloneFlags uintptr, + nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { + b := bytes.NewBuffer(nil) + + // write cloneFlags + if err := encodeInt32(b, "clone_flags", uint32(cloneFlags)); err != nil { + return nil, err + } + + // write console path if we requires it + if consolePath != "" { + if err := encodeString(b, "console_path", consolePath); err != nil { + return nil, err + } + } + + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + if err := encodeString(b, "ns_paths", strings.Join(nsPaths, ",")); err != nil { + return nil, err + } + } + + // write namespace paths only when we are not joining an existing user ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + if err := encodeIDMapping(b, "uid_map", c.config.UidMappings); err != nil { + return nil, err + } + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + if err := encodeIDMapping(b, "gid_map", c.config.GidMappings); err != nil { + return nil, err + } + } + } + + // prefix the total length and then write the data out + data := bytes.NewBuffer(make([]byte, 0, b.Len()+8)) + if err := binary.Write(data, binary.BigEndian, uint64(b.Len())); err != nil { + return nil, err + } + if _, err := io.Copy(data, b); err != nil { + return nil, err + } + return data, nil +} + +func encodeInt32(w io.Writer, name string, val uint32) error { + if len(name) > 255 { + return fmt.Errorf("%s is too long", name) + } + if err := binary.Write(w, binary.BigEndian, uint8(len(name))); err != nil { + return err + } + if _, err := 
w.Write([]byte(name)); err != nil { + return err + } + return binary.Write(w, binary.BigEndian, val) +} + +func encodeString(w io.Writer, name, val string) error { + if len(name) > 255 { + return fmt.Errorf("%s is too long", name) + } + if err := binary.Write(w, binary.BigEndian, uint8(len(name))); err != nil { + return err + } + if _, err := w.Write([]byte(name)); err != nil { + return err + } + if err := binary.Write(w, binary.BigEndian, uint32(len(val))); err != nil { + return err + } + _, err := w.Write([]byte(val)) + return err +} + +func encodeIDMapping(w io.Writer, name string, idMap []configs.IDMap) error { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return err + } + } + return encodeString(w, name, data.String()) +} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 6854a2d9023..39db87d5c49 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -139,25 +139,6 @@ func finalizeNamespace(config *initConfig) error { return nil } -// joinExistingNamespaces gets all the namespace paths specified for the container and -// does a setns on the namespace fd so that the current process joins the namespace. -func joinExistingNamespaces(namespaces []configs.Namespace) error { - for _, ns := range namespaces { - if ns.Path != "" { - f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) - if err != nil { - return err - } - err = system.Setns(f.Fd(), uintptr(ns.Syscall())) - f.Close() - if err != nil { - return err - } - } - } - return nil -} - // setupUser changes the groups, gid, and uid for the user inside the container func setupUser(config *initConfig) error { // Set up defaults. 
diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 26ce3a8a8f9..82ae5eb211f 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -2,9 +2,11 @@ package integration import ( "bytes" + "fmt" "io/ioutil" "os" "path/filepath" + "reflect" "strconv" "strings" "syscall" @@ -920,3 +922,190 @@ func TestOomScoreAdj(t *testing.T) { t.Fatalf("Expected oom_score_adj %d; got %q", config.OomScoreAdj, outputOomScoreAdj) } } + +func TestInitJoinPID(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + container1, err := newContainer(newTemplateConfig(rootfs)) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + } + err = container1.Start(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + pidns1 := state1.NamespacePaths[configs.NEWPID] + + // Start a container inside the existing pidns but with different cgroups + config2 := newTemplateConfig(rootfs) + config2.Namespaces.Add(configs.NEWPID, pidns1) + config2.Cgroups.Name = "test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + } + err = container2.Start(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state1.InitProcessPid)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid)) + 
ok(t, err) + if ns1 != ns2 { + t.Errorf("pidns(%s), wanted %s", ns2, ns1) + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // check that pidns is joined correctly. The initial container process list + // should contain the second container's init process + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Args: []string{"ps"}, + Env: standardEnvironment, + Stdout: buffers.Stdout, + } + err = container1.Start(ps) + ok(t, err) + waitProcess(ps, t) + + // Stop init processes one by one. Stop the second container should + // not stop the first. + stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) + + out := strings.TrimSpace(buffers.Stdout.String()) + // output of ps inside the initial PID namespace should have + // 1 line of header, + // 2 lines of init processes, + // 1 line of ps process + if len(strings.Split(out, "\n")) != 4 { + t.Errorf("unexpected running process, output %q", out) + } +} + +func TestInitJoinNetworkAndUser(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + config1 := newTemplateConfig(rootfs) + config1.UidMappings = []configs.IDMap{{0, 0, 1000}} + config1.GidMappings = []configs.IDMap{{0, 0, 1000}} + config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container1, err := newContainer(config1) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + } + err = container1.Start(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the 
state of the first container + state1, err := container1.State() + ok(t, err) + netns1 := state1.NamespacePaths[configs.NEWNET] + userns1 := state1.NamespacePaths[configs.NEWUSER] + + // Start a container inside the existing pidns but with different cgroups + rootfs2, err := newRootfs() + ok(t, err) + defer remove(rootfs2) + + config2 := newTemplateConfig(rootfs2) + config2.UidMappings = []configs.IDMap{{0, 0, 1000}} + config2.GidMappings = []configs.IDMap{{0, 0, 1000}} + config2.Namespaces.Add(configs.NEWNET, netns1) + config2.Namespaces.Add(configs.NEWUSER, userns1) + config2.Cgroups.Name = "test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + } + err = container2.Start(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + for _, ns := range []string{"net", "user"} { + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("%s(%s), wanted %s", ns, ns2, ns1) + } + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // Stop init processes one by one. Stop the second container should + // not stop the first. 
+ stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) +} diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 33d7b1cc017..4277ef34be9 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -2,6 +2,7 @@ package integration import ( "bytes" + "fmt" "io" "os" "strconv" @@ -10,6 +11,7 @@ import ( "time" "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" ) func TestExecIn(t *testing.T) { @@ -379,3 +381,59 @@ func TestExecInOomScoreAdj(t *testing.T) { t.Fatalf("expected oomScoreAdj to be %d, got %s", config.OomScoreAdj, oomScoreAdj) } } + +func TestExecInUserns(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + config.UidMappings = []configs.IDMap{{0, 0, 1000}} + config.GidMappings = []configs.IDMap{{0, 0, 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + } + err = container.Start(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + initPID, err := process.Pid() + ok(t, err) + initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID)) + ok(t, err) + + buffers := newStdBuffers() + process2 := &libcontainer.Process{ + Args: []string{"readlink", "/proc/self/ns/user"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + }, + Stdout: buffers.Stdout, + Stderr: os.Stderr, + } + err = 
container.Start(process2) + ok(t, err) + waitProcess(process2, t) + stdinW.Close() + waitProcess(process, t) + + if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns { + t.Errorf("execin userns(%s), wanted %s", out, initUserns) + } +} diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go index 1fc2da4d647..c5539a3f8ae 100644 --- a/libcontainer/integration/utils_test.go +++ b/libcontainer/integration/utils_test.go @@ -92,13 +92,15 @@ func copyBusybox(dest string) error { } func newContainer(config *configs.Config) (libcontainer.Container, error) { - f := factory + return newContainerWithName("testCT", config) +} +func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) { + f := factory if config.Cgroups != nil && config.Cgroups.Slice == "system.slice" { f = systemdFactory } - - return f.Create("testCT", config) + return f.Create(name, config) } // runContainer runs the container with the specific config and arguments diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index db27b8a4099..3b398ce61e4 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -1,11 +1,16 @@ package nsenter import ( + "bytes" + "encoding/binary" "encoding/json" "fmt" + "io" + "io/ioutil" "os" "os/exec" "strings" + "syscall" "testing" ) @@ -13,29 +18,53 @@ type pid struct { Pid int `json:"Pid"` } -func TestNsenterAlivePid(t *testing.T) { +func TestNsenterValidPaths(t *testing.T) { args := []string{"nsenter-exec"} - r, w, err := os.Pipe() + parent, child, err := newPipe() if err != nil { t.Fatalf("failed to create pipe %v", err) } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()), + } cmd := &exec.Cmd{ Path: os.Args[0], Args: args, - ExtraFiles: []*os.File{w}, - Env: []string{fmt.Sprintf("_LIBCONTAINER_INITPID=%d", os.Getpid()), "_LIBCONTAINER_INITPIPE=3"}, + 
ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + Stdout: os.Stdout, + Stderr: os.Stderr, } if err := cmd.Start(); err != nil { t.Fatalf("nsenter failed to start %v", err) } - w.Close() + // write cloneFlags + b := bytes.NewBuffer(nil) + if err := encodeInt32(b, "clone_flags", uint32(syscall.CLONE_NEWNET)); err != nil { + t.Fatal(err) + } + if err := encodeString(b, "ns_paths", strings.Join(namespaces, ",")); err != nil { + t.Fatal(err) + } + // prefix the total length and then write the data out + if err := binary.Write(parent, binary.BigEndian, uint64(b.Len())); err != nil { + t.Fatal(err) + } + if _, err := io.Copy(parent, b); err != nil { + t.Fatal(err) + } - decoder := json.NewDecoder(r) + decoder := json.NewDecoder(parent) var pid *pid if err := decoder.Decode(&pid); err != nil { + dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) + for _, d := range dir { + t.Log(d.Name()) + } t.Fatalf("%v", err) } @@ -49,37 +78,45 @@ func TestNsenterAlivePid(t *testing.T) { p.Wait() } -func TestNsenterInvalidPid(t *testing.T) { +func TestNsenterInvalidPaths(t *testing.T) { args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", -1), + } cmd := &exec.Cmd{ - Path: os.Args[0], - Args: args, - Env: []string{"_LIBCONTAINER_INITPID=-1"}, + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, } - err := cmd.Run() - if err == nil { - t.Fatal("nsenter exits with a zero exit status") + if err := cmd.Start(); err != nil { + t.Fatal(err) } -} - -func TestNsenterDeadPid(t *testing.T) { - dead_cmd := exec.Command("true") - if err := dead_cmd.Run(); err != nil { + // write cloneFlags + b := bytes.NewBuffer(nil) + if err := encodeInt32(b, "clone_flags", uint32(syscall.CLONE_NEWNET)); err != nil { t.Fatal(err) } - args 
:= []string{"nsenter-exec"} - - cmd := &exec.Cmd{ - Path: os.Args[0], - Args: args, - Env: []string{fmt.Sprintf("_LIBCONTAINER_INITPID=%d", dead_cmd.Process.Pid)}, + if err := encodeString(b, "ns_paths", strings.Join(namespaces, ",")); err != nil { + t.Fatal(err) + } + // prefix the total length and then write the data out + if err := binary.Write(parent, binary.BigEndian, uint64(b.Len())); err != nil { + t.Fatal(err) + } + if _, err := io.Copy(parent, b); err != nil { + t.Fatal(err) } - err := cmd.Run() - if err == nil { - t.Fatal("nsenter exits with a zero exit status") + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") } } @@ -89,3 +126,41 @@ func init() { } return } + +func encodeInt32(w io.Writer, name string, val uint32) error { + if len(name) > 255 { + return fmt.Errorf("%s is too long", name) + } + if err := binary.Write(w, binary.BigEndian, uint8(len(name))); err != nil { + return err + } + if _, err := w.Write([]byte(name)); err != nil { + return err + } + return binary.Write(w, binary.BigEndian, val) +} + +func encodeString(w io.Writer, name, val string) error { + if len(name) > 255 { + return fmt.Errorf("%s is too long", name) + } + if err := binary.Write(w, binary.BigEndian, uint8(len(name))); err != nil { + return err + } + if _, err := w.Write([]byte(name)); err != nil { + return err + } + if err := binary.Write(w, binary.BigEndian, uint32(len(val))); err != nil { + return err + } + _, err := w.Write([]byte(val)) + return err +} + +func newPipe() (parent *os.File, child *os.File, err error) { + fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index cd02d00a0a6..3be843b804f 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -4,7 
+4,6 @@ #include #include #include - #include #include #include @@ -16,6 +15,9 @@ #include #include #include +#include +#include +#include /* All arguments should be above stack, because it grows down */ struct clone_arg { @@ -51,102 +53,224 @@ int setns(int fd, int nstype) #endif #endif -static int clone_parent(jmp_buf * env) __attribute__ ((noinline)); -static int clone_parent(jmp_buf * env) +static int clone_parent(jmp_buf * env, int flags) __attribute__ ((noinline)); +static int clone_parent(jmp_buf * env, int flags) { struct clone_arg ca; int child; ca.env = env; - child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); + child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca); return child; } -void nsexec() +// get init pipe from the parent. It's used to read bootstrap data, and to +// write pid to after nsexec finishes setting up the environment. +static int get_init_pipe() { - char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt" }; - const int num = sizeof(namespaces) / sizeof(char *); - jmp_buf env; - char buf[PATH_MAX], *val; - int i, tfd, child, len, pipenum, consolefd = -1; - pid_t pid; - char *console; - - val = getenv("_LIBCONTAINER_INITPID"); - if (val == NULL) - return; - - pid = atoi(val); - snprintf(buf, sizeof(buf), "%d", pid); - if (strcmp(val, buf)) { - pr_perror("Unable to parse _LIBCONTAINER_INITPID"); + char buf[PATH_MAX], *initpipe; + int pipenum = -1; + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL) { + return -1; + } + pipenum = atoi(initpipe); + snprintf(buf, sizeof(buf), "%d", pipenum); + if (strcmp(initpipe, buf)) { + pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); exit(1); } - val = getenv("_LIBCONTAINER_INITPIPE"); - if (val == NULL) { - pr_perror("Child pipe not found"); - exit(1); + return pipenum; +} + +// namespacesLength returns the number of additional namespaces to setns. The +// argument is a comma-separated string of namespace paths. 
+static int namespacesLength(char *nspaths) +{ + int size = 0, i = 0; + for (i = 0; nspaths[i]; i++) { + if (nspaths[i] == ',') { + size += 1; + } } + return size + 1; +} - pipenum = atoi(val); - snprintf(buf, sizeof(buf), "%d", pipenum); - if (strcmp(val, buf)) { - pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); - exit(1); +static uint32_t readint32(char *buf, int *start) +{ + union { + uint32_t n; + char arr[4]; + } num; + int i = 0; + for (i = 0; i < 4; i++) { + num.arr[i] = buf[*start + i]; } + *start += 4; + return be32toh(num.n); +} - console = getenv("_LIBCONTAINER_CONSOLE_PATH"); - if (console != NULL) { - consolefd = open(console, O_RDWR); - if (consolefd < 0) { - pr_perror("Failed to open console %s", console); +static uint8_t readint8(char *buf, int *start) +{ + union { + uint8_t n; + char arr[1]; + } num; + num.arr[0] = buf[*start]; + *start += 1; + return num.n; +} + +static void writedata(int fd, char *data, int start, int len) +{ + int written = 0; + while (written < len) { + size_t nbyte, i; + if ((len - written) < 1024) { + nbyte = len - written; + } else { + nbyte = 1024; + } + i = write(fd, data + start + written, nbyte); + if (i == -1) { + pr_perror("failed to write data to %d", fd); exit(1); } + written += i; } +} + +void nsexec() +{ + jmp_buf env; + int child, pipenum = -1; - /* Check that the specified process exists */ - snprintf(buf, PATH_MAX - 1, "/proc/%d/ns", pid); - tfd = open(buf, O_DIRECTORY | O_RDONLY); - if (tfd == -1) { - pr_perror("Failed to open \"%s\"", buf); + uint64_t total; + uint32_t cloneflags = -1; + int consolefd = -1; + int uidmap_start, uidmap_len = -1; + int gidmap_start, gidmap_len = -1; + + // if we dont have init pipe, then just return to the parent + pipenum = get_init_pipe(); + if (pipenum == -1) { + return; + } + if (read(pipenum, &total, 8) != 8 || total <= 0) { + pr_perror("Invalid total size of bootstrap data"); exit(1); } + total = be64toh(total); - for (i = 0; i < num; i++) { - struct stat st; - int fd; 
- - /* Symlinks on all namespaces exist for dead processes, but they can't be opened */ - if (fstatat(tfd, namespaces[i], &st, AT_SYMLINK_NOFOLLOW) == -1) { - // Ignore nonexistent namespaces. - if (errno == ENOENT) - continue; + // pre-allocate the bootstrap data + char data[total]; + int i = 0; + while (i < total) { + size_t nbyte, nread; + if ((total - i) < 1024) { + nbyte = total - i; + } else { + nbyte = 1024; } - - fd = openat(tfd, namespaces[i], O_RDONLY); - if (fd == -1) { - pr_perror("Failed to open ns file %s for ns %s", buf, - namespaces[i]); + nread = read(pipenum, data + i, nbyte); + if (nread < 0) { + pr_perror("Failed to read from fd %d", pipenum); exit(1); } - // Set the namespace. - if (setns(fd, 0) == -1) { - pr_perror("Failed to setns for %s", namespaces[i]); - exit(1); + i += nread; + } + + // pre-processing the data to get offset of what we interested in + int start = 0; + while (start < total) { + uint8_t namelen = readint8(data, &start); + if (strncmp(data + start, "clone_flags", namelen) == 0) { + // process clone_flags + start = start + namelen; + cloneflags = readint32(data, &start); + } else if (strncmp(data + start, "console_path", namelen) == 0) { + // process console_paths + start = start + namelen; + uint32_t consolelen = readint32(data, &start); + char console[consolelen + 1]; + strncpy(console, data + start, consolelen); + console[consolelen] = '\0'; + // get the console path before setns because it may change mnt namespace + consolefd = open(console, O_RDWR); + if (consolefd < 0) { + pr_perror("Failed to open console %s", console); + exit(1); + } + start = start + consolelen; + } else if (strncmp(data + start, "ns_paths", namelen) == 0) { + // process ns_paths + start = start + namelen; + uint32_t nspaths_len = readint32(data, &start); + char nspaths[nspaths_len + 1]; + strncpy(nspaths, data + start, nspaths_len); + nspaths[nspaths_len] = '\0'; + + // if custom namespaces are required, open all descriptors and perform + // setns on 
them + int nslen = namespacesLength(nspaths); + int fds[nslen]; + char *nslist[nslen]; + int i = -1; + char *ns, *saveptr; + for (i = 0; i < nslen; i++) { + char *str = NULL; + if (i == 0) { + str = nspaths; + } + ns = strtok_r(str, ",", &saveptr); + if (ns == NULL) { + break; + } + fds[i] = open(ns, O_RDONLY); + if (fds[i] == -1) { + pr_perror("Failed to open %s", ns); + exit(1); + } + nslist[i] = ns; + } + for (i = 0; i < nslen; i++) { + if (setns(fds[i], 0) != 0) { + pr_perror("Failed to setns to %s", nslist[i]); + exit(1); + } + close(fds[i]); + } + + start = start + nspaths_len; + } else if (strncmp(data + start, "uid_map", namelen) == 0) { + // process uid_map + start = start + namelen; + uidmap_len = readint32(data, &start); + uidmap_start = start; + start = start + uidmap_len; + } else if (strncmp(data + start, "gid_map", namelen) == 0) { + // process gid_map + start = start + namelen; + gidmap_len = readint32(data, &start); + gidmap_start = start; + start = start + gidmap_len; } - close(fd); + } + // required clone_flags to be passed + if (cloneflags == -1) { + pr_perror("missing clone_flags"); + exit(1); } if (setjmp(env) == 1) { // Child - - if (setsid() == -1) { - pr_perror("setsid failed"); - exit(1); - } if (consolefd != -1) { + if (setsid() == -1) { + pr_perror("setsid failed"); + exit(1); + } if (ioctl(consolefd, TIOCSCTTY, 0) == -1) { pr_perror("ioctl TIOCSCTTY failed"); exit(1); @@ -172,14 +296,55 @@ void nsexec() // We must fork to actually enter the PID namespace, use CLONE_PARENT // so the child can have the right parent, and we don't need to forward // the child's exit code or resend its death signal. 
- child = clone_parent(&env); + child = clone_parent(&env, cloneflags); if (child < 0) { pr_perror("Unable to fork"); exit(1); } - - len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", child); - + // if we specifies uid_map and gid_map, writes the data to /proc files + if (uidmap_start > 0 && uidmap_len > 0) { + char buf[PATH_MAX]; + if (snprintf(buf, sizeof(buf), "/proc/%d/uid_map", child) < 0) { + pr_perror("failed to construct uid_map file for %d", child); + exit(1); + } + int fd = open(buf, O_RDWR); + writedata(fd, data, uidmap_start, uidmap_len); + } + if (gidmap_start > 0 && gidmap_len > 0) { + { + // write setgroups. This is needed since kernel 3.19, because you can't + // write gid_map without disabling setgroups() system call. + char buf[PATH_MAX]; + if (snprintf(buf, sizeof(buf), "/proc/%d/setgroups", child) < 0) { + pr_perror("failed to construct setgroups file for %d", child); + exit(1); + } + int fd = open(buf, O_RDWR); + if (write(fd, "allow", 5) != 5) { + // If the kernel is too old to support /proc/PID/setgroups, + // write will return ENOENT; this is OK. 
+ if (errno != ENOENT) { + pr_perror("failed to write allow to %s", buf); + exit(1); + } + } + } + { + // write gid mappings + char buf[PATH_MAX]; + if (snprintf(buf, sizeof(buf), "/proc/%d/gid_map", child) < 0) { + pr_perror("failed to construct gid_map file for %d", child); + exit(1); + } + int fd = open(buf, O_RDWR); + writedata(fd, data, gidmap_start, gidmap_len); + } + } + // finish setting up the environment, write back pid of the child to the + // parent to finish the bootstrap process + char buf[PATH_MAX]; + int len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", child); if (write(pipenum, buf, len) != len) { pr_perror("Unable to send a child pid"); kill(child, SIGKILL); diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 0fe06e8a6a5..048f1cff57d 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -40,12 +40,13 @@ type parentProcess interface { } type setnsProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - cgroupPaths map[string]string - config *initConfig - fds []string + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + cgroupPaths map[string]string + config *initConfig + fds []string + bootstrapData io.Reader } func (p *setnsProcess) startTime() (string, error) { @@ -62,6 +63,14 @@ func (p *setnsProcess) signal(sig os.Signal) error { func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() + err = p.cmd.Start() + p.childPipe.Close() + if err != nil { + return newSystemError(err) + } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return err + } if err = p.execSetns(); err != nil { return newSystemError(err) } @@ -70,6 +79,7 @@ func (p *setnsProcess) start() (err error) { return newSystemError(err) } } + if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return newSystemError(err) } @@ -94,11 +104,6 @@ func (p *setnsProcess) start() (err error) { // before the go runtime boots, we wait on the process to 
die and receive the child's pid // over the provided pipe. func (p *setnsProcess) execSetns() error { - err := p.cmd.Start() - p.childPipe.Close() - if err != nil { - return newSystemError(err) - } status, err := p.cmd.Process.Wait() if err != nil { p.cmd.Wait() @@ -156,13 +161,14 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } type initProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - config *initConfig - manager cgroups.Manager - container *linuxContainer - fds []string + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + container *linuxContainer + fds []string + bootstrapData io.Reader } func (p *initProcess) pid() int { @@ -173,13 +179,47 @@ func (p *initProcess) externalDescriptors() []string { return p.fds } -func (p *initProcess) start() (err error) { +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. 
+// This is called by initProcess.start function +func (p *initProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + return nil +} + +func (p *initProcess) start() error { defer p.parentPipe.Close() - err = p.cmd.Start() + err := p.cmd.Start() p.childPipe.Close() if err != nil { return newSystemError(err) } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return err + } + if err := p.execSetns(); err != nil { + return newSystemError(err) + } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. 
@@ -224,6 +264,8 @@ func (p *initProcess) wait() (*os.ProcessState, error) { return p.cmd.ProcessState, err } // we should kill all processes in cgroup when init is died if we use host PID namespace + // FIXME: instead of checking here, we should check when create the init + // process if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 { killCgroupProcesses(p.manager) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index ec1005789c5..0041e8708aa 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -19,10 +19,6 @@ type linuxStandardInit struct { } func (l *linuxStandardInit) Init() error { - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil { - return err - } var console *linuxConsole if l.config.Console != "" { console = newConsoleFromPath(l.config.Console) diff --git a/spec.go b/spec.go index 52f171bc8fa..5067c19a8a8 100644 --- a/spec.go +++ b/spec.go @@ -401,7 +401,10 @@ func setupUserNamespace(spec *specs.LinuxSpec, config *configs.Config) error { if len(spec.Linux.UIDMappings) == 0 { return nil } - config.Namespaces.Add(configs.NEWUSER, "") + // do not override the specified user namespace path + if config.Namespaces.PathOf(configs.NEWUSER) == "" { + config.Namespaces.Add(configs.NEWUSER, "") + } create := func(m specs.IDMapping) configs.IDMap { return configs.IDMap{ HostID: int(m.HostID),