Skip to content

Commit d2adb91

Browse files
committed
syscall: add support for setns after fork
This adds a `Namespaces` field to Linux's `SysProcAttr` type. When set, these namespaces will be entered after fork and before exec. Each entry has the format `<ns name>=<path>`, e.g. `mount=/some/path`, where the name is one of `user`, `pid`, `net`, `ipc`, `uts`, `mount`, or `cgroup` and the path is the namespace file to open and pass to setns. This allows users to exec a new process in a pre-defined set of namespaces without having to resort to hacks or re-execs to bootstrap these namespaces.
1 parent 9e88520 commit d2adb91

File tree

1 file changed

+66
-0
lines changed

1 file changed

+66
-0
lines changed

src/syscall/exec_linux.go

+66
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
package syscall
88

99
import (
10+
"internal/bytealg"
1011
"internal/itoa"
1112
"runtime"
1213
"unsafe"
@@ -101,6 +102,13 @@ type SysProcAttr struct {
101102
AmbientCaps []uintptr // Ambient capabilities (Linux only)
102103
UseCgroupFD bool // Whether to make use of the CgroupFD field.
103104
CgroupFD int // File descriptor of a cgroup to put the new process into.
105+
// Namespaces to join after fork and before exec. Namespaces are joined
106+
// before any unshare calls. If you are using CloneFlags note that those
107+
// flags will be used to do the initial fork, so they occur before joining
108+
// these namespaces. It is expected that the caller has sorted the list in
109+
// the order they want to join. It is possible for ordering to affect
110+
// permissions to join other namespaces.
111+
Namespaces []string
104112
}
105113

106114
var (
@@ -300,6 +308,49 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
300308
}
301309
}
302310

311+
var (
312+
switchNamespaces []int
313+
switchNamespaceFds []int
314+
)
315+
316+
for _, nsSpec := range sys.Namespaces {
317+
idx := bytealg.IndexString(nsSpec, "=")
318+
if idx <= 0 {
319+
err1 = EINVAL
320+
return
321+
}
322+
323+
switch nsSpec[:idx] {
324+
case "user":
325+
switchNamespaces = append(switchNamespaces, CLONE_NEWUSER)
326+
case "pid":
327+
switchNamespaces = append(switchNamespaces, CLONE_NEWPID)
328+
case "net":
329+
switchNamespaces = append(switchNamespaces, CLONE_NEWNET)
330+
case "ipc":
331+
switchNamespaces = append(switchNamespaces, CLONE_NEWIPC)
332+
case "uts":
333+
switchNamespaces = append(switchNamespaces, CLONE_NEWUTS)
334+
case "mount":
335+
switchNamespaces = append(switchNamespaces, CLONE_NEWNS)
336+
case "cgroup":
337+
switchNamespaces = append(switchNamespaces, CLONE_NEWCGROUP)
338+
default:
339+
err1 = EINVAL
340+
return
341+
}
342+
343+
fd, err := Open(nsSpec[idx+1:], O_RDONLY, 0)
344+
if err != nil {
345+
for _, fd := range switchNamespaceFds {
346+
Close(fd)
347+
}
348+
err1 = err.(Errno)
349+
return
350+
}
351+
switchNamespaceFds = append(switchNamespaceFds, fd)
352+
}
353+
303354
// About to call fork.
304355
// No more allocation or calls of non-assembly functions.
305356
runtime_BeforeFork()
@@ -316,6 +367,9 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
316367
}
317368
}
318369
if err1 != 0 || pid != 0 {
370+
for _, fd := range switchNamespaceFds {
371+
RawSyscall(SYS_CLOSE, uintptr(fd), 0, 0)
372+
}
319373
// If we're in the parent, we must return immediately
320374
// so we're not in the same stack frame as the child.
321375
// This can at most use the return PC, which the child
@@ -335,6 +389,18 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
335389
}
336390
}
337391

392+
for i, ns := range switchNamespaces {
393+
fd := switchNamespaceFds[i]
394+
_, _, err1 = RawSyscall(SYS_SETNS, uintptr(fd), uintptr(ns), 0)
395+
if err1 != 0 {
396+
goto childerror
397+
}
398+
_, _, err1 = RawSyscall(SYS_CLOSE, uintptr(fd), 0, 0)
399+
if err1 != 0 {
400+
goto childerror
401+
}
402+
}
403+
338404
// Wait for User ID/Group ID mappings to be written.
339405
if sys.UidMappings != nil || sys.GidMappings != nil {
340406
if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {

0 commit comments

Comments
 (0)