7
7
package syscall
8
8
9
9
import (
10
+ "internal/bytealg"
10
11
"internal/itoa"
11
12
"runtime"
12
13
"unsafe"
@@ -101,6 +102,13 @@ type SysProcAttr struct {
101
102
AmbientCaps []uintptr // Ambient capabilities (Linux only)
102
103
UseCgroupFD bool // Whether to make use of the CgroupFD field.
103
104
CgroupFD int // File descriptor of a cgroup to put the new process into.
105
+ // Namespaces to join after fork and before exec. Namespaces are joined
106
+ // before any unshare calls. If you are using CloneFlags, note that those
107
+ // flags will be used to do the initial fork, so they occur before joining
108
+ // these namespaces. It is expected that the caller has sorted the list in
109
+ // the order they want to join. It is possible for ordering to affect
110
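+ // permissions to join other namespaces. Each entry has the form
+ // "type=path", where type is one of user, pid, net, ipc, uts, mount,
+ // or cgroup, and path is opened read-only to obtain the descriptor
+ // passed to setns. Any other form fails with EINVAL.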
111
+ Namespaces []string
104
112
}
105
113
106
114
var (
@@ -300,6 +308,49 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
300
308
}
301
309
}
302
310
311
+ var (
312
+ switchNamespaces []int
313
+ switchNamespaceFds []int
314
+ )
315
+
316
+ for _ , nsSpec := range sys .Namespaces {
317
+ idx := bytealg .IndexString (nsSpec , "=" )
318
+ if idx <= 0 {
319
+ err1 = EINVAL
320
+ return
321
+ }
322
+
323
+ switch nsSpec [:idx ] {
324
+ case "user" :
325
+ switchNamespaces = append (switchNamespaces , CLONE_NEWUSER )
326
+ case "pid" :
327
+ switchNamespaces = append (switchNamespaces , CLONE_NEWPID )
328
+ case "net" :
329
+ switchNamespaces = append (switchNamespaces , CLONE_NEWNET )
330
+ case "ipc" :
331
+ switchNamespaces = append (switchNamespaces , CLONE_NEWIPC )
332
+ case "uts" :
333
+ switchNamespaces = append (switchNamespaces , CLONE_NEWUTS )
334
+ case "mount" :
335
+ switchNamespaces = append (switchNamespaces , CLONE_NEWNS )
336
+ case "cgroup" :
337
+ switchNamespaces = append (switchNamespaces , CLONE_NEWCGROUP )
338
+ default :
339
+ err1 = EINVAL
340
+ return
341
+ }
342
+
343
+ fd , err := Open (nsSpec [idx + 1 :], O_RDONLY , 0 )
344
+ if err != nil {
345
+ for _ , fd := range switchNamespaceFds {
346
+ Close (fd )
347
+ }
348
+ err1 = err .(Errno )
349
+ return
350
+ }
351
+ switchNamespaceFds = append (switchNamespaceFds , fd )
352
+ }
353
+
303
354
// About to call fork.
304
355
// No more allocation or calls of non-assembly functions.
305
356
runtime_BeforeFork ()
@@ -316,6 +367,9 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
316
367
}
317
368
}
318
369
if err1 != 0 || pid != 0 {
370
+ for _ , fd := range switchNamespaceFds {
371
+ RawSyscall (SYS_CLOSE , uintptr (fd ), 0 , 0 )
372
+ }
319
373
// If we're in the parent, we must return immediately
320
374
// so we're not in the same stack frame as the child.
321
375
// This can at most use the return PC, which the child
@@ -335,6 +389,18 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
335
389
}
336
390
}
337
391
392
+ for i , ns := range switchNamespaces {
393
+ fd := switchNamespaceFds [i ]
394
+ _ , _ , err1 = RawSyscall (SYS_SETNS , uintptr (fd ), uintptr (ns ), 0 )
395
+ if err1 != 0 {
396
+ goto childerror
397
+ }
398
+ _ , _ , err1 = RawSyscall (SYS_CLOSE , uintptr (fd ), 0 , 0 )
399
+ if err1 != 0 {
400
+ goto childerror
401
+ }
402
+ }
403
+
338
404
// Wait for User ID/Group ID mappings to be written.
339
405
if sys .UidMappings != nil || sys .GidMappings != nil {
340
406
if _ , _ , err1 = RawSyscall (SYS_CLOSE , uintptr (mapPipe [1 ]), 0 , 0 ); err1 != 0 {
0 commit comments