From 481af80c7c15d4b063b41120a2a87ba03801bdc0 Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Sat, 17 Dec 2016 09:59:49 +0800 Subject: [PATCH] Fix race condition when sync with child and grandchild Fixes: #1236 Signed-off-by: Qiang Huang --- libcontainer/nsenter/nsexec.c | 108 +++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 35 deletions(-) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 5c6b95e1ab3..bd40407d53d 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -33,7 +33,8 @@ enum sync_t { SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ - SYNC_CHILD_READY = 0x44, /* The grandchild is ready to return. */ + SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ + SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ /* XXX: This doesn't help with segfaults and other such issues. */ SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ @@ -413,7 +414,7 @@ void nsexec(void) { int pipenum; jmp_buf env; - int syncpipe[2]; + int syncChildPipe[2], syncGrandchildPipe[2]; struct nlconfig_t config = {0}; /* @@ -428,9 +429,16 @@ void nsexec(void) nl_parse(pipenum, &config); /* Pipe so we can tell the child when we've finished setting up. */ - if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0) + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncChildPipe) < 0) bail("failed to setup sync pipe between parent and child"); + /* + * We need a new socketpair to sync with grandchild so we don't have + * race condition with child. + */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncGrandchildPipe) < 0) + bail("failed to setup sync pipe between parent and grandchild"); + /* TODO: Currently we aren't dealing with child deaths properly. */ /* @@ -489,9 +497,10 @@ void nsexec(void) * process. */ case JUMP_PARENT: { - int len, ready = 0; + int len; pid_t child; char buf[JSON_MAX]; + bool ready = false; /* For debugging. */ prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); @@ -508,26 +517,23 @@ void nsexec(void) * ready, so we can receive all possible error codes * generated by children. */ - while (ready < 2) { + while (!ready) { enum sync_t s; + int ret; - /* This doesn't need to be global, we're in the parent. */ - int syncfd = syncpipe[1]; + syncfd = syncChildPipe[1]; + close(syncChildPipe[0]); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with child: next state"); switch (s) { - case SYNC_ERR: { - /* We have to mirror the error code of the child. */ - int ret; + case SYNC_ERR: + /* We have to mirror the error code of the child. */ + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); - if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) - bail("failed to sync with child: read(error code)"); - - exit(ret); - } - break; + exit(ret); case SYNC_USERMAP_PLS: /* Enable setgroups(2) if we've been asked to. */ if (config.is_setgroup) @@ -543,11 +549,6 @@ void nsexec(void) bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); } break; - case SYNC_USERMAP_ACK: - /* We should _never_ receive acks. */ - kill(child, SIGKILL); - bail("failed to sync with child: unexpected SYNC_USERMAP_ACK"); - break; case SYNC_RECVPID_PLS: { pid_t old = child; @@ -565,20 +566,46 @@ void nsexec(void) bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); } } - - ready++; - break; - case SYNC_RECVPID_ACK: - /* We should _never_ receive acks. */ - kill(child, SIGKILL); - bail("failed to sync with child: unexpected SYNC_RECVPID_ACK"); break; case SYNC_CHILD_READY: - ready++; + ready = true; break; default: - bail("unexpected sync value"); + bail("unexpected sync value: %u", s); + } + } + + /* Now sync with grandchild. */ + + ready = false; + while (!ready) { + enum sync_t s; + int ret; + + syncfd = syncGrandchildPipe[1]; + close(syncGrandchildPipe[0]); + + s = SYNC_GRANDCHILD; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_GRANDCHILD)"); + } + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_ERR: + /* We have to mirror the error code of the child. */ + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); + + exit(ret); + case SYNC_CHILD_READY: + ready = true; break; + default: + bail("unexpected sync value: %u", s); } } @@ -610,7 +637,8 @@ void nsexec(void) enum sync_t s; /* We're in a child and thus need to tell the parent if we die. */ - syncfd = syncpipe[0]; + syncfd = syncChildPipe[0]; + close(syncChildPipe[1]); /* For debugging. */ prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); @@ -695,6 +723,10 @@ void nsexec(void) bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); } + s = SYNC_CHILD_READY; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ exit(0); } @@ -713,11 +745,19 @@ void nsexec(void) enum sync_t s; /* We're in a child and thus need to tell the parent if we die. */ - syncfd = syncpipe[0]; + syncfd = syncGrandchildPipe[0]; + close(syncGrandchildPipe[1]); + close(syncChildPipe[0]); + close(syncChildPipe[1]); /* For debugging. */ prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); + if (s != SYNC_GRANDCHILD) + bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s); + if (setsid() < 0) bail("setsid failed"); @@ -735,8 +775,7 @@ void nsexec(void) bail("failed to sync with patent: write(SYNC_CHILD_READY)"); /* Close sync pipes. */ - close(syncpipe[0]); - close(syncpipe[1]); + close(syncGrandchildPipe[0]); /* Free netlink data. */ nl_free(&config); @@ -746,7 +785,6 @@ void nsexec(void) } default: bail("unexpected jump value"); - break; } /* Should never be reached. */