From c39fb5758a772c062e20db9b42f2b06805884802 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 21 Feb 2019 19:52:52 -0500 Subject: [PATCH] Address a race condition in libevent select. This is not really a fix for the race condition because I could not figure out how it happens, but it does address the problem generated by the race. If we do not remove a bad fd from the select list we keep getting the same error from select, and we stop making any progress on the communication side. Thus, we forcefully disable all bad fds as soon as select fails, and we are back on track, progress ensues, and everything seems to work as expected (no leftover events in the event base). Signed-off-by: George Bosilca --- opal/mca/event/libevent2022/libevent/select.c | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/opal/mca/event/libevent2022/libevent/select.c b/opal/mca/event/libevent2022/libevent/select.c index afba6d34911..18b16567cc3 100644 --- a/opal/mca/event/libevent2022/libevent/select.c +++ b/opal/mca/event/libevent2022/libevent/select.c @@ -42,6 +42,7 @@ #include #include #include +#include <fcntl.h> #include "event-internal.h" #include "evsignal-internal.h" @@ -166,12 +167,30 @@ select_dispatch(struct event_base *base, struct timeval *tv) check_selectop(sop); if (res == -1) { - if (errno != EINTR) { - event_warn("select"); - return (-1); + if (errno == EINTR) { + return (0); } - - return (0); + /* There seems to be a very subtle race condition between the + * event_del and the select, where the fd is still active on the + * event_readset_in but no libevent structure makes reference + * to it. Thus, any call to progress will do nothing more + * than print a warning, leading to deadlocks. + * If we force remove the problematic fd, we get the warning only + * once, and things work as expected.
+ */ + event_warn("select"); + for (j = 0; j < nfds; ++j) { + if (FD_ISSET(j, sop->event_readset_in) || + FD_ISSET(j, sop->event_writeset_in)) { + res = fcntl(j, F_GETFL); + if( res == -1 ) { + event_warn("bad file descriptor %d/%d\n", j, nfds); + FD_CLR(j, sop->event_readset_in); + FD_CLR(j, sop->event_writeset_in); + } + } + } + return (-1); } event_debug(("%s: select reports %d", __func__, res));