nng threads should block signals #1997

lwFace · 2024-12-19T07:35:08Z

Describe the bug

Linux, NNG 1.9.0

panic: pthread_mutex_lock: Resource deadlock avoided
This message is indicative of a BUG.
Report this at https://github.com/nanomsg/nng/issues
/usr/local/lib64/libnng.so.1(+0x20af9) [0x7ffff7b7baf9]
/usr/local/lib64/libnng.so.1(+0x32b09) [0x7ffff7b8db09]
/usr/local/lib64/libnng.so.1(+0x32ca4) [0x7ffff7b8dca4]
/usr/local/lib64/libnng.so.1(+0x29223) [0x7ffff7b84223]
/usr/local/lib64/libnng.so.1(+0x1811a) [0x7ffff7b7311a]
/usr/local/lib64/libnng.so.1(+0x181f2) [0x7ffff7b731f2]
/usr/local/lib64/libnng.so.1(+0x3a728) [0x7ffff7b95728]
/usr/local/lib64/libnng.so.1(+0x236d3) [0x7ffff7b7e6d3]
/usr/local/lib64/libnng.so.1(nng_sendmsg+0xdc) [0x7ffff7b6e0fc]
/usr/local/lib64/libnng.so.1(nng_send+0x6a) [0x7ffff7b6dfe7]
./pubsub() [0x400cd7]
/lib64/libpthread.so.0(+0xf370) [0x7ffff794e370]
/lib64/libpthread.so.0(pthread_cond_broadcast+0xc) [0x7ffff794ae1c]
Aborted

This is the minimal example to reproduce the issue. If I replace the timer with a thread, or increase the timer start interval, the crash does not happen.

// Socket setup helpers; both are defined further down in this file.
int pubInit(const char *url,nng_socket* pSock);
int subInit(const char *url,nng_socket* pSock);
// File-scope socket handles: main() passes them to pubInit()/subInit(),
// and timerHandler() sends on txSock.
nng_socket txSock;
nng_socket rxSock;

/*
 * Report a failed NNG call on stderr as "<func>: <error text>".
 *
 * func: name of the call that failed.
 * rv:   NNG result code, translated with nng_strerror().
 *
 * Despite its name this function only prints; it does not terminate the
 * process, so callers continue running after it returns.
 */
void fatal(const char *func, int rv)
{
        const char *reason = nng_strerror(rv);

        fprintf(stderr, "%s: %s\n", func, reason);
}

/*
 * Signal handler for the periodic timer: publishes one zero-filled
 * 512-byte message on txSock and reports any send error via fatal().
 *
 * signo: number of the delivered signal (unused).
 *
 * WARNING: this runs in signal context on whichever thread was
 * interrupted. NNG is not async-signal-safe; if the interrupted thread
 * already holds one of NNG's mutexes, re-acquiring it from here produces
 * the "pthread_mutex_lock: Resource deadlock avoided" panic this
 * reproducer demonstrates.
 */
void timerHandler(int signo)
{
    char payload[512] = {0};
    int status;

    (void)signo;

    status = nng_send(txSock, payload, 512, 0);
    if (status == 0) {
        return;
    }
    fatal("nng_send", status);
}

/*
 * Install timerHandler for SIGRTMIN and arm a periodic CLOCK_REALTIME
 * timer that delivers that signal.
 *
 * t: delay before the first expiry and the repeat period, in
 *    microseconds (split below into whole seconds plus a nanosecond
 *    remainder).
 *
 * Returns nothing. If any step fails the function returns silently; a
 * timer that was created but could not be armed is deleted again.
 */
void startTimer(int t)
{
        /* Zero-initialise so no field is passed to the kernel with
         * indeterminate contents. */
        struct sigaction siga = {0};
        struct sigevent sev = {0};
        struct itimerspec interval = {0};
        timer_t timerId;

        /* timerHandler has the one-argument signature, so it goes in
         * sa_handler and SA_SIGINFO must NOT be set (that flag selects
         * the three-argument sa_sigaction form). */
        siga.sa_handler = timerHandler;
        siga.sa_flags = 0;
        sigemptyset(&siga.sa_mask);
        if (sigaction(SIGRTMIN, &siga, NULL) == -1)
        {
                return;
        }

        /* sigev_notify takes a SIGEV_* notification method, not a signal
         * number; the signal itself is selected by sigev_signo. The
         * handler cannot see sigev_value without SA_SIGINFO, so it is
         * left zeroed instead of pointing at a local variable. */
        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGRTMIN;
        if (timer_create(CLOCK_REALTIME, &sev, &timerId) == -1)
        {
                return;
        }

        /* timer_settime copies the spec, so stack storage is sufficient
         * and nothing has to be freed. */
        interval.it_value.tv_sec = t / 1000000;
        interval.it_value.tv_nsec = (long)(t % 1000000) * 1000;
        interval.it_interval = interval.it_value;

        if (timer_settime(timerId, 0, &interval, NULL) == -1)
        {
                /* Do not leave an unarmed timer allocated. */
                timer_delete(timerId);
                return;
        }
}
/*
 * Entry point of the reproducer: opens the publisher and subscriber
 * sockets, starts the periodic signal timer, then idles forever while
 * the timer's signal handler does the sending.
 *
 * Returns 1 if either socket could not be set up; otherwise it never
 * returns (the process runs until it is killed or aborts).
 */
int main(int argc,char* argv[])
{
    (void)argc;
    (void)argv;

    /* Stop early if setup failed: there is nothing useful for the timer
     * to send on. Closing a socket that was never opened is harmless. */
    if (pubInit("tcp://192.168.1.122:20000",&txSock) != 0 ||
        subInit("tcp://192.168.1.122:20001",&rxSock) != 0)
    {
        fprintf(stderr, "socket initialization failed\n");
        nng_close(txSock);
        nng_close(rxSock);
        return 1;
    }

    /* The argument is interpreted by startTimer as microseconds. */
    startTimer(10);

    /* Idle forever; sleep() returns early whenever the timer signal is
     * delivered, and the loop simply sleeps again. */
    for (;;)
    {
        sleep(1);
    }
}

/*
 * Open a PUB (version 0) socket and start listening on url.
 *
 * url:   address to listen on, e.g. "tcp://host:port".
 * pSock: receives the opened socket.
 *
 * Returns 0 on success, otherwise the NNG error code of the step that
 * failed (also reported through fatal()). If listening fails the socket
 * stays open; the caller owns it and may close it.
 */
int pubInit(const char *url,nng_socket* pSock)
{
    int rv = nng_pub0_open(pSock);
    if (rv != 0)
    {
        fatal("nng_pub0_open", rv);
        /* No socket to listen on. */
        return rv;
    }

    /* NNG error codes are positive, so failure must be tested with
     * "!= 0"; a "< 0" test never fires. */
    rv = nng_listen(*pSock, url, NULL, 0);
    if (rv != 0)
    {
        fatal("nng_listen", rv);
    }
    return rv;
}
/*
 * Open a SUB (version 0) socket, subscribe to every topic, set a
 * 1000 ms receive timeout and start a non-blocking dial to url.
 *
 * url:   address to dial, e.g. "tcp://host:port".
 * pSock: receives the opened socket.
 *
 * Returns 0 on success, otherwise the NNG error code of the first step
 * that failed (also reported through fatal()). Setup stops at the first
 * failure so a later successful call cannot mask the error. On failure
 * after the open step the socket stays open; the caller owns it.
 */
int subInit(const char *url, nng_socket* pSock)
{
    int rv = nng_sub0_open(pSock);
    if (rv != 0)
    {
        fatal("nng_sub0_open", rv);
        return rv;
    }

    /* An empty, zero-length topic is passed as the subscription. */
    rv = nng_setopt(*pSock, NNG_OPT_SUB_SUBSCRIBE, "", 0);
    if (rv != 0)
    {
        fatal("nng_setopt", rv);
        return rv;
    }

    rv = nng_setopt_ms(*pSock,NNG_OPT_RECVTIMEO,1000);
    if (rv != 0)
    {
        fatal("nng_setopt", rv);
        return rv;
    }

    rv = nng_dial(*pSock, url, NULL, NNG_FLAG_NONBLOCK);
    if (rv != 0)
    {
        fatal("nng_dial", rv);
    }
    return rv;
}

gdamore · 2024-12-27T06:02:07Z

NNG is not async-signal-safe. In fact, almost nothing is. The use of mutexes here means that the thread that is getting interrupted might be holding the mutex when we try to re-acquire it from the signal handler (which runs on the same thread!)

Generally speaking the only way to safely deal with signals (apart from not using them at all!) is to create a dedicated thread to handle them. You specifically need to block that signal from being executed in any other thread.

This could actually be a problem for NNG's own internal threads, since right now you don't have access to them. I plan to expose a thread initialization function that will let you do this. And it seems like really what we need to do is block all signals from those threads at creation time (which to do atomically requires blocking them in the creator first).

As a workaround, you could set the pthread signal mask (pthread_sigmask) of the main application thread to block the offending signals, and then only enable it in the one thread that you intend to handle it. (You will have to give it a thread to do so, and that thread should not do anything else with NNG or else you will wind up in the same situation.)

gdamore added the feedback label Dec 27, 2024

gdamore changed the title ~~panic: pthread_mutex_lock: Resource deadlock avoided~~ nng threads should block signals Dec 31, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

nng threads should block signals #1997

nng threads should block signals #1997

lwFace commented Dec 19, 2024 •

edited

Loading

gdamore commented Dec 27, 2024

nng threads should block signals #1997

nng threads should block signals #1997

Comments

lwFace commented Dec 19, 2024 • edited Loading

gdamore commented Dec 27, 2024

lwFace commented Dec 19, 2024 •

edited

Loading