commit da767eaaf6128eccd349a54ec6eac2a68dcacacb Author: Willy Tarreau Date: Wed Jul 31 19:15:45 2019 +0200 BUG/MINOR: debug: fix a small race in the thread dumping code If a thread dump is requested from a signal handler, it may interrupt a thread already waiting for a dump to complete, and may see the threads_to_dump variable go to zero while others are waiting, steal the lock and prevent other threads from ever completing. This tends to happen when dumping many threads upon a watchdog timeout, to threads waiting for their turn. Instead now we proceed in two steps : 1) the last dumped thread sets all bits again 2) all threads only wait for their own bit to appear, then clear it and quit This way there's no risk that a bit performs a double flip in the same loop and threads cannot get stuck here anymore. This should be backported to 2.0 as it clarifies stack traces. (cherry picked from commit c07736209db764fb2aef6f18ed3687a504c35771) Signed-off-by: Willy Tarreau diff --git a/src/debug.c b/src/debug.c index 059bc6b9..07624ca5 100644 --- a/src/debug.c +++ b/src/debug.c @@ -440,8 +440,8 @@ void debug_handler(int sig, siginfo_t *si, void *arg) * 1- wait for our turn, i.e. when all lower bits are gone. * 2- perform the action if our bit is set * 3- remove our bit to let the next one go, unless we're - * the last one and have to put them all but ours - * 4- wait for zero and clear our bit if it's set + * the last one and have to put them all as a signal + * 4- wait out bit to re-appear, then clear it and quit. */ /* wait for all previous threads to finish first */ @@ -454,7 +454,7 @@ void debug_handler(int sig, siginfo_t *si, void *arg) ha_thread_dump(thread_dump_buffer, tid, thread_dump_tid); if ((threads_to_dump & all_threads_mask) == tid_bit) { /* last one */ - HA_ATOMIC_STORE(&threads_to_dump, all_threads_mask & ~tid_bit); + HA_ATOMIC_STORE(&threads_to_dump, all_threads_mask); thread_dump_buffer = NULL; } else @@ -462,14 +462,13 @@ void debug_handler(int sig, siginfo_t *si, void *arg) } /* now wait for all others to finish dumping. The last one will set all - * bits again to broadcast the leaving condition. + * bits again to broadcast the leaving condition so we'll see ourselves + * present again. This way the threads_to_dump variable never passes to + * zero until all visitors have stopped waiting. */ - while (threads_to_dump & all_threads_mask) { - if (threads_to_dump & tid_bit) - HA_ATOMIC_AND(&threads_to_dump, ~tid_bit); - else - ha_thread_relax(); - } + while (!(threads_to_dump & tid_bit)) + ha_thread_relax(); + HA_ATOMIC_AND(&threads_to_dump, ~tid_bit); /* mark the current thread as stuck to detect it upon next invocation * if it didn't move.