Hello William L., On Thu, Jun 07, 2018 at 11:50:45AM +0200, William Lallemand wrote: > Sorry for the late reply, I manage to reproduce and fix what seams to be the > bug. > The signal management was not handled correctly with threads. > Could you try those patches and see if it fixes the problem?
Thanks a lot for the proposed patches! I started a new test yesterday evening, and it seems ok so far, so I deployed on two new machines this morning. I hope I don't speak too fast but it looks promising because I was previously able to reproduce in a few hours maximum. Keep me up to date if you need a test on a re-worked version. Thanks! > From d695242fb260538bd8db323715d627c4a9deacc7 Mon Sep 17 00:00:00 2001 > From: William Lallemand <wlallem...@haproxy.org> > Date: Thu, 7 Jun 2018 09:46:01 +0200 > Subject: [PATCH 1/3] BUG/MEDIUM: threads: handle signal queue only in thread 0 > > Signals were handled in all threads which caused some signals to be lost > from time to time. To avoid complicated lock system (threads+signals), > we prefer handling the signals in one thread avoiding concurrent access. > > The side effect of this bug was that some process were not leaving from > time to time during a reload. > --- > src/haproxy.c | 21 ++++++++++++++++++--- > src/signal.c | 8 -------- > 2 files changed, 18 insertions(+), 11 deletions(-) > > diff --git a/src/haproxy.c b/src/haproxy.c > index 4628d8296..e984ca573 100644 > --- a/src/haproxy.c > +++ b/src/haproxy.c > @@ -2398,8 +2398,10 @@ static void run_poll_loop() > /* Process a few tasks */ > process_runnable_tasks(); > > - /* check if we caught some signals and process them */ > - signal_process_queue(); > + /* check if we caught some signals and process them in the > + first thread */ > + if (tid == 0) > + signal_process_queue(); > > /* Check if we can expire some tasks */ > next = wake_expired_tasks(); > @@ -2416,7 +2418,7 @@ static void run_poll_loop() > activity[tid].wake_tasks++; > else if (active_applets_mask & tid_bit) > activity[tid].wake_applets++; > - else if (signal_queue_len) > + else if (signal_queue_len && tid == 0) > activity[tid].wake_signal++; > else > exp = next; > @@ -3006,6 +3008,7 @@ int main(int argc, char **argv) > unsigned int *tids = calloc(global.nbthread, sizeof(unsigned > int)); > pthread_t *threads = calloc(global.nbthread, > sizeof(pthread_t)); > int i; > + sigset_t blocked_sig, old_sig; > > THREAD_SYNC_INIT((1UL << global.nbthread) - 1); > > @@ -3013,6 +3016,15 @@ int main(int argc, char **argv) > for (i = 0; i < global.nbthread; i++) > tids[i] = i; > > + /* ensure the signals will be blocked in every thread */ > + sigfillset(&blocked_sig); > + sigdelset(&blocked_sig, SIGPROF); > + sigdelset(&blocked_sig, SIGBUS); > + sigdelset(&blocked_sig, SIGFPE); > + sigdelset(&blocked_sig, SIGILL); > + sigdelset(&blocked_sig, SIGSEGV); > + pthread_sigmask(SIG_SETMASK, &blocked_sig, &old_sig); > + > /* Create nbthread-1 thread. The first thread is the current > process */ > threads[0] = pthread_self(); > for (i = 1; i < global.nbthread; i++) > @@ -3046,6 +3058,9 @@ int main(int argc, char **argv) > } > #endif /* !USE_CPU_AFFINITY */ > > + /* when multithreading we need to let only the thread 0 handle > the signals */ > + pthread_sigmask(SIG_SETMASK, &old_sig, NULL); > + > /* Finally, start the poll loop for the first thread */ > run_thread_poll_loop(&tids[0]); > > diff --git a/src/signal.c b/src/signal.c > index a0975910b..f1f682188 100644 > --- a/src/signal.c > +++ b/src/signal.c > @@ -31,7 +31,6 @@ struct pool_head *pool_head_sig_handlers = NULL; > sigset_t blocked_sig; > int signal_pending = 0; /* non-zero if t least one signal remains > unprocessed */ > > -__decl_hathreads(HA_SPINLOCK_T signals_lock); > > /* Common signal handler, used by all signals. Received signals are queued. > * Signal number zero has a specific status, as it cannot be delivered by the > @@ -71,9 +70,6 @@ void __signal_process_queue() > struct signal_descriptor *desc; > sigset_t old_sig; > > - if (HA_SPIN_TRYLOCK(SIGNALS_LOCK, &signals_lock)) > - return; > - > /* block signal delivery during processing */ > sigprocmask(SIG_SETMASK, &blocked_sig, &old_sig); > > @@ -100,7 +96,6 @@ void __signal_process_queue() > > /* restore signal delivery */ > sigprocmask(SIG_SETMASK, &old_sig, NULL); > - HA_SPIN_UNLOCK(SIGNALS_LOCK, &signals_lock); > } > > /* perform minimal intializations, report 0 in case of error, 1 if OK. */ > @@ -112,8 +107,6 @@ int signal_init() > memset(signal_queue, 0, sizeof(signal_queue)); > memset(signal_state, 0, sizeof(signal_state)); > > - HA_SPIN_INIT(&signals_lock); > - > /* Ensure signals are not blocked. Some shells or service managers may > * accidently block all of our signals unfortunately, causing lots of > * zombie processes to remain in the background during reloads. > @@ -148,7 +141,6 @@ void deinit_signals() > pool_free(pool_head_sig_handlers, sh); > } > } > - HA_SPIN_DESTROY(&signals_lock); > } > > /* Register a function and an integer argument on a signal. A pointer to the > -- > 2.16.1 > > From 1501eddeb506897126d0d3d60a36ca780b24ffdf Mon Sep 17 00:00:00 2001 > From: William Lallemand <wlallem...@haproxy.org> > Date: Thu, 7 Jun 2018 09:49:04 +0200 > Subject: [PATCH 2/3] BUG/MINOR: don't ignore SIG{BUS,FPE,ILL,SEGV} during > signal processing > > --- > src/signal.c | 8 ++++++++ > 1 file changed, 8 insertions(+) > > diff --git a/src/signal.c b/src/signal.c > index f1f682188..0dadd762c 100644 > --- a/src/signal.c > +++ b/src/signal.c > @@ -120,6 +120,14 @@ int signal_init() > > sigfillset(&blocked_sig); > sigdelset(&blocked_sig, SIGPROF); > + /* man sigprocmask: If SIGBUS, SIGFPE, SIGILL, or SIGSEGV are > + generated while they are blocked, the result is undefined, unless > + the signal was generated by kill(2), > + sigqueue(3), or raise(3) */ > + sigdelset(&blocked_sig, SIGBUS); > + sigdelset(&blocked_sig, SIGFPE); > + sigdelset(&blocked_sig, SIGILL); > + sigdelset(&blocked_sig, SIGSEGV); > for (sig = 0; sig < MAX_SIGNAL; sig++) > LIST_INIT(&signal_state[sig].handlers); > > -- > 2.16.1 > > From 73423f636d906e7815589f80088084cb7ca589b1 Mon Sep 17 00:00:00 2001 > From: William Lallemand <wlallem...@haproxy.org> > Date: Thu, 7 Jun 2018 11:23:40 +0200 > Subject: [PATCH 3/3] BUG/MINOR: signals: sigprocmask is unspecified in a > multithread process > > Replace sigprocmask by pthread_sigmask. > --- > src/signal.c | 8 ++++++++ > 1 file changed, 8 insertions(+) > > diff --git a/src/signal.c b/src/signal.c > index 0dadd762c..767d85c48 100644 > --- a/src/signal.c > +++ b/src/signal.c > @@ -71,7 +71,11 @@ void __signal_process_queue() > sigset_t old_sig; > > /* block signal delivery during processing */ > +#ifdef USE_THREAD > + pthread_sigmask(SIG_SETMASK, &blocked_sig, &old_sig); > +#else > sigprocmask(SIG_SETMASK, &blocked_sig, &old_sig); > +#endif > > /* It is important that we scan the queue forwards so that we can > * catch any signal that would have been queued by another signal > @@ -95,7 +99,11 @@ void __signal_process_queue() > signal_queue_len = 0; > > /* restore signal delivery */ > +#ifdef USE_THREAD > + pthread_sigmask(SIG_SETMASK, &old_sig, NULL); > +#else > sigprocmask(SIG_SETMASK, &old_sig, NULL); > +#endif > } > > /* perform minimal intializations, report 0 in case of error, 1 if OK. */ > -- > 2.16.1 -- William