Hello William L.,

On Thu, Jun 07, 2018 at 11:50:45AM +0200, William Lallemand wrote:
> Sorry for the late reply, I manage to reproduce and fix what seams to be the 
> bug.
> The signal management was not handled correctly with threads.
> Could you try those patches and see if it fixes the problem?

Thanks a lot for the proposed patches!
I started a new test yesterday evening, and it seems ok so far, so I
deployed on two new machines this morning.
I hope I don't speak too fast but it looks promising because I was
previously able to reproduce in a few hours maximum.

Keep me up to date if you need a test on a re-worked version.

Thanks!


> From d695242fb260538bd8db323715d627c4a9deacc7 Mon Sep 17 00:00:00 2001
> From: William Lallemand <wlallem...@haproxy.org>
> Date: Thu, 7 Jun 2018 09:46:01 +0200
> Subject: [PATCH 1/3] BUG/MEDIUM: threads: handle signal queue only in thread 0
>
> Signals were handled in all threads which caused some signals to be lost
> from time to time. To avoid complicated lock system (threads+signals),
> we prefer handling the signals in one thread avoiding concurrent access.
>
> The side effect of this bug was that some process were not leaving from
> time to time during a reload.
> ---
>  src/haproxy.c | 21 ++++++++++++++++++---
>  src/signal.c  |  8 --------
>  2 files changed, 18 insertions(+), 11 deletions(-)
>
> diff --git a/src/haproxy.c b/src/haproxy.c
> index 4628d8296..e984ca573 100644
> --- a/src/haproxy.c
> +++ b/src/haproxy.c
> @@ -2398,8 +2398,10 @@ static void run_poll_loop()
>               /* Process a few tasks */
>               process_runnable_tasks();
>
> -             /* check if we caught some signals and process them */
> -             signal_process_queue();
> +             /* check if we caught some signals and process them in the
> +              first thread */
> +             if (tid == 0)
> +                     signal_process_queue();
>
>               /* Check if we can expire some tasks */
>               next = wake_expired_tasks();
> @@ -2416,7 +2418,7 @@ static void run_poll_loop()
>                       activity[tid].wake_tasks++;
>               else if (active_applets_mask & tid_bit)
>                       activity[tid].wake_applets++;
> -             else if (signal_queue_len)
> +             else if (signal_queue_len && tid == 0)
>                       activity[tid].wake_signal++;
>               else
>                       exp = next;
> @@ -3006,6 +3008,7 @@ int main(int argc, char **argv)
>               unsigned int *tids    = calloc(global.nbthread, sizeof(unsigned 
> int));
>               pthread_t    *threads = calloc(global.nbthread, 
> sizeof(pthread_t));
>               int          i;
> +             sigset_t     blocked_sig, old_sig;
>
>               THREAD_SYNC_INIT((1UL << global.nbthread) - 1);
>
> @@ -3013,6 +3016,15 @@ int main(int argc, char **argv)
>               for (i = 0; i < global.nbthread; i++)
>                       tids[i] = i;
>
> +             /* ensure the signals will be blocked in every thread */
> +             sigfillset(&blocked_sig);
> +             sigdelset(&blocked_sig, SIGPROF);
> +             sigdelset(&blocked_sig, SIGBUS);
> +             sigdelset(&blocked_sig, SIGFPE);
> +             sigdelset(&blocked_sig, SIGILL);
> +             sigdelset(&blocked_sig, SIGSEGV);
> +             pthread_sigmask(SIG_SETMASK, &blocked_sig, &old_sig);
> +
>               /* Create nbthread-1 thread. The first thread is the current 
> process */
>               threads[0] = pthread_self();
>               for (i = 1; i < global.nbthread; i++)
> @@ -3046,6 +3058,9 @@ int main(int argc, char **argv)
>               }
>  #endif /* !USE_CPU_AFFINITY */
>
> +             /* when multithreading we need to let only the thread 0 handle 
> the signals */
> +             pthread_sigmask(SIG_SETMASK, &old_sig, NULL);
> +
>               /* Finally, start the poll loop for the first thread */
>               run_thread_poll_loop(&tids[0]);
>
> diff --git a/src/signal.c b/src/signal.c
> index a0975910b..f1f682188 100644
> --- a/src/signal.c
> +++ b/src/signal.c
> @@ -31,7 +31,6 @@ struct pool_head *pool_head_sig_handlers = NULL;
>  sigset_t blocked_sig;
>  int signal_pending = 0; /* non-zero if t least one signal remains 
> unprocessed */
>
> -__decl_hathreads(HA_SPINLOCK_T signals_lock);
>
>  /* Common signal handler, used by all signals. Received signals are queued.
>   * Signal number zero has a specific status, as it cannot be delivered by the
> @@ -71,9 +70,6 @@ void __signal_process_queue()
>       struct signal_descriptor *desc;
>       sigset_t old_sig;
>
> -     if (HA_SPIN_TRYLOCK(SIGNALS_LOCK, &signals_lock))
> -             return;
> -
>       /* block signal delivery during processing */
>       sigprocmask(SIG_SETMASK, &blocked_sig, &old_sig);
>
> @@ -100,7 +96,6 @@ void __signal_process_queue()
>
>       /* restore signal delivery */
>       sigprocmask(SIG_SETMASK, &old_sig, NULL);
> -     HA_SPIN_UNLOCK(SIGNALS_LOCK, &signals_lock);
>  }
>
>  /* perform minimal intializations, report 0 in case of error, 1 if OK. */
> @@ -112,8 +107,6 @@ int signal_init()
>       memset(signal_queue, 0, sizeof(signal_queue));
>       memset(signal_state, 0, sizeof(signal_state));
>
> -     HA_SPIN_INIT(&signals_lock);
> -
>       /* Ensure signals are not blocked. Some shells or service managers may
>        * accidently block all of our signals unfortunately, causing lots of
>        * zombie processes to remain in the background during reloads.
> @@ -148,7 +141,6 @@ void deinit_signals()
>                       pool_free(pool_head_sig_handlers, sh);
>               }
>       }
> -     HA_SPIN_DESTROY(&signals_lock);
>  }
>
>  /* Register a function and an integer argument on a signal. A pointer to the
> --
> 2.16.1
>

> From 1501eddeb506897126d0d3d60a36ca780b24ffdf Mon Sep 17 00:00:00 2001
> From: William Lallemand <wlallem...@haproxy.org>
> Date: Thu, 7 Jun 2018 09:49:04 +0200
> Subject: [PATCH 2/3] BUG/MINOR: don't ignore SIG{BUS,FPE,ILL,SEGV} during
>  signal processing
>
> ---
>  src/signal.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/src/signal.c b/src/signal.c
> index f1f682188..0dadd762c 100644
> --- a/src/signal.c
> +++ b/src/signal.c
> @@ -120,6 +120,14 @@ int signal_init()
>
>       sigfillset(&blocked_sig);
>       sigdelset(&blocked_sig, SIGPROF);
> +     /* man sigprocmask: If SIGBUS, SIGFPE, SIGILL, or SIGSEGV are
> +        generated while they are blocked, the result is undefined, unless
> +        the signal was generated by kill(2),
> +        sigqueue(3), or raise(3) */
> +     sigdelset(&blocked_sig, SIGBUS);
> +     sigdelset(&blocked_sig, SIGFPE);
> +     sigdelset(&blocked_sig, SIGILL);
> +     sigdelset(&blocked_sig, SIGSEGV);
>       for (sig = 0; sig < MAX_SIGNAL; sig++)
>               LIST_INIT(&signal_state[sig].handlers);
>
> --
> 2.16.1
>

> From 73423f636d906e7815589f80088084cb7ca589b1 Mon Sep 17 00:00:00 2001
> From: William Lallemand <wlallem...@haproxy.org>
> Date: Thu, 7 Jun 2018 11:23:40 +0200
> Subject: [PATCH 3/3] BUG/MINOR: signals: sigprocmask is unspecified in a
>  multithread process
>
> Replace sigprocmask by pthread_sigmask.
> ---
>  src/signal.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/src/signal.c b/src/signal.c
> index 0dadd762c..767d85c48 100644
> --- a/src/signal.c
> +++ b/src/signal.c
> @@ -71,7 +71,11 @@ void __signal_process_queue()
>       sigset_t old_sig;
>
>       /* block signal delivery during processing */
> +#ifdef USE_THREAD
> +     pthread_sigmask(SIG_SETMASK, &blocked_sig, &old_sig);
> +#else
>       sigprocmask(SIG_SETMASK, &blocked_sig, &old_sig);
> +#endif
>
>       /* It is important that we scan the queue forwards so that we can
>        * catch any signal that would have been queued by another signal
> @@ -95,7 +99,11 @@ void __signal_process_queue()
>       signal_queue_len = 0;
>
>       /* restore signal delivery */
> +#ifdef USE_THREAD
> +     pthread_sigmask(SIG_SETMASK, &old_sig, NULL);
> +#else
>       sigprocmask(SIG_SETMASK, &old_sig, NULL);
> +#endif
>  }
>
>  /* perform minimal intializations, report 0 in case of error, 1 if OK. */
> --
> 2.16.1

-- 
William

Reply via email to