The first attached patch is good for merge.

The removal of the munamap is not suitable.  It will cause corosync to
leak address space.  Need to find a different solution that cleans up
properly.

Regards
-steve

On Tue, 2010-01-12 at 15:25 +0100, Jan Friesse wrote:
> Steve,
> your deduction of logic is correct (with exception of two seconds, in
> reality it will be (1;2) seconds).
> 
> Patch is now in two parts.
> 
> I hope, this will work on F12 without Segfaults, even solution (remove
> one unmap) is not ideal. On the other hand, Because it is right BEFORE
> death, it should be so big deal.
> 
> Regards,
>   Honza
> 
> Steven Dake wrote:
> > Question about magic number "2" in the coroipcc.c code.
> > 
> > Is the logic to try sem_timedwait, then retry after two seconds if no
> > response has been executed from the server?  Why two seconds?  Is this
> > to return an error code to the library when the semaphore is deleted in
> > the server?
> > 
> > If so, two seconds sounds ok, but please make it a define in the C file
> > rather then magic number.
> > 
> > The patch contains two issues.  Please commit them as separate patches
> > with separate descriptions.  Good catch on the malloc in signal handler.
> > glibc should be able to handle that, but it appears that is not the case
> > here.
> > 
> > Good investigative work
> > 
> > Regards
> > -steve
> > 
> > On Fri, 2010-01-08 at 12:22 +0100, Jan Friesse wrote:
> >> Related to https://bugzilla.redhat.com/show_bug.cgi?id=547511
> >>
> >> This patch solves problem in little different (I hope better) way.
> >>
> >> It fixes problem with sem_destroy + sem_wait and also solves hard freeze 
> >> because malloc(*) + other functions are called in sighandler. This is 
> >> reason, why special thread is created and only purpose in life of these 
> >> thread is to wait for semaphore and begin shutdown sequence.
> >>
> >> According to Fabbio, there are still some segfaults left on Fedora 12.
> >>
> >> Regards,
> >>     Honza
> >>
> >> (*) according to glibc documentation, malloc and free can be called in 
> >> signal handler, but in such case, I really don't understand this:
> >> (gdb) bt
> >> #0  0x00de1424 in __kernel_vsyscall ()
> >> #1  0x002c7e43 in __lll_lock_wait_private () from /lib/libc.so.6
> >> #2  0x00250b94 in _L_lock_9571 () from /lib/libc.so.6
> >> #3  0x0024ebf4 in malloc () from /lib/libc.so.6
> >> #4  0x08054d26 in hdb_handle_create (handle_database=0x805d748, 
> >> instance_size=12, handle_id_out=0x805fa48) at ../include/corosync/hdb.h:178
> >> #5  0x08055422 in schedwrk_create (handle=0x805fa48, 
> >> schedwrk_fn=0x8050bee <unlink_all_schedwrk_handler>, context=0x805d5a0) 
> >> at schedwrk.c:104
> >> #6  0x08050d33 in corosync_service_unlink_all (api=0x805d5a0, 
> >> unlink_all_complete=0x804b3fb <unlink_all_completed>) at service.c:583
> >> #7  0x0804b491 in corosync_shutdown_request () at main.c:171
> >> #8  0x0804b508 in sigintr_handler (num=2) at main.c:195
> >> #9  <signal handler called>
> >> #10 0x0024ca2b in _int_malloc () from /lib/libc.so.6
> >> #11 0x0024ebfe in malloc () from /lib/libc.so.6
> >> #12 0x0023a7df in __fopen_internal () from /lib/libc.so.6
> >> #13 0x0023a8ac in fopen@@GLIBC_2.1 () from /lib/libc.so.6
> >> #14 0x0053352a in pid_to_name (pid=18300, out_name=0xbfda2016 , 
> >> name_len=32) at coroipcs.c:1515
> >> #15 0x00533654 in coroipcs_init_conn_stats (conn=0x82d1bb8) at 
> >> coroipcs.c:1557
> >> #16 0x00533a29 in coroipcs_handler_dispatch (fd=10, revent=1, 
> >> context=0x82d1bb8) at coroipcs.c:1670
> >> #17 0x0804d498 in corosync_poll_handler_dispatch 
> >> (handle=1197105576937521152, fd=10, revent=1, context=0x82d1bb8 at 
> >> main.c:911
> >> #18 0x0057b01b in poll_run (handle=1197105576937521152) at coropoll.c:394
> >> #19 0x0804ec8e in main (argc=1, argv=0xbfda3434) at main.c:1498
> >>
> >> _______________________________________________
> >> Openais mailing list
> >> [email protected]
> >> https://lists.linux-foundation.org/mailman/listinfo/openais
> > 
> 
> plain text document attachment (54511-final-ipc-lib.patch)
> commit f0c8b0c7b6355a418245bb514aecc4dfa6957ee6
> Author: Jan Friesse <[email protected]>
> Date:   Tue Jan 12 15:10:32 2010 +0100
> 
>     Handle freeze of IPC library connection
>     
>     RHBZ#547511
> 
> diff --git a/trunk/lib/coroipcc.c b/trunk/lib/coroipcc.c
> index aa9546c..f884171 100644
> --- a/trunk/lib/coroipcc.c
> +++ b/trunk/lib/coroipcc.c
> @@ -71,6 +71,11 @@
>  
>  #include "util.h"
>  
> +/*
> + * Define sem_wait timeout (real timeout will be (n-1;n) )
> + */
> +#define IPC_SEMWAIT_TIMEOUT 2
> +
>  struct ipc_instance {
>       int fd;
>  #if _POSIX_THREAD_PROCESS_SHARED < 1
> @@ -456,13 +461,31 @@ reply_receive (
>  {
>  #if _POSIX_THREAD_PROCESS_SHARED < 1
>       struct sembuf sop;
> +#else
> +     struct timespec timeout;
> +     struct pollfd pfd;
>  #endif
>       coroipc_response_header_t *response_header;
>       int res;
>  
>  #if _POSIX_THREAD_PROCESS_SHARED > 0
>  retry_semwait:
> -     res = sem_wait (&ipc_instance->control_buffer->sem1);
> +     timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT;
> +     timeout.tv_nsec = 0;
> +
> +     res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
> +     if (res == -1 && errno == ETIMEDOUT) {
> +             pfd.fd = ipc_instance->fd;
> +             pfd.events = 0;
> +
> +             poll (&pfd, 1, 0);
> +             if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
> +                     return (CS_ERR_LIBRARY);
> +             }
> +
> +             goto retry_semwait;
> +     }
> +
>       if (res == -1 && errno == EINTR) {
>               goto retry_semwait;
>       }
> @@ -505,12 +528,30 @@ reply_receive_in_buf (
>  {
>  #if _POSIX_THREAD_PROCESS_SHARED < 1
>       struct sembuf sop;
> +#else
> +     struct timespec timeout;
> +     struct pollfd pfd;
>  #endif
>       int res;
>  
>  #if _POSIX_THREAD_PROCESS_SHARED > 0
>  retry_semwait:
> -     res = sem_wait (&ipc_instance->control_buffer->sem1);
> +     timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT;
> +     timeout.tv_nsec = 0;
> +
> +     res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
> +     if (res == -1 && errno == ETIMEDOUT) {
> +             pfd.fd = ipc_instance->fd;
> +             pfd.events = 0;
> +
> +             poll (&pfd, 1, 0);
> +             if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
> +                     return (CS_ERR_LIBRARY);
> +             }
> +
> +             goto retry_semwait;
> +     }
> +
>       if (res == -1 && errno == EINTR) {
>               goto retry_semwait;
>       }
> plain text document attachment (54511-final-ipc-segfault.patch)
> commit 915ef9af4cdb75571e6c067b7344019b9bed4d1d
> Author: Jan Friesse <[email protected]>
> Date:   Tue Jan 12 15:11:27 2010 +0100
> 
>     Handle segfault on corosync exit
>     
>     RHBZ#547511
> 
> diff --git a/trunk/exec/coroipcs.c b/trunk/exec/coroipcs.c
> index 22643ee..f368887 100644
> --- a/trunk/exec/coroipcs.c
> +++ b/trunk/exec/coroipcs.c
> @@ -1054,6 +1054,11 @@ void coroipcs_ipc_exit (void)
>  
>               conn_info = list_entry (list, struct conn_info, list);
>  
> +             if (conn_info->state != CONN_STATE_THREAD_ACTIVE)
> +                     continue;
> +
> +             ipc_disconnect (conn_info);
> +
>  #if _POSIX_THREAD_PROCESS_SHARED > 0
>               sem_destroy (&conn_info->control_buffer->sem0);
>               sem_destroy (&conn_info->control_buffer->sem1);
> @@ -1064,17 +1069,16 @@ void coroipcs_ipc_exit (void)
>  
>               /*
>                * Unmap memory segments
> +              * Control buffer holds lock, so we will not unmap it
>                */
> -             res = munmap ((void *)conn_info->control_buffer,
> -                     conn_info->control_size);
> +/*           res = munmap ((void *)conn_info->control_buffer,
> +                     conn_info->control_size);*/
>               res = munmap ((void *)conn_info->request_buffer,
>                       conn_info->request_size);
>               res = munmap ((void *)conn_info->response_buffer,
>                       conn_info->response_size);
>               res = circular_memory_unmap (conn_info->dispatch_buffer,
>                       conn_info->dispatch_size);
> -
> -             sem_post_exit_thread (conn_info);
>       }
>  }
>  
> @@ -1656,7 +1660,6 @@ int coroipcs_handler_dispatch (
>                * ipc thread is the only reference at startup
>                */
>               conn_info->refcount = 1;
> -             conn_info->state = CONN_STATE_THREAD_ACTIVE;
>  
>               conn_info->private_data = api->malloc 
> (api->private_data_size_get (conn_info->service));
>               memset (conn_info->private_data, 0,
> @@ -1690,6 +1693,8 @@ int coroipcs_handler_dispatch (
>               if (conn_info->service == SOCKET_SERVICE_INIT) {
>                       conn_info->service = -1;
>               }
> +
> +             conn_info->state = CONN_STATE_THREAD_ACTIVE;
>       } else
>       if (revent & POLLIN) {
>               coroipcs_refcount_inc (conn_info);
> diff --git a/trunk/exec/main.c b/trunk/exec/main.c
> index 2a2fded..afc6e10 100644
> --- a/trunk/exec/main.c
> +++ b/trunk/exec/main.c
> @@ -56,6 +56,7 @@
>  #include <signal.h>
>  #include <sched.h>
>  #include <time.h>
> +#include <semaphore.h>
>  
>  #include <corosync/swab.h>
>  #include <corosync/corotypes.h>
> @@ -133,6 +134,10 @@ static hdb_handle_t object_connection_handle;
>  
>  static corosync_timer_handle_t corosync_stats_timer_handle;
>  
> +static pthread_t corosync_exit_thread;
> +
> +static sem_t corosync_exit_sem;
> +
>  hdb_handle_t corosync_poll_handle_get (void)
>  {
>       return (corosync_poll_handle);
> @@ -152,8 +157,8 @@ void corosync_state_dump (void)
>  static void unlink_all_completed (void)
>  {
>       poll_stop (0);
> -     totempg_finalize ();
>       coroipcs_ipc_exit ();
> +     totempg_finalize ();
>  
>       corosync_exit_error (AIS_DONE_EXIT);
>  }
> @@ -161,13 +166,23 @@ static void unlink_all_completed (void)
>  void corosync_shutdown_request (void)
>  {
>       static int called = 0;
> -     if (called) {
> -             return;
> -     }
> -     if (called == 0) {
> -             called = 1;
> +     if (called++ > 1) {
> +             /*
> +              * User really want exit this process -> hard exit
> +              */
> +             exit (-1);
>       }
> +
> +     sem_post (&corosync_exit_sem);
> +}
> +
> +static void *corosync_exit_thread_handler (void *arg)
> +{
> +     sem_wait (&corosync_exit_sem);
> +
>       corosync_service_unlink_all (api, unlink_all_completed);
> +
> +     return arg;
>  }
>  
>  static void sigusr2_handler (int num)
> @@ -1421,6 +1436,21 @@ int main (int argc, char **argv)
>  // TODO what is this hack for?       usleep(totem_config.token_timeout * 
> 2000);
>  
>       /*
> +      * Create semaphore and start "exit" thread
> +      */
> +     res = sem_init (&corosync_exit_sem, 0, 0);
> +     if (res != 0) {
> +             log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't 
> create exit thread.\n");
> +             corosync_exit_error (AIS_DONE_FATAL_ERR);
> +     }
> +
> +     res = pthread_create (&corosync_exit_thread, NULL, 
> corosync_exit_thread_handler, NULL);
> +     if (res != 0) {
> +             log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't 
> create exit thread.\n");
> +             corosync_exit_error (AIS_DONE_FATAL_ERR);
> +     }
> +
> +     /*
>        * if totempg_initialize doesn't have root priveleges, it cannot
>        * bind to a specific interface.  This only matters if
>        * there is more then one interface in a system, so

_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to