The first attached patch is good for merge. The removal of the munamap is not suitable. It will cause corosync to leak address space. Need to find a different solution that cleans up properly.
Regards -steve On Tue, 2010-01-12 at 15:25 +0100, Jan Friesse wrote: > Steve, > your deduction of logic is correct (with exception of two seconds, in > reality it will be (1;2) seconds). > > Patch is now in two parts. > > I hope, this will work on F12 without Segfaults, even solution (remove > one unmap) is not ideal. On the other hand, Because it is right BEFORE > death, it should be so big deal. > > Regards, > Honza > > Steven Dake wrote: > > Question about magic number "2" in the coroipcc.c code. > > > > Is the logic to try sem_timedwait, then retry after two seconds if no > > response has been executed from the server? Why two seconds? Is this > > to return an error code to the library when the semaphore is deleted in > > the server? > > > > If so, two seconds sounds ok, but please make it a define in the C file > > rather then magic number. > > > > The patch contains two issues. Please commit them as separate patches > > with separate descriptions. Good catch on the malloc in signal handler. > > glibc should be able to handle that, but it appears that is not the case > > here. > > > > Good investigative work > > > > Regards > > -steve > > > > On Fri, 2010-01-08 at 12:22 +0100, Jan Friesse wrote: > >> Related to https://bugzilla.redhat.com/show_bug.cgi?id=547511 > >> > >> This patch solves problem in little different (I hope better) way. > >> > >> It fixes problem with sem_destroy + sem_wait and also solves hard freeze > >> because malloc(*) + other functions are called in sighandler. This is > >> reason, why special thread is created and only purpose in life of these > >> thread is to wait for semaphore and begin shutdown sequence. > >> > >> According to Fabbio, there are still some segfaults left on Fedora 12. > >> > >> Regards, > >> Honza > >> > >> (*) according to glibc documentation, malloc and free can be called in > >> signal handler, but in such case, I really don't understand this: > >> (gdb) bt > >> #0 0x00de1424 in __kernel_vsyscall () > >> #1 0x002c7e43 in __lll_lock_wait_private () from /lib/libc.so.6 > >> #2 0x00250b94 in _L_lock_9571 () from /lib/libc.so.6 > >> #3 0x0024ebf4 in malloc () from /lib/libc.so.6 > >> #4 0x08054d26 in hdb_handle_create (handle_database=0x805d748, > >> instance_size=12, handle_id_out=0x805fa48) at ../include/corosync/hdb.h:178 > >> #5 0x08055422 in schedwrk_create (handle=0x805fa48, > >> schedwrk_fn=0x8050bee <unlink_all_schedwrk_handler>, context=0x805d5a0) > >> at schedwrk.c:104 > >> #6 0x08050d33 in corosync_service_unlink_all (api=0x805d5a0, > >> unlink_all_complete=0x804b3fb <unlink_all_completed>) at service.c:583 > >> #7 0x0804b491 in corosync_shutdown_request () at main.c:171 > >> #8 0x0804b508 in sigintr_handler (num=2) at main.c:195 > >> #9 <signal handler called> > >> #10 0x0024ca2b in _int_malloc () from /lib/libc.so.6 > >> #11 0x0024ebfe in malloc () from /lib/libc.so.6 > >> #12 0x0023a7df in __fopen_internal () from /lib/libc.so.6 > >> #13 0x0023a8ac in fopen@@GLIBC_2.1 () from /lib/libc.so.6 > >> #14 0x0053352a in pid_to_name (pid=18300, out_name=0xbfda2016 , > >> name_len=32) at coroipcs.c:1515 > >> #15 0x00533654 in coroipcs_init_conn_stats (conn=0x82d1bb8) at > >> coroipcs.c:1557 > >> #16 0x00533a29 in coroipcs_handler_dispatch (fd=10, revent=1, > >> context=0x82d1bb8) at coroipcs.c:1670 > >> #17 0x0804d498 in corosync_poll_handler_dispatch > >> (handle=1197105576937521152, fd=10, revent=1, context=0x82d1bb8 at > >> main.c:911 > >> #18 0x0057b01b in poll_run (handle=1197105576937521152) at coropoll.c:394 > >> #19 0x0804ec8e in main (argc=1, argv=0xbfda3434) at main.c:1498 > >> > >> _______________________________________________ > >> Openais mailing list > >> [email protected] > >> https://lists.linux-foundation.org/mailman/listinfo/openais > > > > plain text document attachment (54511-final-ipc-lib.patch) > commit f0c8b0c7b6355a418245bb514aecc4dfa6957ee6 > Author: Jan Friesse <[email protected]> > Date: Tue Jan 12 15:10:32 2010 +0100 > > Handle freeze of IPC library connection > > RHBZ#547511 > > diff --git a/trunk/lib/coroipcc.c b/trunk/lib/coroipcc.c > index aa9546c..f884171 100644 > --- a/trunk/lib/coroipcc.c > +++ b/trunk/lib/coroipcc.c > @@ -71,6 +71,11 @@ > > #include "util.h" > > +/* > + * Define sem_wait timeout (real timeout will be (n-1;n) ) > + */ > +#define IPC_SEMWAIT_TIMEOUT 2 > + > struct ipc_instance { > int fd; > #if _POSIX_THREAD_PROCESS_SHARED < 1 > @@ -456,13 +461,31 @@ reply_receive ( > { > #if _POSIX_THREAD_PROCESS_SHARED < 1 > struct sembuf sop; > +#else > + struct timespec timeout; > + struct pollfd pfd; > #endif > coroipc_response_header_t *response_header; > int res; > > #if _POSIX_THREAD_PROCESS_SHARED > 0 > retry_semwait: > - res = sem_wait (&ipc_instance->control_buffer->sem1); > + timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT; > + timeout.tv_nsec = 0; > + > + res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout); > + if (res == -1 && errno == ETIMEDOUT) { > + pfd.fd = ipc_instance->fd; > + pfd.events = 0; > + > + poll (&pfd, 1, 0); > + if (pfd.revents == POLLERR || pfd.revents == POLLHUP) { > + return (CS_ERR_LIBRARY); > + } > + > + goto retry_semwait; > + } > + > if (res == -1 && errno == EINTR) { > goto retry_semwait; > } > @@ -505,12 +528,30 @@ reply_receive_in_buf ( > { > #if _POSIX_THREAD_PROCESS_SHARED < 1 > struct sembuf sop; > +#else > + struct timespec timeout; > + struct pollfd pfd; > #endif > int res; > > #if _POSIX_THREAD_PROCESS_SHARED > 0 > retry_semwait: > - res = sem_wait (&ipc_instance->control_buffer->sem1); > + timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT; > + timeout.tv_nsec = 0; > + > + res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout); > + if (res == -1 && errno == ETIMEDOUT) { > + pfd.fd = ipc_instance->fd; > + pfd.events = 0; > + > + poll (&pfd, 1, 0); > + if (pfd.revents == POLLERR || pfd.revents == POLLHUP) { > + return (CS_ERR_LIBRARY); > + } > + > + goto retry_semwait; > + } > + > if (res == -1 && errno == EINTR) { > goto retry_semwait; > } > plain text document attachment (54511-final-ipc-segfault.patch) > commit 915ef9af4cdb75571e6c067b7344019b9bed4d1d > Author: Jan Friesse <[email protected]> > Date: Tue Jan 12 15:11:27 2010 +0100 > > Handle segfault on corosync exit > > RHBZ#547511 > > diff --git a/trunk/exec/coroipcs.c b/trunk/exec/coroipcs.c > index 22643ee..f368887 100644 > --- a/trunk/exec/coroipcs.c > +++ b/trunk/exec/coroipcs.c > @@ -1054,6 +1054,11 @@ void coroipcs_ipc_exit (void) > > conn_info = list_entry (list, struct conn_info, list); > > + if (conn_info->state != CONN_STATE_THREAD_ACTIVE) > + continue; > + > + ipc_disconnect (conn_info); > + > #if _POSIX_THREAD_PROCESS_SHARED > 0 > sem_destroy (&conn_info->control_buffer->sem0); > sem_destroy (&conn_info->control_buffer->sem1); > @@ -1064,17 +1069,16 @@ void coroipcs_ipc_exit (void) > > /* > * Unmap memory segments > + * Control buffer holds lock, so we will not unmap it > */ > - res = munmap ((void *)conn_info->control_buffer, > - conn_info->control_size); > +/* res = munmap ((void *)conn_info->control_buffer, > + conn_info->control_size);*/ > res = munmap ((void *)conn_info->request_buffer, > conn_info->request_size); > res = munmap ((void *)conn_info->response_buffer, > conn_info->response_size); > res = circular_memory_unmap (conn_info->dispatch_buffer, > conn_info->dispatch_size); > - > - sem_post_exit_thread (conn_info); > } > } > > @@ -1656,7 +1660,6 @@ int coroipcs_handler_dispatch ( > * ipc thread is the only reference at startup > */ > conn_info->refcount = 1; > - conn_info->state = CONN_STATE_THREAD_ACTIVE; > > conn_info->private_data = api->malloc > (api->private_data_size_get (conn_info->service)); > memset (conn_info->private_data, 0, > @@ -1690,6 +1693,8 @@ int coroipcs_handler_dispatch ( > if (conn_info->service == SOCKET_SERVICE_INIT) { > conn_info->service = -1; > } > + > + conn_info->state = CONN_STATE_THREAD_ACTIVE; > } else > if (revent & POLLIN) { > coroipcs_refcount_inc (conn_info); > diff --git a/trunk/exec/main.c b/trunk/exec/main.c > index 2a2fded..afc6e10 100644 > --- a/trunk/exec/main.c > +++ b/trunk/exec/main.c > @@ -56,6 +56,7 @@ > #include <signal.h> > #include <sched.h> > #include <time.h> > +#include <semaphore.h> > > #include <corosync/swab.h> > #include <corosync/corotypes.h> > @@ -133,6 +134,10 @@ static hdb_handle_t object_connection_handle; > > static corosync_timer_handle_t corosync_stats_timer_handle; > > +static pthread_t corosync_exit_thread; > + > +static sem_t corosync_exit_sem; > + > hdb_handle_t corosync_poll_handle_get (void) > { > return (corosync_poll_handle); > @@ -152,8 +157,8 @@ void corosync_state_dump (void) > static void unlink_all_completed (void) > { > poll_stop (0); > - totempg_finalize (); > coroipcs_ipc_exit (); > + totempg_finalize (); > > corosync_exit_error (AIS_DONE_EXIT); > } > @@ -161,13 +166,23 @@ static void unlink_all_completed (void) > void corosync_shutdown_request (void) > { > static int called = 0; > - if (called) { > - return; > - } > - if (called == 0) { > - called = 1; > + if (called++ > 1) { > + /* > + * User really want exit this process -> hard exit > + */ > + exit (-1); > } > + > + sem_post (&corosync_exit_sem); > +} > + > +static void *corosync_exit_thread_handler (void *arg) > +{ > + sem_wait (&corosync_exit_sem); > + > corosync_service_unlink_all (api, unlink_all_completed); > + > + return arg; > } > > static void sigusr2_handler (int num) > @@ -1421,6 +1436,21 @@ int main (int argc, char **argv) > // TODO what is this hack for? usleep(totem_config.token_timeout * > 2000); > > /* > + * Create semaphore and start "exit" thread > + */ > + res = sem_init (&corosync_exit_sem, 0, 0); > + if (res != 0) { > + log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't > create exit thread.\n"); > + corosync_exit_error (AIS_DONE_FATAL_ERR); > + } > + > + res = pthread_create (&corosync_exit_thread, NULL, > corosync_exit_thread_handler, NULL); > + if (res != 0) { > + log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't > create exit thread.\n"); > + corosync_exit_error (AIS_DONE_FATAL_ERR); > + } > + > + /* > * if totempg_initialize doesn't have root priveleges, it cannot > * bind to a specific interface. This only matters if > * there is more then one interface in a system, so _______________________________________________ Openais mailing list [email protected] https://lists.linux-foundation.org/mailman/listinfo/openais
