Related to https://bugzilla.redhat.com/show_bug.cgi?id=547511

This patch solves problem in little different (I hope better) way.

It fixes problem with sem_destroy + sem_wait and also solves hard freeze because malloc(*) + other functions are called in sighandler. This is reason, why special thread is created and only purpose in life of these thread is to wait for semaphore and begin shutdown sequence.

According to Fabbio, there are still some segfaults left on Fedora 12.

Regards,
   Honza

(*) according to glibc documentation, malloc and free can be called in signal handler, but in such case, I really don't understand this:
(gdb) bt
#0  0x00de1424 in __kernel_vsyscall ()
#1  0x002c7e43 in __lll_lock_wait_private () from /lib/libc.so.6
#2  0x00250b94 in _L_lock_9571 () from /lib/libc.so.6
#3  0x0024ebf4 in malloc () from /lib/libc.so.6
#4 0x08054d26 in hdb_handle_create (handle_database=0x805d748, instance_size=12, handle_id_out=0x805fa48) at ../include/corosync/hdb.h:178 #5 0x08055422 in schedwrk_create (handle=0x805fa48, schedwrk_fn=0x8050bee <unlink_all_schedwrk_handler>, context=0x805d5a0) at schedwrk.c:104 #6 0x08050d33 in corosync_service_unlink_all (api=0x805d5a0, unlink_all_complete=0x804b3fb <unlink_all_completed>) at service.c:583
#7  0x0804b491 in corosync_shutdown_request () at main.c:171
#8  0x0804b508 in sigintr_handler (num=2) at main.c:195
#9  <signal handler called>
#10 0x0024ca2b in _int_malloc () from /lib/libc.so.6
#11 0x0024ebfe in malloc () from /lib/libc.so.6
#12 0x0023a7df in __fopen_internal () from /lib/libc.so.6
#13 0x0023a8ac in fopen@@GLIBC_2.1 () from /lib/libc.so.6
#14 0x0053352a in pid_to_name (pid=18300, out_name=0xbfda2016 , name_len=32) at coroipcs.c:1515 #15 0x00533654 in coroipcs_init_conn_stats (conn=0x82d1bb8) at coroipcs.c:1557 #16 0x00533a29 in coroipcs_handler_dispatch (fd=10, revent=1, context=0x82d1bb8) at coroipcs.c:1670 #17 0x0804d498 in corosync_poll_handler_dispatch (handle=1197105576937521152, fd=10, revent=1, context=0x82d1bb8 at main.c:911
#18 0x0057b01b in poll_run (handle=1197105576937521152) at coropoll.c:394
#19 0x0804ec8e in main (argc=1, argv=0xbfda3434) at main.c:1498

diff --git a/trunk/exec/coroipcs.c b/trunk/exec/coroipcs.c
index 22643ee..a0ef133 100644
--- a/trunk/exec/coroipcs.c
+++ b/trunk/exec/coroipcs.c
@@ -1054,6 +1054,11 @@ void coroipcs_ipc_exit (void)
 
 		conn_info = list_entry (list, struct conn_info, list);
 
+		if (conn_info->state != CONN_STATE_THREAD_ACTIVE)
+			continue;
+
+		ipc_disconnect (conn_info);
+
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 		sem_destroy (&conn_info->control_buffer->sem0);
 		sem_destroy (&conn_info->control_buffer->sem1);
@@ -1073,8 +1078,6 @@ void coroipcs_ipc_exit (void)
 			conn_info->response_size);
 		res = circular_memory_unmap (conn_info->dispatch_buffer,
 			conn_info->dispatch_size);
-
-		sem_post_exit_thread (conn_info);
 	}
 }
 
@@ -1656,7 +1659,6 @@ int coroipcs_handler_dispatch (
 		 * ipc thread is the only reference at startup
 		 */
 		conn_info->refcount = 1;
-		conn_info->state = CONN_STATE_THREAD_ACTIVE;
 
 		conn_info->private_data = api->malloc (api->private_data_size_get (conn_info->service));
 		memset (conn_info->private_data, 0,
@@ -1690,6 +1692,8 @@ int coroipcs_handler_dispatch (
 		if (conn_info->service == SOCKET_SERVICE_INIT) {
 			conn_info->service = -1;
 		}
+
+		conn_info->state = CONN_STATE_THREAD_ACTIVE;
 	} else
 	if (revent & POLLIN) {
 		coroipcs_refcount_inc (conn_info);
diff --git a/trunk/exec/main.c b/trunk/exec/main.c
index 2a2fded..b2b84d2 100644
--- a/trunk/exec/main.c
+++ b/trunk/exec/main.c
@@ -56,6 +56,7 @@
 #include <signal.h>
 #include <sched.h>
 #include <time.h>
+#include <semaphore.h>
 
 #include <corosync/swab.h>
 #include <corosync/corotypes.h>
@@ -133,6 +134,10 @@ static hdb_handle_t object_connection_handle;
 
 static corosync_timer_handle_t corosync_stats_timer_handle;
 
+static pthread_t corosync_exit_thread;
+
+static sem_t corosync_exit_sem;
+
 hdb_handle_t corosync_poll_handle_get (void)
 {
 	return (corosync_poll_handle);
@@ -152,8 +157,8 @@ void corosync_state_dump (void)
 static void unlink_all_completed (void)
 {
 	poll_stop (0);
-	totempg_finalize ();
 	coroipcs_ipc_exit ();
+	totempg_finalize ();
 
 	corosync_exit_error (AIS_DONE_EXIT);
 }
@@ -167,7 +172,17 @@ void corosync_shutdown_request (void)
 	if (called == 0) {
 		called = 1;
 	}
+
+	sem_post (&corosync_exit_sem);
+}
+
+static void *corosync_exit_thread_handler (void *arg)
+{
+	sem_wait (&corosync_exit_sem);
+
 	corosync_service_unlink_all (api, unlink_all_completed);
+
+	return arg;
 }
 
 static void sigusr2_handler (int num)
@@ -1421,6 +1436,21 @@ int main (int argc, char **argv)
 // TODO what is this hack for?	usleep(totem_config.token_timeout * 2000);
 
 	/*
+	 * Create semaphore and start "exit" thread
+	 */
+	res = sem_init (&corosync_exit_sem, 0, 0);
+	if (res != 0) {
+		log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't create exit thread.\n");
+		corosync_exit_error (AIS_DONE_FATAL_ERR);
+	}
+
+	res = pthread_create (&corosync_exit_thread, NULL, corosync_exit_thread_handler, NULL);
+	if (res != 0) {
+		log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't create exit thread.\n");
+		corosync_exit_error (AIS_DONE_FATAL_ERR);
+	}
+
+	/*
 	 * if totempg_initialize doesn't have root priveleges, it cannot
 	 * bind to a specific interface.  This only matters if
 	 * there is more then one interface in a system, so
diff --git a/trunk/lib/coroipcc.c b/trunk/lib/coroipcc.c
index aa9546c..918b5ae 100644
--- a/trunk/lib/coroipcc.c
+++ b/trunk/lib/coroipcc.c
@@ -456,13 +456,31 @@ reply_receive (
 {
 #if _POSIX_THREAD_PROCESS_SHARED < 1
 	struct sembuf sop;
+#else
+	struct timespec timeout;
+	struct pollfd pfd;
 #endif
 	coroipc_response_header_t *response_header;
 	int res;
 
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 retry_semwait:
-	res = sem_wait (&ipc_instance->control_buffer->sem1);
+	timeout.tv_sec = time(NULL) + 2;
+	timeout.tv_nsec = 0;
+
+	res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
+	if (res == -1 && errno == ETIMEDOUT) {
+		pfd.fd = ipc_instance->fd;
+		pfd.events = 0;
+
+		poll (&pfd, 1, 0);
+		if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
+			return (CS_ERR_LIBRARY);
+		}
+
+		goto retry_semwait;
+	}
+
 	if (res == -1 && errno == EINTR) {
 		goto retry_semwait;
 	}
@@ -505,12 +523,30 @@ reply_receive_in_buf (
 {
 #if _POSIX_THREAD_PROCESS_SHARED < 1
 	struct sembuf sop;
+#else
+	struct timespec timeout;
+	struct pollfd pfd;
 #endif
 	int res;
 
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 retry_semwait:
-	res = sem_wait (&ipc_instance->control_buffer->sem1);
+	timeout.tv_sec = time(NULL) + 2;
+	timeout.tv_nsec = 0;
+
+	res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
+	if (res == -1 && errno == ETIMEDOUT) {
+		pfd.fd = ipc_instance->fd;
+		pfd.events = 0;
+
+		poll (&pfd, 1, 0);
+		if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
+			return (CS_ERR_LIBRARY);
+		}
+
+		goto retry_semwait;
+	}
+
 	if (res == -1 && errno == EINTR) {
 		goto retry_semwait;
 	}
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to