Releated to https://bugzilla.redhat.com/show_bug.cgi?id=547511 bug.

It looks like MAIN problem was hidden in following posix definition:

Destroying a semaphore that other processes or threads are currently blocked on (in sem_wait(3)) produces undefined behavior.


Sadly this is exactly what we were doing.

So patch is doing 2 things:
- Don't accept new connections, when shutdown is in progress

- Send message about our shutdown -> every process waiting in sem_wait is no longer waiting. In coroipcc, we will NEVER allow to sem_wait again after receiving this message. Disconnect IPC and join to thread (so it can handle its actions)

Patch looks pretty stable, most of the valgrind errors are gone and corosync will shutdown properly every time in my tests.

Regards,
  Honza

diff --git a/trunk/exec/coroipcs.c b/trunk/exec/coroipcs.c
index 22643ee..641f035 100644
--- a/trunk/exec/coroipcs.c
+++ b/trunk/exec/coroipcs.c
@@ -98,6 +98,8 @@ static struct coroipcs_init_state_v2 *api = NULL;
 
 DECLARE_LIST_INIT (conn_info_list_head);
 
+static int corosync_shutdown_in_progress = 0;
+
 struct outq_item {
 	void *msg;
 	size_t mlen;
@@ -1043,6 +1045,28 @@ static void _corosync_ipc_init(void)
 	api->poll_accept_add (server_fd);
 }
 
+void coroipcs_ipc_send_exitall (void) {
+        coroipc_response_header_t coroipc_response_header;
+	struct list_head *list;
+	struct conn_info *conn_info;
+
+	corosync_shutdown_in_progress = 1;
+
+	for (list = conn_info_list_head.next; list != &conn_info_list_head;) {
+		conn_info = list_entry (list, struct conn_info, list);
+		list = list->next;
+
+		coroipc_response_header.size = sizeof (coroipc_response_header_t);
+		coroipc_response_header.id = COROIPC_SHUTDOWN;
+		coroipc_response_header.error = CS_ERR_LIBRARY;
+		coroipcs_response_send (conn_info,
+			&coroipc_response_header,
+			sizeof (coroipc_response_header_t));
+		ipc_disconnect (conn_info);
+		pthread_join (conn_info->thread, NULL);
+	}
+}
+
 void coroipcs_ipc_exit (void)
 {
 	struct list_head *list;
@@ -1054,6 +1078,8 @@ void coroipcs_ipc_exit (void)
 
 		conn_info = list_entry (list, struct conn_info, list);
 
+		sem_post_exit_thread (conn_info);
+
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 		sem_destroy (&conn_info->control_buffer->sem0);
 		sem_destroy (&conn_info->control_buffer->sem1);
@@ -1073,8 +1099,6 @@ void coroipcs_ipc_exit (void)
 			conn_info->response_size);
 		res = circular_memory_unmap (conn_info->dispatch_buffer,
 			conn_info->dispatch_size);
-
-		sem_post_exit_thread (conn_info);
 	}
 }
 
@@ -1460,6 +1484,12 @@ int coroipcs_handler_accept (
 
 	addrlen = sizeof (struct sockaddr_un);
 
+	if (corosync_shutdown_in_progress) {
+		log_printf (LOGSYS_LEVEL_ERROR,
+			"Corosync shutdown in progress. Do not accept new connection\n");
+		return -1;
+	}
+
 retry_accept:
 	new_fd = accept (fd, (struct sockaddr *)&un_addr, &addrlen);
 	if (new_fd == -1 && errno == EINTR) {
diff --git a/trunk/exec/main.c b/trunk/exec/main.c
index 2a2fded..e6bd7a9 100644
--- a/trunk/exec/main.c
+++ b/trunk/exec/main.c
@@ -167,6 +167,9 @@ void corosync_shutdown_request (void)
 	if (called == 0) {
 		called = 1;
 	}
+
+	coroipcs_ipc_send_exitall ();
+
 	corosync_service_unlink_all (api, unlink_all_completed);
 }
 
diff --git a/trunk/include/corosync/coroipc_ipc.h b/trunk/include/corosync/coroipc_ipc.h
index 5126faa..b17f386 100644
--- a/trunk/include/corosync/coroipc_ipc.h
+++ b/trunk/include/corosync/coroipc_ipc.h
@@ -141,6 +141,8 @@ struct coroipcs_zc_header {
 
 #define SOCKET_SERVICE_INIT	0xFFFFFFFF
 
+#define COROIPC_SHUTDOWN	0xFFFFFFFF
+
 #define ZC_ALLOC_HEADER		0xFFFFFFFF
 #define ZC_FREE_HEADER		0xFFFFFFFE
 #define ZC_EXECUTE_HEADER	0xFFFFFFFD
diff --git a/trunk/include/corosync/coroipcs.h b/trunk/include/corosync/coroipcs.h
index 3838af8..9776402 100644
--- a/trunk/include/corosync/coroipcs.h
+++ b/trunk/include/corosync/coroipcs.h
@@ -152,6 +152,8 @@ extern void coroipcs_refcount_dec (void *conn);
 
 extern void coroipcs_ipc_exit (void);
 
+extern void coroipcs_ipc_send_exitall (void);
+
 extern int coroipcs_handler_accept (
 	int fd,
 	int revent,
diff --git a/trunk/lib/coroipcc.c b/trunk/lib/coroipcc.c
index aa9546c..3204d14 100644
--- a/trunk/lib/coroipcc.c
+++ b/trunk/lib/coroipcc.c
@@ -87,6 +87,7 @@ struct ipc_instance {
 	size_t dispatch_size;
 	uid_t euid;
 	pthread_mutex_t mutex;
+	int corosync_shutdown_in_progress;
 };
 
 void ipc_hdb_destructor (void *context);
@@ -460,6 +461,10 @@ reply_receive (
 	coroipc_response_header_t *response_header;
 	int res;
 
+	if (ipc_instance->corosync_shutdown_in_progress) {
+		return (CS_ERR_LIBRARY);
+	}
+
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 retry_semwait:
 	res = sem_wait (&ipc_instance->control_buffer->sem1);
@@ -494,6 +499,12 @@ retry_semop:
 		return (CS_ERR_TRY_AGAIN);
 	}
 
+	if (response_header->id == COROIPC_SHUTDOWN) {
+		ipc_instance->corosync_shutdown_in_progress = 1;
+
+		return (CS_ERR_LIBRARY);
+	}
+
 	memcpy (res_msg, ipc_instance->response_buffer, res_len);
 	return (CS_OK);
 }
@@ -508,6 +519,10 @@ reply_receive_in_buf (
 #endif
 	int res;
 
+	if (ipc_instance->corosync_shutdown_in_progress) {
+		return (CS_ERR_LIBRARY);
+	}
+
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 retry_semwait:
 	res = sem_wait (&ipc_instance->control_buffer->sem1);
@@ -537,6 +552,12 @@ retry_semop:
 	}
 #endif
 
+	if (((coroipc_response_header_t *)ipc_instance->response_buffer)->id == COROIPC_SHUTDOWN) {
+		ipc_instance->corosync_shutdown_in_progress = 1;
+
+		return (CS_ERR_LIBRARY);
+	}
+
 	*res_msg = (char *)ipc_instance->response_buffer;
 	return (CS_OK);
 }
@@ -581,6 +602,8 @@ coroipcc_service_connect (
 		return (res);
 	}
 
+	ipc_instance->corosync_shutdown_in_progress = 0;
+
 	res_setup.error = CS_ERR_LIBRARY;
 
 #if defined(COROSYNC_SOLARIS)
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to