Steve,
please try following patch.

Regards,
  Honza

Steven Dake wrote:
> On Thu, 2010-01-14 at 09:41 +0100, Jan Friesse wrote:
>> Steve,
>> first patch: Committed revision 2643.
>>
>> But because of malloc race, corosync can still freeze, so it solves nothing.
>>
>> I'm really not sure about the address space leak (it doesn't leak anything for me).
>>
>> There is one solution: join the thread. The main problem is a deadlock on the
>> global lock. If you solve that, you will get another segfault, for a reason
>> unknown (to me).
>>
> 
> If a process calls mmap, it must call munmap when it is done with the
> memory.  While it may not appear leaked, it is still mapped into the
> process address space and could use all the VM available.
> 
> Regards
> -steve
> 
>> Regards,
>>   Honza
>>
>> Steven Dake wrote:
>>> The first attached patch is good for merge.
>>>
>>> The removal of the munmap is not suitable.  It will cause corosync to
>>> leak address space.  Need to find a different solution that cleans up
>>> properly.
>>>
>>> Regards
>>> -steve
>>>
> 

diff --git a/trunk/exec/coroipcs.c b/trunk/exec/coroipcs.c
index 22643ee..87e34a4 100644
--- a/trunk/exec/coroipcs.c
+++ b/trunk/exec/coroipcs.c
@@ -121,6 +121,7 @@ union semun {
 #endif
 #endif
 
+static int corosync_exiting = 0;
 
 enum conn_state {
        CONN_STATE_THREAD_INACTIVE = 0,
@@ -624,7 +625,7 @@ static void *pthread_ipc_consumer (void *conn)
        coroipc_response_header_t coroipc_response_header;
        int send_ok;
        unsigned int new_message;
-
+       coroipcs_handler_fn_lvalue handler_fn;
 #if defined(HAVE_PTHREAD_SETSCHEDPARAM) && defined(HAVE_SCHED_GET_PRIORITY_MAX)
        if (api->sched_policy != 0) {
                res = pthread_setschedparam (conn_info->thread,
@@ -695,7 +696,11 @@ retry_semop:
                if (send_ok) {
                        api->serialize_lock();
                        api->stats_increment_value (conn_info->stats_handle, "requests");
-                       api->handler_fn_get (conn_info->service, header->id) (conn_info, header);
+
+                       handler_fn = api->handler_fn_get (conn_info->service, header->id);
+                       if (handler_fn) 
+                               handler_fn (conn_info, header);
+
                        api->serialize_unlock();
                } else {
                        /*
@@ -1049,11 +1054,19 @@ void coroipcs_ipc_exit (void)
        struct conn_info *conn_info;
        unsigned int res;
 
+       corosync_exiting = 1;
+
        for (list = conn_info_list_head.next; list != &conn_info_list_head;
                list = list->next) {
 
                conn_info = list_entry (list, struct conn_info, list);
 
+               if (conn_info->state != CONN_STATE_THREAD_ACTIVE)
+                       continue;
+
+               ipc_disconnect (conn_info);
+               pthread_join (conn_info->thread, NULL);
+
 #if _POSIX_THREAD_PROCESS_SHARED > 0
                sem_destroy (&conn_info->control_buffer->sem0);
                sem_destroy (&conn_info->control_buffer->sem1);
@@ -1073,8 +1086,6 @@ void coroipcs_ipc_exit (void)
                        conn_info->response_size);
                res = circular_memory_unmap (conn_info->dispatch_buffer,
                        conn_info->dispatch_size);
-
-               sem_post_exit_thread (conn_info);
        }
 }
 
@@ -1492,6 +1503,10 @@ retry_accept:
        setsockopt(new_fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof (on));
 #endif
 
+       if (corosync_exiting) {
+               return (0);
+       }
+
        res = conn_info_create (new_fd);
        if (res != 0) {
                close (new_fd);
@@ -1656,7 +1671,6 @@ int coroipcs_handler_dispatch (
                 * ipc thread is the only reference at startup
                 */
                conn_info->refcount = 1;
-               conn_info->state = CONN_STATE_THREAD_ACTIVE;
 
                conn_info->private_data = api->malloc (api->private_data_size_get (conn_info->service));
                memset (conn_info->private_data, 0,
@@ -1690,6 +1704,8 @@ int coroipcs_handler_dispatch (
                if (conn_info->service == SOCKET_SERVICE_INIT) {
                        conn_info->service = -1;
                }
+
+               conn_info->state = CONN_STATE_THREAD_ACTIVE;
        } else
        if (revent & POLLIN) {
                coroipcs_refcount_inc (conn_info);
diff --git a/trunk/exec/main.c b/trunk/exec/main.c
index 2a2fded..1421fe7 100644
--- a/trunk/exec/main.c
+++ b/trunk/exec/main.c
@@ -56,6 +56,7 @@
 #include <signal.h>
 #include <sched.h>
 #include <time.h>
+#include <semaphore.h>
 
 #include <corosync/swab.h>
 #include <corosync/corotypes.h>
@@ -133,6 +134,12 @@ static hdb_handle_t object_connection_handle;
 
 static corosync_timer_handle_t corosync_stats_timer_handle;
 
+static pthread_t corosync_exit_thread;
+
+static sem_t corosync_exit_sem;
+
+static int totempg_exiting = 0;
+
 hdb_handle_t corosync_poll_handle_get (void)
 {
        return (corosync_poll_handle);
@@ -149,25 +156,17 @@ void corosync_state_dump (void)
        }
 }
 
-static void unlink_all_completed (void)
-{
-       poll_stop (0);
-       totempg_finalize ();
-       coroipcs_ipc_exit ();
-
-       corosync_exit_error (AIS_DONE_EXIT);
-}
-
 void corosync_shutdown_request (void)
 {
        static int called = 0;
-       if (called) {
-               return;
-       }
-       if (called == 0) {
-               called = 1;
+       if (called++ > 1) {
+               /*
+                * User really want exit this process -> hard exit
+                */
+               exit (-1);
        }
-       corosync_service_unlink_all (api, unlink_all_completed);
+
+       sem_post (&corosync_exit_sem);
 }
 
 static void sigusr2_handler (int num)
@@ -243,6 +242,26 @@ static void serialize_unlock (void)
 }
 #endif
 
+static void unlink_all_completed (void)
+{
+       poll_stop (0);
+       serialize_unlock ();
+       coroipcs_ipc_exit ();
+       totempg_exiting = 1;
+       totempg_finalize ();
+
+       corosync_exit_error (AIS_DONE_EXIT);
+}
+
+static void *corosync_exit_thread_handler (void *arg)
+{
+       sem_wait (&corosync_exit_sem);
+
+       corosync_service_unlink_all (api, unlink_all_completed);
+
+       return arg;
+}
+
 static void corosync_sync_completed (void)
 {
        log_printf (LOGSYS_LEVEL_NOTICE,
@@ -443,6 +462,9 @@ static void corosync_totem_stats_updater (void *data)
        int t, prev;
        int32_t token_count;
 
+       if (totempg_exiting)
+               return ;
+
        stats = api->totem_get_stats();
 
        objdb->object_key_replace (stats->mrp->srp->hdr.handle,
@@ -802,7 +824,7 @@ static coroipcs_exit_fn_lvalue corosync_exit_fn_get (unsigned int service)
 
 static coroipcs_handler_fn_lvalue corosync_handler_fn_get (unsigned int service, unsigned int id)
 {
-       return (ais_service[service]->lib_engine[id].lib_handler_fn);
+       return (ais_service[service] ? ais_service[service]->lib_engine[id].lib_handler_fn : NULL);
 }
 
 
@@ -1421,6 +1443,21 @@ int main (int argc, char **argv)
 // TODO what is this hack for? usleep(totem_config.token_timeout * 2000);
 
        /*
+        * Create semaphore and start "exit" thread
+        */
+       res = sem_init (&corosync_exit_sem, 0, 0);
+       if (res != 0) {
+               log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't create exit thread.\n");
+               corosync_exit_error (AIS_DONE_FATAL_ERR);
+       }
+
+       res = pthread_create (&corosync_exit_thread, NULL, corosync_exit_thread_handler, NULL);
+       if (res != 0) {
+               log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't create exit thread.\n");
+               corosync_exit_error (AIS_DONE_FATAL_ERR);
+       }
+
+       /*
         * if totempg_initialize doesn't have root priveleges, it cannot
         * bind to a specific interface.  This only matters if
         * there is more then one interface in a system, so
diff --git a/trunk/lib/coroipcc.c b/trunk/lib/coroipcc.c
index aa9546c..f884171 100644
--- a/trunk/lib/coroipcc.c
+++ b/trunk/lib/coroipcc.c
@@ -71,6 +71,11 @@
 
 #include "util.h"
 
+/*
+ * Define sem_wait timeout (real timeout will be (n-1;n) )
+ */
+#define IPC_SEMWAIT_TIMEOUT 2
+
 struct ipc_instance {
        int fd;
 #if _POSIX_THREAD_PROCESS_SHARED < 1
@@ -456,13 +461,31 @@ reply_receive (
 {
 #if _POSIX_THREAD_PROCESS_SHARED < 1
        struct sembuf sop;
+#else
+       struct timespec timeout;
+       struct pollfd pfd;
 #endif
        coroipc_response_header_t *response_header;
        int res;
 
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 retry_semwait:
-       res = sem_wait (&ipc_instance->control_buffer->sem1);
+       timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT;
+       timeout.tv_nsec = 0;
+
+       res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
+       if (res == -1 && errno == ETIMEDOUT) {
+               pfd.fd = ipc_instance->fd;
+               pfd.events = 0;
+
+               poll (&pfd, 1, 0);
+               if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
+                       return (CS_ERR_LIBRARY);
+               }
+
+               goto retry_semwait;
+       }
+
        if (res == -1 && errno == EINTR) {
                goto retry_semwait;
        }
@@ -505,12 +528,30 @@ reply_receive_in_buf (
 {
 #if _POSIX_THREAD_PROCESS_SHARED < 1
        struct sembuf sop;
+#else
+       struct timespec timeout;
+       struct pollfd pfd;
 #endif
        int res;
 
 #if _POSIX_THREAD_PROCESS_SHARED > 0
 retry_semwait:
-       res = sem_wait (&ipc_instance->control_buffer->sem1);
+       timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT;
+       timeout.tv_nsec = 0;
+
+       res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
+       if (res == -1 && errno == ETIMEDOUT) {
+               pfd.fd = ipc_instance->fd;
+               pfd.events = 0;
+
+               poll (&pfd, 1, 0);
+               if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
+                       return (CS_ERR_LIBRARY);
+               }
+
+               goto retry_semwait;
+       }
+
        if (res == -1 && errno == EINTR) {
                goto retry_semwait;
        }
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to