Steve,
please try following patch.
Regards,
Honza
Steven Dake wrote:
> On Thu, 2010-01-14 at 09:41 +0100, Jan Friesse wrote:
>> Steve,
>> first patch Committed revision 2643.
>>
>> But because of malloc race, corosync can still freeze, so it solves nothing.
>>
>> I'm really not sure about leak address space (it doesn't leak me anything).
>>
>> There is one solution to join thread. Main problem is with global lock
>> deadlock. If you will solve this, you will get another segfault from (to
>> me) unknown reason.
>>
>
> If a process calls mmap, it must call munmap when it is done with the
> memory. While it may not appear leaked, it is still mapped into the
> process address space and could use all the VM available.
>
> Regards
> -steve
>
>> Regards,
>> Honza
>>
>> Steven Dake wrote:
>>> The first attached patch is good for merge.
>>>
>>> The removal of the munamap is not suitable. It will cause corosync to
>>> leak address space. Need to find a different solution that cleans up
>>> properly.
>>>
>>> Regards
>>> -steve
>>>
>
diff --git a/trunk/exec/coroipcs.c b/trunk/exec/coroipcs.c
index 22643ee..87e34a4 100644
--- a/trunk/exec/coroipcs.c
+++ b/trunk/exec/coroipcs.c
@@ -121,6 +121,7 @@ union semun {
#endif
#endif
+static int corosync_exiting = 0;
enum conn_state {
CONN_STATE_THREAD_INACTIVE = 0,
@@ -624,7 +625,7 @@ static void *pthread_ipc_consumer (void *conn)
coroipc_response_header_t coroipc_response_header;
int send_ok;
unsigned int new_message;
-
+ coroipcs_handler_fn_lvalue handler_fn;
#if defined(HAVE_PTHREAD_SETSCHEDPARAM) && defined(HAVE_SCHED_GET_PRIORITY_MAX)
if (api->sched_policy != 0) {
res = pthread_setschedparam (conn_info->thread,
@@ -695,7 +696,11 @@ retry_semop:
if (send_ok) {
api->serialize_lock();
api->stats_increment_value (conn_info->stats_handle,
"requests");
- api->handler_fn_get (conn_info->service, header->id)
(conn_info, header);
+
+ handler_fn = api->handler_fn_get (conn_info->service,
header->id);
+ if (handler_fn)
+ handler_fn (conn_info, header);
+
api->serialize_unlock();
} else {
/*
@@ -1049,11 +1054,19 @@ void coroipcs_ipc_exit (void)
struct conn_info *conn_info;
unsigned int res;
+ corosync_exiting = 1;
+
for (list = conn_info_list_head.next; list != &conn_info_list_head;
list = list->next) {
conn_info = list_entry (list, struct conn_info, list);
+ if (conn_info->state != CONN_STATE_THREAD_ACTIVE)
+ continue;
+
+ ipc_disconnect (conn_info);
+ pthread_join (conn_info->thread, NULL);
+
#if _POSIX_THREAD_PROCESS_SHARED > 0
sem_destroy (&conn_info->control_buffer->sem0);
sem_destroy (&conn_info->control_buffer->sem1);
@@ -1073,8 +1086,6 @@ void coroipcs_ipc_exit (void)
conn_info->response_size);
res = circular_memory_unmap (conn_info->dispatch_buffer,
conn_info->dispatch_size);
-
- sem_post_exit_thread (conn_info);
}
}
@@ -1492,6 +1503,10 @@ retry_accept:
setsockopt(new_fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof (on));
#endif
+ if (corosync_exiting) {
+ return (0);
+ }
+
res = conn_info_create (new_fd);
if (res != 0) {
close (new_fd);
@@ -1656,7 +1671,6 @@ int coroipcs_handler_dispatch (
* ipc thread is the only reference at startup
*/
conn_info->refcount = 1;
- conn_info->state = CONN_STATE_THREAD_ACTIVE;
conn_info->private_data = api->malloc
(api->private_data_size_get (conn_info->service));
memset (conn_info->private_data, 0,
@@ -1690,6 +1704,8 @@ int coroipcs_handler_dispatch (
if (conn_info->service == SOCKET_SERVICE_INIT) {
conn_info->service = -1;
}
+
+ conn_info->state = CONN_STATE_THREAD_ACTIVE;
} else
if (revent & POLLIN) {
coroipcs_refcount_inc (conn_info);
diff --git a/trunk/exec/main.c b/trunk/exec/main.c
index 2a2fded..1421fe7 100644
--- a/trunk/exec/main.c
+++ b/trunk/exec/main.c
@@ -56,6 +56,7 @@
#include <signal.h>
#include <sched.h>
#include <time.h>
+#include <semaphore.h>
#include <corosync/swab.h>
#include <corosync/corotypes.h>
@@ -133,6 +134,12 @@ static hdb_handle_t object_connection_handle;
static corosync_timer_handle_t corosync_stats_timer_handle;
+static pthread_t corosync_exit_thread;
+
+static sem_t corosync_exit_sem;
+
+static int totempg_exiting = 0;
+
hdb_handle_t corosync_poll_handle_get (void)
{
return (corosync_poll_handle);
@@ -149,25 +156,17 @@ void corosync_state_dump (void)
}
}
-static void unlink_all_completed (void)
-{
- poll_stop (0);
- totempg_finalize ();
- coroipcs_ipc_exit ();
-
- corosync_exit_error (AIS_DONE_EXIT);
-}
-
void corosync_shutdown_request (void)
{
static int called = 0;
- if (called) {
- return;
- }
- if (called == 0) {
- called = 1;
+ if (called++ > 1) {
+ /*
+ * User really want exit this process -> hard exit
+ */
+ exit (-1);
}
- corosync_service_unlink_all (api, unlink_all_completed);
+
+ sem_post (&corosync_exit_sem);
}
static void sigusr2_handler (int num)
@@ -243,6 +242,26 @@ static void serialize_unlock (void)
}
#endif
+static void unlink_all_completed (void)
+{
+ poll_stop (0);
+ serialize_unlock ();
+ coroipcs_ipc_exit ();
+ totempg_exiting = 1;
+ totempg_finalize ();
+
+ corosync_exit_error (AIS_DONE_EXIT);
+}
+
+static void *corosync_exit_thread_handler (void *arg)
+{
+ sem_wait (&corosync_exit_sem);
+
+ corosync_service_unlink_all (api, unlink_all_completed);
+
+ return arg;
+}
+
static void corosync_sync_completed (void)
{
log_printf (LOGSYS_LEVEL_NOTICE,
@@ -443,6 +462,9 @@ static void corosync_totem_stats_updater (void *data)
int t, prev;
int32_t token_count;
+ if (totempg_exiting)
+ return ;
+
stats = api->totem_get_stats();
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
@@ -802,7 +824,7 @@ static coroipcs_exit_fn_lvalue corosync_exit_fn_get
(unsigned int service)
static coroipcs_handler_fn_lvalue corosync_handler_fn_get (unsigned int
service, unsigned int id)
{
- return (ais_service[service]->lib_engine[id].lib_handler_fn);
+ return (ais_service[service] ?
ais_service[service]->lib_engine[id].lib_handler_fn : NULL);
}
@@ -1421,6 +1443,21 @@ int main (int argc, char **argv)
// TODO what is this hack for? usleep(totem_config.token_timeout * 2000);
/*
+ * Create semaphore and start "exit" thread
+ */
+ res = sem_init (&corosync_exit_sem, 0, 0);
+ if (res != 0) {
+ log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't
create exit thread.\n");
+ corosync_exit_error (AIS_DONE_FATAL_ERR);
+ }
+
+ res = pthread_create (&corosync_exit_thread, NULL,
corosync_exit_thread_handler, NULL);
+ if (res != 0) {
+ log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't
create exit thread.\n");
+ corosync_exit_error (AIS_DONE_FATAL_ERR);
+ }
+
+ /*
* if totempg_initialize doesn't have root priveleges, it cannot
* bind to a specific interface. This only matters if
* there is more then one interface in a system, so
diff --git a/trunk/lib/coroipcc.c b/trunk/lib/coroipcc.c
index aa9546c..f884171 100644
--- a/trunk/lib/coroipcc.c
+++ b/trunk/lib/coroipcc.c
@@ -71,6 +71,11 @@
#include "util.h"
+/*
+ * Define sem_wait timeout (real timeout will be (n-1;n) )
+ */
+#define IPC_SEMWAIT_TIMEOUT 2
+
struct ipc_instance {
int fd;
#if _POSIX_THREAD_PROCESS_SHARED < 1
@@ -456,13 +461,31 @@ reply_receive (
{
#if _POSIX_THREAD_PROCESS_SHARED < 1
struct sembuf sop;
+#else
+ struct timespec timeout;
+ struct pollfd pfd;
#endif
coroipc_response_header_t *response_header;
int res;
#if _POSIX_THREAD_PROCESS_SHARED > 0
retry_semwait:
- res = sem_wait (&ipc_instance->control_buffer->sem1);
+ timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT;
+ timeout.tv_nsec = 0;
+
+ res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
+ if (res == -1 && errno == ETIMEDOUT) {
+ pfd.fd = ipc_instance->fd;
+ pfd.events = 0;
+
+ poll (&pfd, 1, 0);
+ if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
+ return (CS_ERR_LIBRARY);
+ }
+
+ goto retry_semwait;
+ }
+
if (res == -1 && errno == EINTR) {
goto retry_semwait;
}
@@ -505,12 +528,30 @@ reply_receive_in_buf (
{
#if _POSIX_THREAD_PROCESS_SHARED < 1
struct sembuf sop;
+#else
+ struct timespec timeout;
+ struct pollfd pfd;
#endif
int res;
#if _POSIX_THREAD_PROCESS_SHARED > 0
retry_semwait:
- res = sem_wait (&ipc_instance->control_buffer->sem1);
+ timeout.tv_sec = time(NULL) + IPC_SEMWAIT_TIMEOUT;
+ timeout.tv_nsec = 0;
+
+ res = sem_timedwait (&ipc_instance->control_buffer->sem1, &timeout);
+ if (res == -1 && errno == ETIMEDOUT) {
+ pfd.fd = ipc_instance->fd;
+ pfd.events = 0;
+
+ poll (&pfd, 1, 0);
+ if (pfd.revents == POLLERR || pfd.revents == POLLHUP) {
+ return (CS_ERR_LIBRARY);
+ }
+
+ goto retry_semwait;
+ }
+
if (res == -1 && errno == EINTR) {
goto retry_semwait;
}
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais