CVSROOT:        /cvs/cluster
Module name:    cluster
Branch:         RHEL5
Changes by:     [EMAIL PROTECTED]       2007-07-24 13:53:08

Modified files:
        rgmanager      : ChangeLog 
        rgmanager/src/clulib: Makefile msg_cluster.c msgtest.c vft.c 
        rgmanager/src/daemons: Makefile groups.c main.c nodeevent.c 
                               rg_event.c rg_forward.c rg_state.c 
                               rg_thread.c 
        rgmanager/src/utils: clusvcadm.c 
Added files:
        rgmanager/src/clulib: tmgr.c 

Log message:
        Fix #249314, #247291, #249311

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.31.2.21&r2=1.31.2.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/tmgr.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=NONE&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.10.2.2&r2=1.10.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msg_cluster.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4&r2=1.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msgtest.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.2&r2=1.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/vft.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.17.2.2&r2=1.17.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.14.2.2&r2=1.14.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.9&r2=1.25.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.34.2.6&r2=1.34.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.4.2.3&r2=1.4.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_event.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1&r2=1.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_forward.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.8.2.1&r2=1.8.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.10&r2=1.24.2.11
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.7&r2=1.15.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.12.2.4&r2=1.12.2.5

--- cluster/rgmanager/ChangeLog 2007/07/12 11:23:16     1.31.2.21
+++ cluster/rgmanager/ChangeLog 2007/07/24 13:53:07     1.31.2.22
@@ -1,4 +1,23 @@
-2007-07*12 Marek Grac <mgrac at redhat.com>
+2007-07-24 Lon Hohberger <lhh at redhat.com>
+       * general: make threads exit with pthread_exit() so we can wrap/track them.
+       Add internal statedump (SIGUSR1) support.
+       * src/clulib/msg_cluster.c: Fix rare deadlock condition. bz #249314.
+       * src/clulib/vft.c: Fix rare crash if vf_resolve_views gets called with
+       NULL. bz #247291
+       * src/daemons/main.c: Fix minor memory leak in membership_update()
+       when lots of transitions occur. bz #249311.  Fix crash-on-exit race
+       bz #247291.  Don't exit if someone requests foreground mode.
+       * src/daemons/rg_forward.c: Clean up forwarding logic and handle missed
+       error case; fixes deadlock. bz #249314.
+       * src/daemons/rg_state.c: Move closing / free of contexts out of
+       send_ret/send_response to the caller (where they belong).  Don't let
+       people relocate disabled services. bz #249311.
+       * src/daemons/rg_thread.c: Don't loop forever if the thread exits before
+       we notice that it's started. bz #249314.
+       * src/utils/clusvcadm.c: Fix error codes if you try to relocate when
+       rgmanager isn't running.
+
+2007-07-12 Marek Grac <mgrac at redhat.com>
        * src/resources/Makefile: Fix #245178 - install RA for named
 
 2007-07-11 Lon Hohberger <lhh at redhat.com>
/cvs/cluster/cluster/rgmanager/src/clulib/tmgr.c,v  -->  standard output
revision 1.1.2.1
--- cluster/rgmanager/src/clulib/tmgr.c
+++ -   2007-07-24 13:53:08.783789000 +0000
@@ -0,0 +1,128 @@
+/*
+  Copyright Red Hat, Inc. 2007
+  Copyright Crosswalk 2006-2007
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the
+  Free Software Foundation; either version 2, or (at your option) any
+  later version.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+#ifdef WRAP_THREADS
+#include <stdio.h>
+#include <sys/types.h>
+#include <gettid.h>
+#include <pthread.h>
+#include <string.h>
+#include <errno.h>
+#include <malloc.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <list.h>
+#include <execinfo.h>
+
+typedef struct _thr {
+       list_head();
+       void *(*fn)(void *arg);
+       char **name;
+       pthread_t th;
+} mthread_t;
+
+static mthread_t *_tlist = NULL;
+static int _tcount = 0;
+static pthread_rwlock_t _tlock = PTHREAD_RWLOCK_INITIALIZER;
+
+void
+dump_thread_states(FILE *fp)
+{
+       int x;
+       mthread_t *curr;
+       fprintf(fp, "Thread Information\n");
+       pthread_rwlock_rdlock(&_tlock);
+       list_for(&_tlist, curr, x) {
+               fprintf(fp, "  Thread #%d   id: %d   function: %s\n",
+                       x, (unsigned)curr->th, curr->name[0]);
+       }
+       pthread_rwlock_unlock(&_tlock);
+       fprintf(fp, "\n\n");
+}
+
+
+int __real_pthread_create(pthread_t *, const pthread_attr_t *,
+                         void *(*)(void*), void *);
+int
+__wrap_pthread_create(pthread_t *th, const pthread_attr_t *attr,
+                     void *(*start_routine)(void*),
+                     void *arg)
+{
+       void *fn = start_routine;
+       mthread_t *new;
+       int ret;
+
+       new = malloc(sizeof (*new));
+
+       ret = __real_pthread_create(th, attr, start_routine, arg);
+       if (ret) {
+               if (new)
+                       free(new);
+               return ret;
+       }
+
+       if (new) {
+               new->th = *th;
+               new->fn = start_routine;
+               new->name = backtrace_symbols(&new->fn, 1);
+               pthread_rwlock_wrlock(&_tlock);
+               list_insert(&_tlist, new);
+               ++_tcount;
+               pthread_rwlock_unlock(&_tlock);
+       }
+
+       return ret;
+}
+
+
+void __real_pthread_exit(void *);
+void
+__wrap_pthread_exit(void *exitval)
+{
+       mthread_t *old;
+       int ret = 0, found = 0;
+       pthread_t me = pthread_self();
+
+       pthread_rwlock_rdlock(&_tlock);
+       list_for(&_tlist, old, ret) {
+               if (old->th == me) {
+                       found = 1;
+                       break;
+               }
+       }
+       if (!found)
+               old = NULL;
+       pthread_rwlock_unlock(&_tlock);
+
+       if (!old)
+               __real_pthread_exit(exitval);
+
+       pthread_rwlock_wrlock(&_tlock);
+       list_remove(&_tlist, old);
+       --_tcount;
+       pthread_rwlock_unlock(&_tlock);
+
+       if (old->name)
+               free(old->name);
+       free(old);
+       __real_pthread_exit(exitval);
+}
+#endif
--- cluster/rgmanager/src/clulib/Makefile       2007/05/10 16:23:43     1.10.2.2
+++ cluster/rgmanager/src/clulib/Makefile       2007/07/24 13:53:08     1.10.2.3
@@ -17,7 +17,7 @@
 INCLUDE += -I $(top_srcdir)/include -I $(top_srcdir)/../cman/lib -I $(top_srcdir)/../ccs/lib -I $(top_srcdir)/../dlm/lib
 INCLUDE += -I${incdir}
 
-CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE
+CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE -DWRAP_THREADS
 CFLAGS+= -DCMAN_RELEASE_NAME=\"${RELEASE}\"
 
 TARGETS=libclulib.a liblalloc.a msgtest 
@@ -34,7 +34,7 @@
 libclulib.a: clulog.o daemon_init.o signals.o msgsimple.o \
                gettid.o rg_strings.o message.o members.o fdops.o \
                lock.o cman.o vft.o msg_cluster.o msg_socket.o \
-               wrap_lock.o
+               wrap_lock.o tmgr.o
        ${AR} cru $@ $^
        ranlib $@
 
--- cluster/rgmanager/src/clulib/msg_cluster.c  2006/10/23 22:47:00     1.4
+++ cluster/rgmanager/src/clulib/msg_cluster.c  2007/07/24 13:53:08     1.4.2.1
@@ -46,7 +46,7 @@
 static msgctx_t *contexts[MAX_CONTEXTS];
 static int _me = 0;
 pthread_t comms_thread;
-int thread_running;
+int thread_running = 0;
 
 #define is_established(ctx) \
        (((ctx->type == MSG_CLUSTER) && \
@@ -856,7 +856,6 @@
        errno = EINVAL;
        cluster_msg_hdr_t *m;
        msg_q_t *n;
-       char done = 0;
        char foo;
 
        if (!listenctx || !acceptctx)
@@ -884,24 +883,38 @@
                m = n->message;
                switch(m->msg_control) {
                case M_OPEN:
+                       /* XXX make this case statement its own function or at 
+                          least make it not a big case block . */
                        list_remove(&listenctx->u.cluster_info.queue, n);
                        /*printf("Accepting connection from %d %d\n",
                                 m->src_nodeid, m->src_ctx);*/
 
-                       /* New connection */
+                       /* Release lock on listen context queue; we're done
+                          with it at this point */
+                       pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
+
+                       /* New connection: first, create + lock the mutex */
                        pthread_mutex_init(&acceptctx->u.cluster_info.mutex,
                                           NULL);
+                       /* Lock this while we finish initializing */
+                       pthread_mutex_lock(&acceptctx->u.cluster_info.mutex);
+
                        pthread_cond_init(&acceptctx->u.cluster_info.cond,
                                          NULL);
+
                        acceptctx->u.cluster_info.queue = NULL;
                        acceptctx->u.cluster_info.remote_ctx = m->src_ctx;
                        acceptctx->u.cluster_info.nodeid = m->src_nodeid;
                        acceptctx->u.cluster_info.port = m->msg_port;
                        acceptctx->flags = (SKF_READ | SKF_WRITE);
 
-                       if (assign_ctx(acceptctx) < 0) {
+                       /* assign_ctx requires the context lock.  We need to 
+                       ensure we don't try to take the context lock w/ a local
+                       queue lock held on a context that's in progress (i.e.
+                       the global cluster context...) */
+                       if (assign_ctx(acceptctx) < 0)
                                printf("FAILED TO ASSIGN CONTEXT\n");
-                       }
+
                        cluster_send_control_msg(acceptctx, M_OPEN_ACK);
 
                        if (listenctx->u.cluster_info.select_pipe[0] >= 0) {
@@ -910,11 +923,14 @@
                                     &foo, 1);
                        }
 
-                       done = 1;
                        free(m);
                        free(n);
 
-                       break;
+                       /* Let the new context go. */
+                       pthread_mutex_unlock(&acceptctx->u.cluster_info.mutex);
+                       return 0;
+                       /* notreached */
+
                case M_DATA:
                        /* Data messages (i.e. from broadcast msgs) are
                           okay too!...  but we don't handle them here */
@@ -925,9 +941,6 @@
                        break;
                }
 
-               if (done)
-                       break;
-
        } while (!list_done(&listenctx->u.cluster_info.queue, n));
 
        pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
@@ -950,7 +963,7 @@
                poll_cluster_messages(2);
        }
 
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
@@ -1105,7 +1118,7 @@
 
                pthread_attr_init(&attrs);
                pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
-               pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+               /*pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);*/
 
        thread_running = 1;     
        pthread_create(&comms_thread, &attrs, cluster_comms_thread, NULL);
@@ -1130,16 +1143,81 @@
 }
 
 
+void
+dump_cluster_ctx(FILE *fp)
+{
+       int x;
+       msgctx_t *ctx;
+
+       fprintf(fp, "CMAN/mux subsystem status\n");
+       if (thread_running) {
+               fprintf(fp, "  Thread: %d\n", (unsigned)comms_thread);
+       } else {
+               fprintf(fp, "  Thread Offline\n");
+       }
+
+       pthread_mutex_lock(&context_lock);
+       for (x = 0; x < MAX_CONTEXTS; x++) {
+               if (!contexts[x]) 
+                       continue;
+               ctx = contexts[x];
+
+               fprintf(fp, "    Cluster Message Context %p\n", ctx);
+               fprintf(fp, "      Flags %08x  ", ctx->flags);
+               if (ctx->flags & SKF_READ)
+                       fprintf(fp, "SKF_READ ");
+               if (ctx->flags & SKF_WRITE)
+                       fprintf(fp, "SKF_WRITE ");
+               if (ctx->flags & SKF_LISTEN)
+                       fprintf(fp, "SKF_LISTEN ");
+               if (ctx->flags & SKF_MCAST)
+                       fprintf(fp, "SKF_MCAST ");
+               fprintf(fp, "\n");
+               fprintf(fp, "      Target node ID %d\n", ctx->u.cluster_info.nodeid);
+               fprintf(fp, "      Local Index %d\n", ctx->u.cluster_info.local_ctx);
+               fprintf(fp, "      Remote Index %d\n", ctx->u.cluster_info.remote_ctx);
+       }
+       pthread_mutex_unlock(&context_lock);
+       fprintf(fp, "\n");
+}
+
+
 int
 cluster_msg_shutdown(void)
 {
        cman_handle_t ch;
+       cluster_msg_hdr_t m;
+       msgctx_t *ctx;
+       int x;
+
+       thread_running = 0;
+       pthread_join(comms_thread, NULL);
 
        ch = cman_lock(1, SIGUSR2);
        cman_end_recv_data(ch);
-       pthread_kill(comms_thread, SIGTERM);
        cman_unlock(ch);
 
+       /* Send close message to all open contexts */
+       memset(&m, 0, sizeof(m));
+       m.msg_control = M_CLOSE;
+
+       pthread_mutex_lock(&context_lock);
+       for (x = 0; x < MAX_CONTEXTS; x++) {
+               if (!contexts[x])
+                       continue;
+
+               ctx = contexts[x];
+
+               /* Kill remote side if it exists */
+               if (is_established(ctx))
+                       cluster_send_control_msg(ctx, M_CLOSE);
+
+               /* Queue close for local side */
+               queue_for_context(ctx, (void *)&m, sizeof(m));
+       }
+       pthread_mutex_unlock(&context_lock);
+
+
        return 0;
 }
 
--- cluster/rgmanager/src/clulib/msgtest.c      2006/08/07 22:05:01     1.2
+++ cluster/rgmanager/src/clulib/msgtest.c      2007/07/24 13:53:08     1.2.2.1
@@ -49,7 +49,7 @@
 
        if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 0) != 0) {
                printf("Could not set up mcast socket!\n");
-               return NULL;
+               pthread_exit(NULL);
        }
 
        printf("PIGGYBACK CONTEXT\n");
@@ -66,7 +66,7 @@
 
        printf("PIGGY flies...\n");
 
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
@@ -102,7 +102,7 @@
 
                if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 1) != 0) {
                        printf("Could not set up mcast socket!\n");
-                       return NULL;
+                       pthread_exit(NULL);
                }
 
                snprintf(buf, sizeof(buf), "Babble, babble\n");
@@ -116,7 +116,7 @@
 
        printf("Private thread is outta here...\n");
 
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
--- cluster/rgmanager/src/clulib/vft.c  2007/05/10 16:23:43     1.17.2.2
+++ cluster/rgmanager/src/clulib/vft.c  2007/07/24 13:53:08     1.17.2.3
@@ -121,9 +121,9 @@
 
 
 struct vf_args {
-       uint16_t port;
-       int local_node_id;
        msgctx_t *ctx;
+       int local_node_id;
+       uint16_t port;
 };
 
 
@@ -277,6 +277,9 @@
        uint32_t datalen;
        uint32_t trans;
 
+       if (!key_node)
+               return 0;
+
        while ((trans = vf_try_commit(key_node)) != 0) {
                commits++;
        }
@@ -895,7 +898,7 @@
 
        msg_close(ctx);
        msg_free_ctx(ctx);
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
@@ -1776,3 +1779,40 @@
        return VFR_OK;
 }
 
+
+void
+dump_vf_states(FILE *fp)
+{
+       key_node_t *cur;
+
+       fprintf(fp, "View-Formation States:\n");
+       fprintf(fp, "  Thread: %d\n", (unsigned)vf_thread);
+       fprintf(fp, "  Default callbacks:\n    Vote: %p\n    Commit: %p\n",
+               default_vote_cb, default_commit_cb);
+       fprintf(fp, "  Distributed key metadata:\n");
+
+       pthread_mutex_lock(&key_list_mutex);
+
+       for (cur = key_list; cur; cur = cur->kn_next) {
+               fprintf(fp, "    %s, View: %d, Size: %d, Address: %p\n",
+                       cur->kn_keyid,
+                       (int)cur->kn_viewno,
+                       cur->kn_datalen,
+                       cur->kn_data);
+               if (cur->kn_vote_cb != default_vote_cb) 
+                       fprintf(fp, "      Vote callback: %p\n", cur->kn_vote_cb);
+               if (cur->kn_commit_cb != default_commit_cb) 
+                       fprintf(fp, "      Commit callback: %p\n", cur->kn_commit_cb);
+
+               if (cur->kn_jvlist)
+                       fprintf(fp, "        This key has unresolved "
+                               "new views pending\n");
+               if (cur->kn_clist)
+                       fprintf(fp, "        This key has unresolved "
+                               "commits pending\n");
+
+       }
+
+       pthread_mutex_unlock(&key_list_mutex);
+       fprintf(fp, "\n");
+}
--- cluster/rgmanager/src/daemons/Makefile      2007/03/20 17:09:11     1.14.2.2
+++ cluster/rgmanager/src/daemons/Makefile      2007/07/24 13:53:08     1.14.2.3
@@ -17,9 +17,9 @@
 INCLUDE += -I $(top_srcdir)/include -I $(top_srcdir)/../cman/lib -I $(top_srcdir)/../ccs/lib -I $(top_srcdir)/../dlm/lib
 INCLUDE += -I${incdir} -I/usr/include/libxml2
 
-CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE
+CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE -DWRAP_THREADS
 
-LDFLAGS+= -L ../clulib -L../../../cman/lib -L../../../ccs/lib -L../../../dlm/lib -L${libdir} -lclulib -lxml2 -lpthread -ldl 
+LDFLAGS+= -L ../clulib -L../../../cman/lib -L../../../ccs/lib -L../../../dlm/lib -L${libdir} -lclulib -lxml2 -lpthread -ldl -Wl,-wrap,pthread_create,-wrap,pthread_exit -rdynamic
 TARGETS=clurgmgrd clurmtabd rg_test
 
 all: ${TARGETS}
--- cluster/rgmanager/src/daemons/groups.c      2007/07/10 18:24:00     1.25.2.9
+++ cluster/rgmanager/src/daemons/groups.c      2007/07/24 13:53:08     1.25.2.10
@@ -1030,7 +1030,7 @@
                msg_send_simple(ctx, RG_FAIL, RG_EAGAIN, 0);
                msg_close(ctx);
                msg_free_ctx(ctx);
-               return NULL;
+               pthread_exit(NULL);
        }
        
        pthread_rwlock_rdlock(&resource_lock);
@@ -1053,7 +1053,7 @@
        
        rg_dec_status();
 
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
@@ -1169,7 +1169,7 @@
        
        /* Only one status thread at a time, please! */
        if (pthread_mutex_trylock(&status_mutex) != 0)
-               return NULL;
+               pthread_exit(NULL);
 
        pthread_rwlock_rdlock(&resource_lock);
        list_do(&_tree, curr) {
@@ -1195,7 +1195,7 @@
        pthread_rwlock_unlock(&resource_lock);
        pthread_mutex_unlock(&status_mutex);
 
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
@@ -1397,6 +1397,13 @@
 }
 
 
+void
+dump_config_version(FILE *fp)
+{
+       fprintf(fp, "Cluster configuration version %d\n\n", config_version);
+}
+
+
 /**
   Initialize resource groups.  This reads all the resource groups from 
   CCS, builds the tree, etc.  Ideally, we'll have a similar function 
--- cluster/rgmanager/src/daemons/main.c        2007/06/26 21:55:46     1.34.2.6
+++ cluster/rgmanager/src/daemons/main.c        2007/07/24 13:53:08     1.34.2.7
@@ -40,6 +40,9 @@
 #define L_SYS (1<<1)
 #define L_USER (1<<0)
 
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
 int configure_logging(int ccsfd, int debug);
 
 void node_event(int, int, int, int);
@@ -63,7 +66,7 @@
 
 int next_node_id(cluster_member_list_t *membership, int me);
 int rg_event_q(char *svcName, uint32_t state, int owner);
-
+void malloc_dump_table(FILE *, size_t, size_t);
 
 void
 segfault(int sig)
@@ -259,6 +262,7 @@
 
        free_member_list(node_delta);
        free_member_list(new_ml);
+       free_member_list(old_membership);
 
        rg_unlockall(L_SYS);
 
@@ -405,7 +409,8 @@
        sz = msg_receive(ctx, msg_hdr, sizeof(msgbuf), 1);
        if (sz < sizeof (generic_msg_hdr)) {
                clulog(LOG_ERR,
-                      "#37: Error receiving message header (%d)\n", sz);
+                      "#37: Error receiving header from %d sz=%d CTX %p\n",
+                      nodeid, sz, ctx);
                goto out;
        }
 
@@ -593,6 +598,7 @@
                break;
 
        case M_DATA:
+               nodeid = msg_get_nodeid(ctx);
                dispatch_msg(ctx, nodeid, 0);
                break;
                
@@ -629,7 +635,26 @@
 }
 
 
-void dump_threads(void);
+void dump_threads(FILE *fp);
+void dump_config_version(FILE *fp);
+void dump_vf_states(FILE *fp);
+void dump_cluster_ctx(FILE *fp);
+
+void
+dump_internal_state(char *loc)
+{
+       FILE *fp;
+       fp=fopen(loc, "w+");
+       dump_config_version(fp);
+       dump_threads(fp);
+       dump_vf_states(fp);
+#ifdef WRAP_THREADS
+       dump_thread_states(fp);
+#endif
+       dump_cluster_ctx(fp);
+       //malloc_dump_table(fp, 1, 16384); /* Only works if alloc.c us used */
+       fclose(fp);
+}
 
 int
 event_loop(msgctx_t *localctx, msgctx_t *clusterctx)
@@ -645,10 +670,8 @@
 
        if (signalled) {
                signalled = 0;
-               /*
-               malloc_stats();
-               dump_threads();
-                */
+ 
+               dump_internal_state("/tmp/rgmanager-dump");
        }
 
        while (running && (tv.tv_sec || tv.tv_usec)) {
@@ -747,7 +770,6 @@
 cleanup(msgctx_t *clusterctx)
 {
        kill_resource_groups();
-       member_list_update(NULL);
        send_exit_msg(clusterctx);
 }
 
@@ -760,7 +782,7 @@
 }
 
 
-void malloc_dump_table(size_t, size_t);
+void malloc_dump_table(FILE *, size_t, size_t);
 
 
 /*
@@ -846,10 +868,13 @@
        rg_doall(RG_STOP_EXITING, 1, NULL);
        running = 0;
 
-       return 0;
+       pthread_exit(NULL);
 }
 
 
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
 int
 main(int argc, char **argv)
 {
@@ -871,7 +896,9 @@
                        break;
                case 'f':
                        foreground = 1;
+                       break;
                default:
+                       return 1;
                        break;
                }
        }
@@ -984,6 +1011,9 @@
                event_loop(local_ctx, cluster_ctx);
 
                if (shutdown_pending == 1) {
+                       /* Kill local socket; local requests need to
+                          be ignored here */
+                       msg_close(local_ctx);
                        ++shutdown_pending;
                        clulog(LOG_NOTICE, "Shutting down\n");
                        pthread_create(&th, NULL, shutdown_thread, NULL);
--- cluster/rgmanager/src/daemons/nodeevent.c   2007/06/26 21:55:46     1.4.2.3
+++ cluster/rgmanager/src/daemons/nodeevent.c   2007/07/24 13:53:08     1.4.2.4
@@ -196,7 +196,7 @@
        /* Mutex held */
        ne_thread = 0;
        pthread_mutex_unlock(&ne_queue_mutex);
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
--- cluster/rgmanager/src/daemons/rg_event.c    2006/07/11 23:52:41     1.1
+++ cluster/rgmanager/src/daemons/rg_event.c    2007/07/24 13:53:08     1.1.2.1
@@ -64,7 +64,7 @@
        /* Mutex held */
        rg_ev_thread = 0;
        pthread_mutex_unlock(&rg_queue_mutex);
-       return NULL;
+       pthread_exit(NULL);
 }
 
 
--- cluster/rgmanager/src/daemons/rg_forward.c  2006/12/13 18:38:41     1.8.2.1
+++ cluster/rgmanager/src/daemons/rg_forward.c  2007/07/24 13:53:08     1.8.2.2
@@ -24,6 +24,7 @@
 #include <msgsimple.h>
 #include <clulog.h>
 #include <message.h>
+#include <members.h>
 
 
 void
@@ -49,59 +50,100 @@
        request_t *req = (request_t *)arg;
        struct dlm_lksb lockp;
        msgctx_t *ctx = NULL;
+       cluster_member_list_t *m = NULL;
        SmMessageSt msg;
+       int response_code = RG_EAGAIN, ret;
+       int new_owner = 0, retries = 0;
 
-       if (rg_lock(req->rr_group, &lockp) != 0)
+       if (rg_lock(req->rr_group, &lockp) != 0) {
+               clulog(LOG_WARNING, "FW: Forwarding failed; lock unavailable for %s\n",
+                      req->rr_group);
                goto out_fail;
-
+       }
        if (get_rg_state(req->rr_group, &rgs) != 0) {
                rg_unlock(&lockp);
+               clulog(LOG_WARNING, "FW: Forwarding failed; state unavailable for %s\n",
+                      req->rr_group);
                goto out_fail;
        }
-
        rg_unlock(&lockp);
 
-       /* Construct message */
-       build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
-
        if (rgs.rs_owner == 0)
                rgs.rs_owner = req->rr_target;
        if (rgs.rs_owner == 0) {
-               msg_close(req->rr_resp_ctx);
-               msg_free_ctx(req->rr_resp_ctx);
-               rq_free(req);
-               clulog(LOG_ERR, "Attempt to forward to invalid node ID\n");
-               pthread_exit(NULL);
+               clulog(LOG_ERR, "FW: Attempt to forward to invalid node ID\n");
+                       goto out_fail;
+       }
+       if (rgs.rs_owner == my_id()) {
+               clulog(LOG_WARNING, "BUG! Attempt to forward to myself!\n");
+                       goto out_fail;
        }
 
-       clulog(LOG_DEBUG, "Forwarding %s request to %d\n",
+       clulog(LOG_DEBUG, "FW: Forwarding %s request to %d\n",
               rg_req_str(req->rr_request), rgs.rs_owner);
 
-       while ((ctx = msg_new_ctx()) == NULL)
-               sleep(1);
-
-       if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0)
+       ctx = msg_new_ctx();
+       if (ctx == NULL) {
+               clulog(LOG_DEBUG, "FW: Failed to allocate socket context: %s\n",
+                      strerror(errno));
                goto out_fail;
-       if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg))
+       }
+
+       /* Construct message */
+       build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
+
+       if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0) {
+               clulog(LOG_DEBUG, "FW: Failed to open channel to %d CTX: %p\n",
+                      rgs.rs_owner, ctx);
                goto out_fail;
-       if (msg_receive(ctx, &msg, sizeof(msg), 600) < sizeof(msg))
+       }
+       if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg)) {
+               clulog(LOG_DEBUG, "FW: Failed to send message to %d CTX: %p\n",
+                      rgs.rs_owner, ctx);
                goto out_fail;
+       }
 
-       msg_close(ctx);
-       msg_free_ctx(ctx);
+        /*
+        * Ok, we're forwarding a message to another node.  Keep tabs on
+        * the node to make sure it doesn't die.  Basically, wake up every
+        * now and again to make sure it's still online.  If it isn't, send
+        * a response back to the caller.
+        */
+       do {
+               ret = msg_receive(ctx, &msg, sizeof(msg), 10);
+               if (ret < (int)sizeof(msg)) {
+                       if (ret < 0 && errno == ETIMEDOUT) {
+                               m = member_list();
+                               if (!memb_online(m, rgs.rs_owner)) {
+                                       response_code = RG_ENODE;
+                                       goto out_fail;
+                               }
+                               free_member_list(m);
+                               m = NULL;
+                               continue;
+                       }
+                       goto out_fail;
+               }
+               break;
+       } while(++retries < 60); /* old 60 second rule */
 
        swab_SmMessageSt(&msg);
-       send_response(msg.sm_data.d_ret, msg.sm_data.d_svcOwner, req);
-       rq_free(req);
-       pthread_exit(NULL);
-       
-out_fail: /* Failure path */
+
+       response_code = msg.sm_data.d_ret;
+       new_owner = msg.sm_data.d_svcOwner;
+
+out_fail:
+       send_response(response_code, new_owner, req);
+       msg_close(req->rr_resp_ctx);
+       msg_free_ctx(req->rr_resp_ctx);
+
        if (ctx) {
                msg_close(ctx);
                msg_free_ctx(ctx);
        }
-       msg_close(req->rr_resp_ctx);
-       msg_free_ctx(req->rr_resp_ctx);
+       if (m)
+               free_member_list(m);
+
        rq_free(req);
        pthread_exit(NULL);
 }
--- cluster/rgmanager/src/daemons/rg_state.c    2007/07/02 15:13:43     1.24.2.10
+++ cluster/rgmanager/src/daemons/rg_state.c    2007/07/24 13:53:08     1.24.2.11
@@ -217,9 +217,6 @@
 
        swab_SmMessageSt(msgp);
        msg_send(ctx, msgp, sizeof(*msgp));
-
-       /* :) */
-       msg_close(ctx);
 }
 
        
@@ -245,11 +242,6 @@
 
        swab_SmMessageSt(msgp);
        msg_send(req->rr_resp_ctx, msgp, sizeof(*msgp));
-
-       /* :( */
-       msg_close(req->rr_resp_ctx);
-       msg_free_ctx(req->rr_resp_ctx);
-       req->rr_resp_ctx = NULL;
 }
 
 
@@ -548,6 +540,7 @@
                        break;
                }
 
+               ret = 2;
                clulog(LOG_DEBUG, "Not stopping disabled service %s\n",
                       svcName);
                break;
@@ -1510,6 +1503,11 @@
        int ret, x;
        rg_state_t svcStatus;
        
+       get_rg_state_local(svcName, &svcStatus);
+       if (svcStatus.rs_state == RG_STATE_DISABLED ||
+           svcStatus.rs_state == RG_STATE_UNINITIALIZED)
+               return RG_EINVAL;
+
        if (preferred_target > 0) {
                /* TODO: simplify this and don't keep alloc/freeing 
                   member lists */
@@ -1576,8 +1574,10 @@
                 * I am the ONLY one capable of running this service,
                 * PERIOD...
                 */
-               if (target == me && me != preferred_target)
+               if (target == me && me != preferred_target) {
+                       free_member_list(backup);
                        goto exhausted;
+               }
 
                if (target == me) {
                        /*
@@ -1839,8 +1839,16 @@
        int tolerance = FOD_BEST;
        int x;
        uint32_t me = my_id();
-       cluster_member_list_t *membership = member_list();
-       int need_check = have_exclusive_resources();
+       cluster_member_list_t *membership;
+       int need_check;
+
+       if (rg_locked()) {
+               /* don't even calc if rg's locked */
+               return RG_EFAIL;
+       }
+
+       need_check = have_exclusive_resources();
+       membership = member_list();
 
        /* XXX ok, so we need to say "should I start this if I was the
           only cluster member online */
@@ -1933,6 +1941,8 @@
                                          svcName, 1);
                if (target == me) {
                        ret = handle_start_remote_req(svcName, request);
+                       if (ret == RG_EAGAIN)
+                               goto out;
                } else if (target < 0) {
                        ret = RG_EFAIL;
                        goto out;
--- cluster/rgmanager/src/daemons/rg_thread.c   2007/07/10 18:24:00     1.15.2.7
+++ cluster/rgmanager/src/daemons/rg_thread.c   2007/07/24 13:53:08     1.15.2.8
@@ -60,19 +60,39 @@
   SIGUSR1 output
  */
 void
-dump_threads(void)
+dump_threads(FILE *fp)
 {
        resthread_t *rt;
+       request_t *req;
+       int x = 0, y = 0;
 
-       printf("+++ BEGIN Thread dump\n");
+       fprintf(fp, "Resource Group Threads \n");
        pthread_mutex_lock(&reslist_mutex);
-       list_do(&resthread_list, rt) {
-               printf("TID %d group %s (@ %p) request %d\n",
-                      (int)rt->rt_thread,
-                      rt->rt_name, rt, rt->rt_request);
-       } while (!list_done(&resthread_list, rt));
+       list_for(&resthread_list, rt, x) {
+               fprintf(fp, "  %s id:%d (@ %p) processing %s request (%d)\n",
+                       rt->rt_name,
+                       (unsigned)rt->rt_thread,
+                       rt,
+                       rg_req_str(rt->rt_request),
+                       rt->rt_request);
+               if (!*rt->rt_queue) {
+                       fprintf(fp, "    Pending requests: \n");
+                       list_for(rt->rt_queue, req, y) {
+                               fprintf(fp, "      %s tgt:%d  ctx:%p  a0:%d  a1:%d\n",
+                                       rg_req_str(req->rr_request),
+                                       req->rr_target,
+                                       req->rr_resp_ctx,
+                                       req->rr_arg0,
+                                       req->rr_arg1);
+                       }
+               }
+       }
+
+       x = !!resthread_list;
        pthread_mutex_unlock(&reslist_mutex);
-       printf("--- END Thread dump\n");
+       if (!x)
+               fprintf(fp, "  (none)\n");
+       fprintf(fp, "\n");
 }
 
 
@@ -151,6 +171,8 @@
                dprintf("Removed request %d\n", curr->rr_request);
                if (curr->rr_resp_ctx) {
                        send_response(RG_EABORT, 0, curr);
+                       msg_close(curr->rr_resp_ctx);
+                       msg_free_ctx(curr->rr_resp_ctx);
                }
                rq_free(curr);
        }
@@ -241,12 +263,14 @@
                        break;
 
                case RG_ENABLE:
+                       #if 0
                        if (req->rr_target != 0 &&
                            req->rr_target != my_id()) {
                                error = RG_EFORWARD;
                                ret = RG_NONE;
                                break;
                        }
+                       #endif
                case RG_START:
                        if (req->rr_arg0) {
                                error = handle_fd_start_req(myname,
@@ -464,6 +488,8 @@
                if (ret != RG_NONE && rg_initialized() &&
                    (req->rr_resp_ctx)) {
                        send_response(error, newowner, req);
+                       msg_close(req->rr_resp_ctx);
+                       msg_free_ctx(req->rr_resp_ctx);
                }
                
                rq_free(req);
@@ -572,7 +598,7 @@
 
        pthread_mutex_unlock(&reslist_mutex);
        if (wait_initialize(resgroupname) < 0) {
-               goto retry;
+               return -1;
        }
 
        return ret;
@@ -677,6 +703,8 @@
                case RG_ENABLE:
                        send_ret(response_ctx, resgroup->rt_name, RG_EDEADLCK,
                                 request);
+                       msg_close(response_ctx);
+                       msg_free_ctx(response_ctx);
                        break;
                }
                fprintf(stderr, "Failed to queue request: Would block\n");
--- cluster/rgmanager/src/utils/clusvcadm.c     2007/06/14 13:35:59     1.12.2.4
+++ cluster/rgmanager/src/utils/clusvcadm.c     2007/07/24 13:53:08     1.12.2.5
@@ -380,7 +380,10 @@
                        printf("Member %s %s %s", nodename, actionstr, svcname);
                printf("...");
                fflush(stdout);
-               msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+               if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+                       printf("Could not connect to resource group manager\n");
+                       return 1;
+               }
        } else {
                if (!svctarget)
                        printf("Trying to %s %s", actionstr, svcname);
@@ -389,7 +392,10 @@
                               nodename);
                printf("...");
                fflush(stdout);
-               msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+               if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+                       printf("Could not connect to resource group manager\n");
+                       return 1;
+               }
        }
 
        if (ctx.type < 0) {

Reply via email to