CVSROOT:        /cvs/cluster
Module name:    cluster
Changes by:     [EMAIL PROTECTED]       2007-11-30 20:36:18

Modified files:
        rgmanager      : ChangeLog 
        rgmanager/include: resgroup.h reslist.h 
        rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c 
                               reslist.c resrules.c restree.c rg_state.c 
                               test.c 
Added files:
        rgmanager/include: restart_counter.h 
        rgmanager/src/daemons: restart_counter.c 

Log message:
        Merges from RHEL5 branch - round 2.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.60&r2=1.61
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&r1=1.12&r2=1.13

--- cluster/rgmanager/ChangeLog 2007/11/30 20:06:55     1.60
+++ cluster/rgmanager/ChangeLog 2007/11/30 20:36:17     1.61
@@ -1,6 +1,8 @@
 2007-11-30 Lon Hohberger <lhh at redhat.com>
-       * src/resources/*: Merge from RHEL5 branch.
-       * src/utils/*: Merge from RHEL5 branch.
+       * src/resources/*: Merge misc. updates from RHEL5 branch.
+       * src/utils/*: Merge misc. updates from RHEL5 branch.
+       * include/*.h, src/daemons/*: Merge status-counter patch
+       from RHEL5 branch.
 
 2007-08-30 Lon Hohberger <lhh at redhat.com>
        * src/daemons/restree.c, rg_state.c: Fix tree-restart bug
--- cluster/rgmanager/include/restart_counter.h 2007/11/26 21:46:26     1.1
+++ cluster/rgmanager/include/restart_counter.h 2007/11/30 20:36:17     1.2
@@ -0,0 +1,12 @@
+#ifndef _RESTART_COUNTER_H
+#define _RESTART_COUNTER_H
+
+/* Time-based restart counters for rgmanager. */
+
+#include <time.h>      /* time_t, so this header can be included first */
+
+/* Opaque handle to a restart counter, created by restart_init().
+   restart_add() treats a NULL handle as "no threshold configured". */
+typedef void *restart_counter_t;
+
+/* Record a restart at the current time.  Returns 1 if the number of
+   unexpired restarts now exceeds max_restarts, 0 if it does not (or if
+   arg is NULL), and -1 on allocation failure or an invalid handle. */
+int restart_add(restart_counter_t arg);
+
+/* Forget every recorded restart.  Returns 0, or -1 with errno = EINVAL
+   if the handle fails validation. */
+int restart_clear(restart_counter_t arg);
+
+/* Return the number of unexpired restarts on record, or -1 with
+   errno = EINVAL if the handle fails validation. */
+int restart_count(restart_counter_t arg);
+
+/* Create a counter.  expire_timeout is the age in seconds at which a
+   recorded restart is dropped (0 = never); max_restarts must be >= 0.
+   Returns NULL on failure (errno = EINVAL for a negative max_restarts). */
+restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
+
+/* Clear and free a counter obtained from restart_init().  Returns 0, or
+   -1 with errno = EINVAL if the handle fails validation. */
+int restart_cleanup(restart_counter_t arg);
+
+#endif
--- cluster/rgmanager/include/resgroup.h        2007/06/27 14:03:51     1.23
+++ cluster/rgmanager/include/resgroup.h        2007/11/30 20:36:17     1.24
@@ -150,6 +150,8 @@
 int svc_freeze(char *svcName);
 int svc_unfreeze(char *svcName);
 int svc_migrate(char *svcName, int target);
+int check_restart(char *svcName);
+
 int rt_enqueue_request(const char *resgroupname, int request,
                       msgctx_t *resp_ctx,
                               int max, uint32_t target, int arg0, int arg1);
--- cluster/rgmanager/include/reslist.h 2007/08/02 14:53:37     1.23
+++ cluster/rgmanager/include/reslist.h 2007/11/30 20:36:17     1.24
@@ -126,6 +126,7 @@
        struct _rg_node *rn_child, *rn_parent;
        resource_t      *rn_resource;
        resource_act_t  *rn_actions;
+       restart_counter_t rn_restart_counter;
        int     rn_state; /* State of this instance of rn_resource */
        int     rn_flags;
        int     rn_last_status;
--- cluster/rgmanager/src/daemons/restart_counter.c     2007/11/26 21:46:27     1.1
+++ cluster/rgmanager/src/daemons/restart_counter.c     2007/11/30 20:36:17     1.2
@@ -0,0 +1,185 @@
+/*
+  Copyright Red Hat, Inc. 2007
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License version 2 as published
+  by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; see the file COPYING.  If not, write to the
+  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
+  MA 02139, USA.
+*/
+/* Time-based restart counters for rgmanager */
+
+#include <stdio.h>
+#include <list.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <restart_counter.h>
+
+
+
+#define RESTART_INFO_MAGIC 0x184820ab /* stored in restart_info_t.magic; checked by VALIDATE() */
+/* One recorded restart; instances are linked into restart_info_t.restart_nodes */
+typedef struct {
+       list_head(); /* presumably the list linkage used by list_insert/list_remove -- confirm in list.h */
+       time_t restart_time; /* time(NULL) value taken when restart_add() recorded this entry */
+} restart_item_t;
+/* State behind a restart_counter_t handle */
+typedef struct {
+       int magic; /* RESTART_INFO_MAGIC for a handle made by restart_init() */
+       time_t expire_timeout; /* age at which an entry is purged; 0 disables purging */
+       int max_restarts; /* restart_add() returns 1 once restart_count exceeds this */
+       int restart_count; /* incremented per insert, decremented per purge, zeroed by restart_clear() */
+       restart_item_t *restart_nodes; /* head of the list of recorded restarts */
+} restart_info_t;
+
+
+/* Make the calling function return `ret` with errno = EINVAL unless
+   `arg` is a non-NULL handle whose magic field is RESTART_INFO_MAGIC.
+   `arg` is parenthesized so that any pointer expression may be passed. */
+#define VALIDATE(arg, ret) \
+do { \
+       if (!(arg) || \
+           ((restart_info_t *)(arg))->magic != RESTART_INFO_MAGIC) {\
+               errno = EINVAL; \
+               return ret; \
+       } \
+} while(0)
+
+
+/*
+ * Remove expired restarts.
+ *
+ * Every entry whose age (now - restart_time) has reached expire_timeout
+ * is unlinked from the list, counted out of restart_count and freed.
+ * An expire_timeout of 0 means entries never expire.
+ *
+ * Returns 0 on success, or -1 with errno = EINVAL if arg fails
+ * validation.
+ */
+static int
+restart_timer_purge(restart_counter_t arg, time_t now)
+{
+       restart_info_t *restarts = (restart_info_t *)arg;
+       restart_item_t *i;
+       int x, done = 0;
+
+       VALIDATE(arg, -1);
+
+       /* No timeout */
+       if (restarts->expire_timeout == 0)
+               return 0;
+
+       /* After each removal the scan restarts from the list head instead
+          of continuing the interrupted list_for() */
+       do {
+               done = 1;
+               list_for(&restarts->restart_nodes, i, x) {
+                       if ((now - i->restart_time) >=
+                           restarts->expire_timeout) {
+                               restarts->restart_count--;
+                               list_remove(&restarts->restart_nodes, i);
+                               /* Entries are malloc()ed by restart_add();
+                                  unlinking without freeing leaks them */
+                               free(i);
+                               done = 0;
+                               break;
+                       }
+               }
+       } while(!done);
+
+       return 0;
+}
+
+
+/* Report how many restarts are currently on record for `arg`, after
+   first discarding any that have expired.  Returns -1 with
+   errno = EINVAL when `arg` fails validation. */
+int
+restart_count(restart_counter_t arg)
+{
+       restart_info_t *info = (restart_info_t *)arg;
+
+       VALIDATE(arg, -1);
+
+       restart_timer_purge(arg, time(NULL));
+
+       return info->restart_count;
+}
+
+
+/* Record a restart at the current time.
+   Returns 1 when the restart count (after expiring old entries) is
+   above max_restarts, 0 when it is not, and -1 if the handle is invalid
+   or the entry cannot be allocated. */
+int
+restart_add(restart_counter_t arg)
+{
+       restart_info_t *info = (restart_info_t *)arg;
+       restart_item_t *entry;
+       time_t now;
+
+       /* A NULL counter means no max restarts / threshold was set up,
+          so restarting is always allowed */
+       if (arg == NULL)
+               return 0;
+
+       VALIDATE(arg, -1);
+
+       entry = malloc(sizeof(*entry));
+       if (entry == NULL)
+               return -1;
+
+       now = time(NULL);
+       entry->restart_time = now;
+       list_insert(&info->restart_nodes, entry);
+       info->restart_count++;
+
+       /* Expire stale entries before comparing against the limit */
+       restart_timer_purge(info, now);
+
+       return (info->restart_count > info->max_restarts) ? 1 : 0;
+}
+
+
+/* Drop every recorded restart and reset the count to zero.
+   Returns 0, or -1 with errno = EINVAL when `arg` fails validation. */
+int
+restart_clear(restart_counter_t arg)
+{
+       restart_info_t *info = (restart_info_t *)arg;
+       restart_item_t *head;
+
+       VALIDATE(arg, -1);
+
+       /* Unlink and free the head entry until the list is empty */
+       for (;;) {
+               head = info->restart_nodes;
+               if (head == NULL)
+                       break;
+               list_remove(&info->restart_nodes, head);
+               free(head);
+       }
+
+       info->restart_count = 0;
+       return 0;
+}
+
+
+/*
+ * Allocate and initialize a restart counter.
+ *
+ * expire_timeout  age at which a recorded restart is purged; 0 means
+ *                 restarts never expire
+ * max_restarts    restart_add() returns 1 once the count exceeds this;
+ *                 must not be negative
+ *
+ * Returns the new handle, or NULL on failure (errno = EINVAL for a
+ * negative max_restarts; otherwise malloc() failed).
+ */
+restart_counter_t
+restart_init(time_t expire_timeout, int max_restarts)
+{
+       restart_info_t *info;
+
+       if (max_restarts < 0) {
+               errno = EINVAL;
+               return NULL;
+       }
+
+       info = malloc(sizeof(*info));
+       if (info == NULL)
+               return NULL;
+
+       info->magic = RESTART_INFO_MAGIC;
+       info->expire_timeout = expire_timeout;
+       info->max_restarts = max_restarts;
+       info->restart_count = 0;
+       /* malloc() does not zero memory: the list head must start out
+          NULL or the first list operation reads a garbage pointer */
+       info->restart_nodes = NULL;
+
+       return (void *)info;
+}
+
+
+/* Destroy a counter: release every recorded restart, then the counter
+   itself.  Returns 0, or -1 with errno = EINVAL when `arg` fails
+   validation (in which case nothing is freed). */
+int
+restart_cleanup(restart_counter_t arg)
+{
+       VALIDATE(arg, -1);
+
+       (void)restart_clear(arg);
+       free(arg);
+
+       return 0;
+}
--- cluster/rgmanager/src/daemons/Makefile      2007/08/28 04:35:47     1.23
+++ cluster/rgmanager/src/daemons/Makefile      2007/11/30 20:36:17     1.24
@@ -31,12 +31,14 @@
        rg_queue.o \
        rg_state.o \
        rg_thread.o \
+       restart_counter.o \
        watchdog.o
 
 OBJS2= clurmtabd.o \
        clurmtabd_lib.o
 
-OBJS3= test-noccs.o
+OBJS3= test-noccs.o \
+       restart_counter.o
 
 OBJS4= dtest-noccs.o
 
--- cluster/rgmanager/src/daemons/fo_domain.c   2007/03/20 17:09:57     1.13
+++ cluster/rgmanager/src/daemons/fo_domain.c   2007/11/30 20:36:17     1.14
@@ -27,6 +27,7 @@
 #include <list.h>
 #include <clulog.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <ccs.h>
 #include <pthread.h>
--- cluster/rgmanager/src/daemons/groups.c      2007/08/02 14:53:38     1.39
+++ cluster/rgmanager/src/daemons/groups.c      2007/11/30 20:36:17     1.40
@@ -20,6 +20,7 @@
 //#define DEBUG
 #include <platform.h>
 #include <resgroup.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <vf.h>
 #include <message.h>
@@ -179,6 +180,29 @@
 }
 
 
+/*
+ * Find the node in a resource tree list whose resource name, as built
+ * by res_build_name(), matches `name` case-insensitively.
+ *
+ * tree  list of nodes to search
+ * name  name to look for
+ *
+ * Returns the matching node, or NULL if there is none or if either
+ * argument is NULL.
+ */
+resource_node_t *
+node_by_ref(resource_node_t **tree, char *name)
+{
+       resource_t *res;
+       resource_node_t *node, *ret = NULL;
+       char rgname[64];
+       int x;
+
+       if (!tree || !name)
+               return NULL;
+
+       /* Search the list the caller passed in, not the global _tree */
+       list_for(tree, node, x) {
+
+               res = node->rn_resource;
+               res_build_name(rgname, sizeof(rgname), res);
+
+               if (!strcasecmp(name, rgname)) {
+                       ret = node;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+
 int
 count_resource_groups_local(cman_node_t *mp)
 {
@@ -1587,6 +1611,28 @@
 }
 
 
+/*
+ * Record a restart against the resource group named rg_name.
+ *
+ * Returns whatever restart_add() reports for the group's restart
+ * counter, or 1 if no group with that name is found.  The lookup and
+ * counter update run with resource_lock read-locked.
+ */
+int
+check_restart(char *rg_name)
+{
+       resource_node_t *rg;
+       int result = 1;
+
+       pthread_rwlock_rdlock(&resource_lock);
+
+       rg = node_by_ref(&_tree, rg_name);
+       if (rg != NULL) {
+               result = restart_add(rg->rn_restart_counter);
+
+               /* Nonzero result: wipe the history, since the caller
+                  is about to relocate the service anyway */
+               if (result != 0)
+                       restart_clear(rg->rn_restart_counter);
+       }
+
+       pthread_rwlock_unlock(&resource_lock);
+
+       return result;
+}
+
+
 void
 kill_resource_groups(void)
 {
--- cluster/rgmanager/src/daemons/main.c        2007/09/19 09:54:19     1.44
+++ cluster/rgmanager/src/daemons/main.c        2007/11/30 20:36:17     1.45
@@ -166,6 +166,7 @@
 
        old_membership = member_list();
        new_ml = get_member_list(h);
+       memb_mark_down(new_ml, 0);
 
        for (x = 0; x < new_ml->cml_count; x++) {
 
@@ -182,19 +183,25 @@
                        quorate = cman_is_listening(h,
                                        new_ml->cml_members[x].cn_nodeid,
                                        port);
+
                        if (quorate == 0) {
                                clulog(LOG_DEBUG, "Node %d is not listening\n",
                                        new_ml->cml_members[x].cn_nodeid);
                                new_ml->cml_members[x].cn_member = 0;
                        } else if (quorate < 0) {
+                               if (errno == ENOTCONN) {
+                                       new_ml->cml_members[x].cn_member = 0;
+                                       break;
+                               }
                                perror("cman_is_listening");
                                usleep(50000);
                                continue;
                        }
-
 #ifdef DEBUG
-                       printf("Node %d IS listening\n",
-                              new_ml->cml_members[x].cn_nodeid);
+                       else {
+                               printf("Node %d IS listening\n",
+                                      new_ml->cml_members[x].cn_nodeid);
+                       }
 #endif
                        break;
                } while(1);
@@ -202,7 +209,6 @@
 
        cman_finish(h);
        member_list_update(new_ml);
-       member_set_state(0, 0);         /* Mark qdisk as dead */
 
        /*
         * Handle nodes lost.  Do our local node event first.
--- cluster/rgmanager/src/daemons/reslist.c     2007/07/31 18:00:25     1.19
+++ cluster/rgmanager/src/daemons/reslist.c     2007/11/30 20:36:17     1.20
@@ -26,6 +26,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #ifndef NO_CCS
--- cluster/rgmanager/src/daemons/resrules.c    2007/07/31 18:00:25     1.23
+++ cluster/rgmanager/src/daemons/resrules.c    2007/11/30 20:36:17     1.24
@@ -27,6 +27,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <ctype.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <dirent.h>
@@ -230,43 +232,70 @@
 
 
 int
-expand_time(char *val)
+expand_time (char *val)
 {
-       int l = strlen(val);
-       char c = val[l - 1];
-       int ret = atoi(val);
+       int curval, len;
+       int ret = 0;
+       char *start = val, ival[16];
 
-       if (ret <= 0)
-               return 0;
+       if (!val)
+               return (time_t)0;
+
+       while (start[0]) {
+
+               len = 0;
+               curval = 0;
+               memset(ival, 0, sizeof(ival));
+
+               while (isdigit(start[len])) {
+                       ival[len] = start[len];
+                       len++;
+               }
+
+               if (len) {
+                       curval = atoi(ival);
+               } else {
+                       len = 1;
+               }
 
-       if ((c >= '0') && (c <= '9'))
-               return ret;
+               switch(start[len]) {
+               case 0:
+               case 'S':
+               case 's':
+                       break;
+               case 'M':
+               case 'm':
+                       curval *= 60;
+                       break;
+               case 'h':
+               case 'H':
+                       curval *= 3600;
+                       break;
+               case 'd':
+               case 'D':
+                       curval *= 86400;
+                       break;
+               case 'w':
+               case 'W':
+                       curval *= 604800;
+                       break;
+               case 'y':
+               case 'Y':
+                       curval *= 31536000;
+                       break;
+               default:
+                       curval = 0;
+               }
 
-       switch(c) {
-       case 'S':
-       case 's':
-               return (ret);
-       case 'M':
-       case 'm':
-               return (ret * 60);
-       case 'h':
-       case 'H':
-               return (ret * 3600);
-       case 'd':
-       case 'D':
-               return (ret * 86400);
-       case 'w':
-       case 'W':
-               return (ret * 604800);
-       case 'y':
-       case 'Y':
-               return (ret * 31536000);
+               ret += (time_t)curval;
+               start += len;
        }
 
        return ret;
 }
 
 
+
 /**
  * Store a resource action
  * @param actsp                Action array; may be modified and returned!
--- cluster/rgmanager/src/daemons/restree.c     2007/08/30 16:09:39     1.37
+++ cluster/rgmanager/src/daemons/restree.c     2007/11/30 20:36:17     1.38
@@ -30,6 +30,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <clulog.h>
@@ -432,6 +433,39 @@
 }
 
 
+/*
+ * Attach a restart counter to a newly built resource node.
+ *
+ * The node's rn_restart_counter is left NULL unless the node has no
+ * parent, its resource has a "max_restarts" attribute greater than
+ * zero, and any "restart_expire_time" attribute expands to a nonzero
+ * time.  It also stays NULL if restart_init() fails.
+ *
+ * curres  resource whose attributes carry the policy
+ * parent  parent node, or NULL for a top-level node
+ * node    node to receive the counter
+ */
+static inline void
+assign_restart_policy(resource_t *curres, resource_node_t *parent,
+                     resource_node_t *node)
+{
+       char *val;
+       int max_restarts = 0;
+       time_t restart_expire_time = 0;
+
+       /* Check node before writing through it */
+       if (!node)
+               return;
+
+       node->rn_restart_counter = NULL;
+
+       if (!curres)
+               return;
+       if (parent) /* Only nodes without a parent get one for now */
+               return;
+
+       val = res_attr_value(curres, "max_restarts");
+       if (!val)
+               return;
+       max_restarts = atoi(val);
+       if (max_restarts <= 0)
+               return;
+       val = res_attr_value(curres, "restart_expire_time");
+       if (val) {
+               restart_expire_time = (time_t)expand_time(val);
+               if (!restart_expire_time)
+                       return;
+       }
+
+       node->rn_restart_counter = restart_init(restart_expire_time,
+                                               max_restarts);
+}
+
+
 static inline int
 do_load_resource(int ccsfd, char *base,
                 resource_rule_t *rule,
@@ -514,6 +548,7 @@
        node->rn_state = RES_STOPPED;
        node->rn_flags = 0;
        node->rn_actions = (resource_act_t *)act_dup(curres->r_actions);
+       assign_restart_policy(curres, parent, node);
 
        snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base);
 #ifndef NO_CCS
@@ -769,6 +804,11 @@
                        destroy_resource_tree(&(*tree)->rn_child);
 
                list_remove(tree, node);
+
+               if (node->rn_restart_counter) {
+                       restart_cleanup(node->rn_restart_counter);
+               }
+
                if(node->rn_actions){
                        free(node->rn_actions);
                }
--- cluster/rgmanager/src/daemons/rg_state.c    2007/08/30 16:09:39     1.40
+++ cluster/rgmanager/src/daemons/rg_state.c    2007/11/30 20:36:18     1.41
@@ -1350,7 +1350,8 @@
        }
 
        if ((svcStatus.rs_state != RG_STATE_STOPPING) &&
-            (svcStatus.rs_state != RG_STATE_ERROR)) {
+           (svcStatus.rs_state != RG_STATE_ERROR) &&
+           (svcStatus.rs_state != RG_STATE_RECOVER)) {
                rg_unlock(&lockp);
                return 0;
        }
@@ -1829,8 +1830,10 @@
         * We got sent here from handle_start_req.
         * We're DONE.
         */
-       if (request == RG_START_RECOVER)
+       if (request == RG_START_RECOVER) {
+               _svc_stop_finish(svcName, 0, RG_STATE_STOPPED);
                return RG_EFAIL;
+       }
 
        /*
         * All potential places for the service to start have been exhausted.
@@ -1839,7 +1842,7 @@
 exhausted:
        if (!rg_locked()) {
                clulog(LOG_WARNING,
-                      "#70: Attempting to restart service %s locally.\n",
+                      "#70: Failed to relocate %s; restarting locally\n",
                       svcName);
                if (svc_start(svcName, RG_START_RECOVER) == 0) {
                        *new_owner = me;
@@ -2078,6 +2081,14 @@
                                           new_owner);
        }
 
+       /* Check restart counter/timer for this resource */
+       if (check_restart(svcName) > 0) {
+               clulog(LOG_NOTICE, "Restart threshold for %s exceeded; "
+                      "attempting to relocate\n", svcName);
+               return handle_relocate_req(svcName, RG_START_RECOVER, -1,
+                                          new_owner);
+       }
+
        return handle_start_req(svcName, RG_START_RECOVER, new_owner);
 }
 
--- cluster/rgmanager/src/daemons/test.c        2007/07/31 18:02:49     1.12
+++ cluster/rgmanager/src/daemons/test.c        2007/11/30 20:36:18     1.13
@@ -25,6 +25,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <list.h>
+#include <restart_counter.h>
 #include <reslist.h>
 #include <pthread.h>
 #include <depends.h>

Reply via email to