CVSROOT:        /cvs/cluster
Module name:    cluster
Branch:         RHEL5
Changes by:     [EMAIL PROTECTED]       2008-02-01 15:15:03

Modified files:
        rgmanager/include: resgroup.h restart_counter.h 
        rgmanager/src/daemons: groups.c restart_counter.c rg_state.c 
                               slang_event.c 
        rgmanager/src/resources: default_event_script.sl 

Log message:
        Allow restart counters to work with central_processing; #400211 / #431130

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.9&r2=1.15.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.2&r2=1.1.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.14&r2=1.25.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.17&r2=1.24.2.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/slang_event.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.2.1&r2=1.3.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/default_event_script.sl.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.3&r2=1.1.2.4

--- cluster/rgmanager/include/resgroup.h        2007/12/18 17:52:56     1.15.2.9
+++ cluster/rgmanager/include/resgroup.h        2008/02/01 15:15:02     1.15.2.10
@@ -135,6 +135,7 @@
 int svc_fail(char *svcName);
 int svc_migrate(char *svcName, int target);
 int check_restart(char *svcName);
+int add_restart(char *svcName);
 
 int rt_enqueue_request(const char *resgroupname, int request,
                       msgctx_t *resp_ctx,
--- cluster/rgmanager/include/restart_counter.h 2007/12/18 17:52:56     1.1.2.2
+++ cluster/rgmanager/include/restart_counter.h 2008/02/01 15:15:02     1.1.2.3
@@ -25,6 +25,7 @@
 int restart_add(restart_counter_t arg);
 int restart_clear(restart_counter_t arg);
 int restart_count(restart_counter_t arg);
+int restart_threshold_exceeded(restart_counter_t arg);
 restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
 int restart_cleanup(restart_counter_t arg);
 
--- cluster/rgmanager/src/daemons/groups.c      2007/12/18 17:52:56     1.25.2.14
+++ cluster/rgmanager/src/daemons/groups.c      2008/02/01 15:15:02     1.25.2.15
@@ -1787,7 +1787,7 @@
 
 
 int
-check_restart(char *rg_name)
+add_restart(char *rg_name)
 {
        resource_node_t *node;
        int ret = 1;
@@ -1796,11 +1796,24 @@
        node = node_by_ref(&_tree, rg_name);
        if (node) {
                ret = restart_add(node->rn_restart_counter);
-               if (ret) {
-                       /* Clear it out - caller is about 
-                          to relocate the service anyway */
-                       restart_clear(node->rn_restart_counter);
-               }
+       }
+       pthread_rwlock_unlock(&resource_lock);
+
+       return ret;
+}
+
+
+int
+check_restart(char *rg_name)
+{
+       resource_node_t *node;
+       int ret = 0;
+
+       pthread_rwlock_rdlock(&resource_lock);
+       node = node_by_ref(&_tree, rg_name);
+       if (node) {
+               printf("%s %p\n", rg_name, node->rn_restart_counter);
+               ret = restart_threshold_exceeded(node->rn_restart_counter);
        }
        pthread_rwlock_unlock(&resource_lock);
 
--- cluster/rgmanager/src/daemons/restart_counter.c     2007/11/26 21:46:27     1.1.2.1
+++ cluster/rgmanager/src/daemons/restart_counter.c     2008/02/01 15:15:02     1.1.2.2
@@ -46,6 +46,10 @@
 
 #define VALIDATE(arg, ret) \
 do { \
+       if (!arg) {\
+               errno = EINVAL; \
+               return ret; \
+       } \
        if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\
                errno = EINVAL; \
                return ret; \
@@ -97,6 +101,21 @@
 }
 
 
+int
+restart_threshold_exceeded(restart_counter_t arg)
+{
+       restart_info_t *restarts = (restart_info_t *)arg;
+       time_t now;
+
+       VALIDATE(arg, -1);
+       now = time(NULL);
+       restart_timer_purge(arg, now);
+       if (restarts->restart_count >= restarts->max_restarts)
+               return 1;
+       return 0;
+}
+
+
 /* Add a restart entry to the list.  Returns 1 if restart
    count is exceeded */
 int
@@ -127,7 +146,7 @@
        /* Check and remove old entries */
        restart_timer_purge(restarts, t);
 
-       if (restarts->restart_count > restarts->max_restarts)
+       if (restarts->restart_count >= restarts->max_restarts)
                return 1;
 
        return 0;
@@ -170,6 +189,7 @@
        info->expire_timeout = expire_timeout;
        info->max_restarts = max_restarts;
        info->restart_count = 0;
+       info->restart_nodes = NULL;
 
        return (void *)info;
 }
--- cluster/rgmanager/src/daemons/rg_state.c    2008/01/25 18:09:24     1.24.2.17
+++ cluster/rgmanager/src/daemons/rg_state.c    2008/02/01 15:15:02     1.24.2.18
@@ -678,7 +678,6 @@
                        clulog(LOG_NOTICE,
                               "Recovering failed service %s\n",
                               svcName);
-                       svcStatus->rs_state = RG_STATE_STOPPED;
                        /* Start! */
                        ret = 1;
                        break;
@@ -789,13 +788,16 @@
        /* LOCK HELD if we get here */
 
        svcStatus.rs_owner = my_id();
-       svcStatus.rs_state = RG_STATE_STARTING;
        svcStatus.rs_transition = (uint64_t)time(NULL);
 
-       if (req == RG_START_RECOVER)
+       if (svcStatus.rs_state == RG_STATE_RECOVER) {
+               add_restart(svcName);
                svcStatus.rs_restarts++;
-       else
+       } else {
                svcStatus.rs_restarts = 0;
+       }
+
+       svcStatus.rs_state = RG_STATE_STARTING;
 
        if (set_rg_state(svcName, &svcStatus) < 0) {
                clulog(LOG_ERR,
@@ -1248,7 +1250,7 @@
 {
        struct dlm_lksb lockp;
        rg_state_t svcStatus;
-       int ret;
+       int ret = 0;
        int old_state;
 
        if (!rg_quorate()) {
@@ -1291,6 +1293,18 @@
 
        old_state = svcStatus.rs_state;
 
+       if (old_state == RG_STATE_RECOVER) {
+               clulog(LOG_DEBUG, "%s is clean; skipping double-stop\n",
+                      svcName);
+               svcStatus.rs_state = newstate;
+
+               if (set_rg_state(svcName, &svcStatus) != 0) {
+                       rg_unlock(&lockp);
+                       clulog(LOG_ERR, "#52: Failed changing RG status\n");
+                       return RG_EFAIL;
+               }
+       } 
+
        clulog(LOG_NOTICE, "Stopping service %s\n", svcName);
 
        if (recover) 
--- cluster/rgmanager/src/daemons/slang_event.c 2007/12/18 17:52:56     1.3.2.1
+++ cluster/rgmanager/src/daemons/slang_event.c 2008/02/01 15:15:02     1.3.2.2
@@ -80,6 +80,7 @@
    _node_clean = 0,
    _service_owner = 0,
    _service_last_owner = 0,
+   _service_restarts_exceeded = 0,
    _user_request = 0,
    _user_arg1 = 0,
    _user_arg2 = 0,
@@ -123,6 +124,8 @@
        MAKE_VARIABLE("service_owner",  &_service_owner,SLANG_INT_TYPE, 1),
        MAKE_VARIABLE("service_last_owner", &_service_last_owner,
                                                        SLANG_INT_TYPE, 1),
+       MAKE_VARIABLE("service_restarts_exceeded", &_service_restarts_exceeded,
+                                                       SLANG_INT_TYPE, 1),
 
        /* User event information */
        MAKE_VARIABLE("user_request",   &_user_request, SLANG_INT_TYPE,1),
@@ -226,6 +229,7 @@
 sl_service_status(char *svcName)
 {
        rg_state_t svcStatus;
+       int restarts_exceeded = 0;
        char *state_str;
 
        if (get_service_state_internal(svcName, &svcStatus) < 0) {
@@ -236,6 +240,15 @@
                return;
        }
 
+       restarts_exceeded = check_restart(svcName);
+       if (SLang_push_integer(restarts_exceeded) < 0) {
+               SLang_verror(SL_RunTime_Error,
+                            "%s: Failed to push restarts_exceeded %s",
+                            __FUNCTION__,
+                            svcName);
+               return;
+       }
+
        if (SLang_push_integer(svcStatus.rs_restarts) < 0) {
                SLang_verror(SL_RunTime_Error,
                             "%s: Failed to push restarts for %s",
@@ -1085,6 +1098,7 @@
        _service_state = (char *)rg_state_str(state);
        _service_owner = owner;
        _service_last_owner = last_owner;
+       _service_restarts_exceeded = check_restart(name);
 
        switch(state) {
        case RG_STATE_DISABLED:
@@ -1102,6 +1116,7 @@
        _service_state = 0;
        _service_owner = 0;
        _service_last_owner = 0;
+       _service_restarts_exceeded = 0;
 
        return ret;
 }
--- cluster/rgmanager/src/resources/default_event_script.sl     2007/12/19 21:33:26     1.1.2.3
+++ cluster/rgmanager/src/resources/default_event_script.sl     2008/02/01 15:15:03     1.1.2.4
@@ -154,7 +154,8 @@
                debug("Recovering",
                      " Service: ", service_name,
                      " Last owner: ", service_last_owner,
-                     " Policy: ", policy);
+                     " Policy: ", policy,
+                     " RTE: ", service_restarts_exceeded);
 
                if (policy == "disable") {
                        () = service_stop(service_name, 1);
@@ -162,13 +163,17 @@
                }
 
                nodes = allowed_nodes(service_name);
-               if (policy == "restart") {
-                       tmp = union(service_last_owner, nodes);
+               if (policy == "restart" and service_restarts_exceeded == 0) {
+                       nodes = union(service_last_owner, nodes);
                } else {
                        % relocate 
                        tmp = subtract(nodes, service_last_owner);
-                       nodes = tmp;
-                       tmp = union(nodes, service_last_owner);
+                       if (length(tmp) == 0) {
+                               () = service_stop(service_name,0);
+                               return;
+                       }
+
+                       nodes = union(tmp, service_last_owner);
                }
 
                ()=move_or_start(service_name, nodes);

Reply via email to