CVSROOT: /cvs/cluster
Module name: cluster
Changes by: [EMAIL PROTECTED] 2007-11-30 20:36:18
Modified files:
    rgmanager : ChangeLog
    rgmanager/include: resgroup.h reslist.h
    rgmanager/src/daemons: Makefile fo_domain.c groups.c main.c reslist.c resrules.c restree.c rg_state.c test.c
Added files:
    rgmanager/include: restart_counter.h
    rgmanager/src/daemons: restart_counter.c
Log message:
    Merges from RHEL5 branch - round 2.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.60&r2=1.61
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/reslist.h.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/fo_domain.c.diff?cvsroot=cluster&r1=1.13&r2=1.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.39&r2=1.40
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/reslist.c.diff?cvsroot=cluster&r1=1.19&r2=1.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/resrules.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restree.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/test.c.diff?cvsroot=cluster&r1=1.12&r2=1.13
---
cluster/rgmanager/ChangeLog 2007/11/30 20:06:55 1.60 +++ cluster/rgmanager/ChangeLog 2007/11/30 20:36:17 1.61 @@ -1,6 +1,8 @@ 2007-11-30 Lon Hohberger <lhh at redhat.com> - * src/resources/*: Merge from RHEL5 branch. - * src/utils/*: Merge from RHEL5 branch. + * src/resources/*: Merge misc. updates from RHEL5 branch. + * src/utils/*: Merge misc. updates from RHEL5 branch. + * include/*.h, src/daemons/*: Merge status-counter patch + from RHEL5 branch. 2007-08-30 Lon Hohberger <lhh at redhat.com> * src/daemons/restree.c, rg_state.c: Fix tree-restart bug --- cluster/rgmanager/include/restart_counter.h 2007/11/26 21:46:26 1.1 +++ cluster/rgmanager/include/restart_counter.h 2007/11/30 20:36:17 1.2 @@ -0,0 +1,12 @@ +#ifndef _RESTART_COUNTER_H +#define _RESTART_COUNTER_H + +typedef void *restart_counter_t; + +int restart_add(restart_counter_t arg); +int restart_clear(restart_counter_t arg); +int restart_count(restart_counter_t arg); +restart_counter_t restart_init(time_t expire_timeout, int max_restarts); +int restart_cleanup(restart_counter_t arg); + +#endif --- cluster/rgmanager/include/resgroup.h 2007/06/27 14:03:51 1.23 +++ cluster/rgmanager/include/resgroup.h 2007/11/30 20:36:17 1.24 @@ -150,6 +150,8 @@ int svc_freeze(char *svcName); int svc_unfreeze(char *svcName); int svc_migrate(char *svcName, int target); +int check_restart(char *svcName); + int rt_enqueue_request(const char *resgroupname, int request, msgctx_t *resp_ctx, int max, uint32_t target, int arg0, int arg1); --- cluster/rgmanager/include/reslist.h 2007/08/02 14:53:37 1.23 +++ cluster/rgmanager/include/reslist.h 2007/11/30 20:36:17 1.24 @@ -126,6 +126,7 @@ struct _rg_node *rn_child, *rn_parent; resource_t *rn_resource; resource_act_t *rn_actions; + restart_counter_t rn_restart_counter; int rn_state; /* State of this instance of rn_resource */ int rn_flags; int rn_last_status; --- cluster/rgmanager/src/daemons/restart_counter.c 2007/11/26 21:46:27 1.1 +++ cluster/rgmanager/src/daemons/restart_counter.c 
2007/11/30 20:36:17 1.2 @@ -0,0 +1,185 @@ +/* + Copyright Red Hat, Inc. 2007 + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License version 2 as published + by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 675 Mass Ave, Cambridge, + MA 02139, USA. +*/ +/* Time-based restart counters for rgmanager */ + +#include <stdio.h> +#include <list.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <time.h> +#include <restart_counter.h> + + + +#define RESTART_INFO_MAGIC 0x184820ab + +typedef struct { + list_head(); + time_t restart_time; +} restart_item_t; + +typedef struct { + int magic; + time_t expire_timeout; + int max_restarts; + int restart_count; + restart_item_t *restart_nodes; +} restart_info_t; + + +#define VALIDATE(arg, ret) \ +do { \ + if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\ + errno = EINVAL; \ + return ret; \ + } \ +} while(0) + + +/* Remove expired restarts */ +static int +restart_timer_purge(restart_counter_t arg, time_t now) +{ + restart_info_t *restarts = (restart_info_t *)arg; + restart_item_t *i; + int x, done = 0; + + VALIDATE(arg, -1); + + /* No timeout */ + if (restarts->expire_timeout == 0) + return 0; + + do { + done = 1; + list_for(&restarts->restart_nodes, i, x) { + if ((now - i->restart_time) >= + restarts->expire_timeout) { + restarts->restart_count--; + list_remove(&restarts->restart_nodes, i); + done = 0; + break; + } + } + } while(!done); + + return 0; +} + + +int +restart_count(restart_counter_t arg) +{ + 
restart_info_t *restarts = (restart_info_t *)arg; + time_t now; + + VALIDATE(arg, -1); + now = time(NULL); + restart_timer_purge(arg, now); + return restarts->restart_count; +} + + +/* Add a restart entry to the list. Returns 1 if restart + count is exceeded */ +int +restart_add(restart_counter_t arg) +{ + restart_info_t *restarts = (restart_info_t *)arg; + restart_item_t *i; + time_t t; + + if (!arg) + /* No max restarts / threshold = always + ok to restart! */ + return 0; + + VALIDATE(arg, -1); + + i = malloc(sizeof(*i)); + if (!i) { + return -1; + } + + t = time(NULL); + i->restart_time = t; + + list_insert(&restarts->restart_nodes, i); + restarts->restart_count++; + + /* Check and remove old entries */ + restart_timer_purge(restarts, t); + + if (restarts->restart_count > restarts->max_restarts) + return 1; + + return 0; +} + + +int +restart_clear(restart_counter_t arg) +{ + restart_info_t *restarts = (restart_info_t *)arg; + restart_item_t *i; + + VALIDATE(arg, -1); + while ((i = restarts->restart_nodes)) { + list_remove(&restarts->restart_nodes, i); + free(i); + } + + restarts->restart_count = 0; + + return 0; +} + + +restart_counter_t +restart_init(time_t expire_timeout, int max_restarts) +{ + restart_info_t *info; + + if (max_restarts < 0) { + errno = EINVAL; + return NULL; + } + + info = malloc(sizeof(*info)); + if (info == NULL) + return NULL; + + info->magic = RESTART_INFO_MAGIC; + info->expire_timeout = expire_timeout; + info->max_restarts = max_restarts; + info->restart_count = 0; + + return (void *)info; +} + + +int +restart_cleanup(restart_counter_t arg) +{ + VALIDATE(arg, -1); + restart_clear(arg); + free(arg); + return 0; +} --- cluster/rgmanager/src/daemons/Makefile 2007/08/28 04:35:47 1.23 +++ cluster/rgmanager/src/daemons/Makefile 2007/11/30 20:36:17 1.24 @@ -31,12 +31,14 @@ rg_queue.o \ rg_state.o \ rg_thread.o \ + restart_counter.o \ watchdog.o OBJS2= clurmtabd.o \ clurmtabd_lib.o -OBJS3= test-noccs.o +OBJS3= test-noccs.o \ + restart_counter.o 
OBJS4= dtest-noccs.o --- cluster/rgmanager/src/daemons/fo_domain.c 2007/03/20 17:09:57 1.13 +++ cluster/rgmanager/src/daemons/fo_domain.c 2007/11/30 20:36:17 1.14 @@ -27,6 +27,7 @@ #include <list.h> #include <clulog.h> #include <resgroup.h> +#include <restart_counter.h> #include <reslist.h> #include <ccs.h> #include <pthread.h> --- cluster/rgmanager/src/daemons/groups.c 2007/08/02 14:53:38 1.39 +++ cluster/rgmanager/src/daemons/groups.c 2007/11/30 20:36:17 1.40 @@ -20,6 +20,7 @@ //#define DEBUG #include <platform.h> #include <resgroup.h> +#include <restart_counter.h> #include <reslist.h> #include <vf.h> #include <message.h> @@ -179,6 +180,29 @@ } +resource_node_t * +node_by_ref(resource_node_t **tree, char *name) +{ + resource_t *res; + resource_node_t *node, *ret = NULL; + char rgname[64]; + int x; + + list_for(&_tree, node, x) { + + res = node->rn_resource; + res_build_name(rgname, sizeof(rgname), res); + + if (!strcasecmp(name, rgname)) { + ret = node; + break; + } + } + + return ret; +} + + int count_resource_groups_local(cman_node_t *mp) { @@ -1587,6 +1611,28 @@ } +int +check_restart(char *rg_name) +{ + resource_node_t *node; + int ret = 1; + + pthread_rwlock_rdlock(&resource_lock); + node = node_by_ref(&_tree, rg_name); + if (node) { + ret = restart_add(node->rn_restart_counter); + if (ret) { + /* Clear it out - caller is about + to relocate the service anyway */ + restart_clear(node->rn_restart_counter); + } + } + pthread_rwlock_unlock(&resource_lock); + + return ret; +} + + void kill_resource_groups(void) { --- cluster/rgmanager/src/daemons/main.c 2007/09/19 09:54:19 1.44 +++ cluster/rgmanager/src/daemons/main.c 2007/11/30 20:36:17 1.45 @@ -166,6 +166,7 @@ old_membership = member_list(); new_ml = get_member_list(h); + memb_mark_down(new_ml, 0); for (x = 0; x < new_ml->cml_count; x++) { @@ -182,19 +183,25 @@ quorate = cman_is_listening(h, new_ml->cml_members[x].cn_nodeid, port); + if (quorate == 0) { clulog(LOG_DEBUG, "Node %d is not listening\n", 
new_ml->cml_members[x].cn_nodeid); new_ml->cml_members[x].cn_member = 0; } else if (quorate < 0) { + if (errno == ENOTCONN) { + new_ml->cml_members[x].cn_member = 0; + break; + } perror("cman_is_listening"); usleep(50000); continue; } - #ifdef DEBUG - printf("Node %d IS listening\n", - new_ml->cml_members[x].cn_nodeid); + else { + printf("Node %d IS listening\n", + new_ml->cml_members[x].cn_nodeid); + } #endif break; } while(1); @@ -202,7 +209,6 @@ cman_finish(h); member_list_update(new_ml); - member_set_state(0, 0); /* Mark qdisk as dead */ /* * Handle nodes lost. Do our local node event first. --- cluster/rgmanager/src/daemons/reslist.c 2007/07/31 18:00:25 1.19 +++ cluster/rgmanager/src/daemons/reslist.c 2007/11/30 20:36:17 1.20 @@ -26,6 +26,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <list.h> +#include <restart_counter.h> #include <reslist.h> #include <pthread.h> #ifndef NO_CCS --- cluster/rgmanager/src/daemons/resrules.c 2007/07/31 18:00:25 1.23 +++ cluster/rgmanager/src/daemons/resrules.c 2007/11/30 20:36:17 1.24 @@ -27,6 +27,8 @@ #include <sys/types.h> #include <sys/stat.h> #include <list.h> +#include <ctype.h> +#include <restart_counter.h> #include <reslist.h> #include <pthread.h> #include <dirent.h> @@ -230,43 +232,70 @@ int -expand_time(char *val) +expand_time (char *val) { - int l = strlen(val); - char c = val[l - 1]; - int ret = atoi(val); + int curval, len; + int ret = 0; + char *start = val, ival[16]; - if (ret <= 0) - return 0; + if (!val) + return (time_t)0; + + while (start[0]) { + + len = 0; + curval = 0; + memset(ival, 0, sizeof(ival)); + + while (isdigit(start[len])) { + ival[len] = start[len]; + len++; + } + + if (len) { + curval = atoi(ival); + } else { + len = 1; + } - if ((c >= '0') && (c <= '9')) - return ret; + switch(start[len]) { + case 0: + case 'S': + case 's': + break; + case 'M': + case 'm': + curval *= 60; + break; + case 'h': + case 'H': + curval *= 3600; + break; + case 'd': + case 'D': + curval *= 86400; + break; + 
case 'w': + case 'W': + curval *= 604800; + break; + case 'y': + case 'Y': + curval *= 31536000; + break; + default: + curval = 0; + } - switch(c) { - case 'S': - case 's': - return (ret); - case 'M': - case 'm': - return (ret * 60); - case 'h': - case 'H': - return (ret * 3600); - case 'd': - case 'D': - return (ret * 86400); - case 'w': - case 'W': - return (ret * 604800); - case 'y': - case 'Y': - return (ret * 31536000); + ret += (time_t)curval; + start += len; } return ret; } + /** * Store a resource action * @param actsp Action array; may be modified and returned! --- cluster/rgmanager/src/daemons/restree.c 2007/08/30 16:09:39 1.37 +++ cluster/rgmanager/src/daemons/restree.c 2007/11/30 20:36:17 1.38 @@ -30,6 +30,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <list.h> +#include <restart_counter.h> #include <reslist.h> #include <pthread.h> #include <clulog.h> @@ -432,6 +433,39 @@ } +static inline void +assign_restart_policy(resource_t *curres, resource_node_t *parent, + resource_node_t *node) +{ + char *val; + int max_restarts = 0; + time_t restart_expire_time = 0; + + node->rn_restart_counter = NULL; + + if (!curres || !node) + return; + if (parent) /* Non-parents don't get one for now */ + return; + + val = res_attr_value(curres, "max_restarts"); + if (!val) + return; + max_restarts = atoi(val); + if (max_restarts <= 0) + return; + val = res_attr_value(curres, "restart_expire_time"); + if (val) { + restart_expire_time = (time_t)expand_time(val); + if (!restart_expire_time) + return; + } + + node->rn_restart_counter = restart_init(restart_expire_time, + max_restarts); +} + + static inline int do_load_resource(int ccsfd, char *base, resource_rule_t *rule, @@ -514,6 +548,7 @@ node->rn_state = RES_STOPPED; node->rn_flags = 0; node->rn_actions = (resource_act_t *)act_dup(curres->r_actions); + assign_restart_policy(curres, parent, node); snprintf(tok, sizeof(tok), "%s/@__independent_subtree", base); #ifndef NO_CCS @@ -769,6 +804,11 @@ 
destroy_resource_tree(&(*tree)->rn_child); list_remove(tree, node); + + if (node->rn_restart_counter) { + restart_cleanup(node->rn_restart_counter); + } + if(node->rn_actions){ free(node->rn_actions); } --- cluster/rgmanager/src/daemons/rg_state.c 2007/08/30 16:09:39 1.40 +++ cluster/rgmanager/src/daemons/rg_state.c 2007/11/30 20:36:18 1.41 @@ -1350,7 +1350,8 @@ } if ((svcStatus.rs_state != RG_STATE_STOPPING) && - (svcStatus.rs_state != RG_STATE_ERROR)) { + (svcStatus.rs_state != RG_STATE_ERROR) && + (svcStatus.rs_state != RG_STATE_RECOVER)) { rg_unlock(&lockp); return 0; } @@ -1829,8 +1830,10 @@ * We got sent here from handle_start_req. * We're DONE. */ - if (request == RG_START_RECOVER) + if (request == RG_START_RECOVER) { + _svc_stop_finish(svcName, 0, RG_STATE_STOPPED); return RG_EFAIL; + } /* * All potential places for the service to start have been exhausted. @@ -1839,7 +1842,7 @@ exhausted: if (!rg_locked()) { clulog(LOG_WARNING, - "#70: Attempting to restart service %s locally.\n", + "#70: Failed to relocate %s; restarting locally\n", svcName); if (svc_start(svcName, RG_START_RECOVER) == 0) { *new_owner = me; @@ -2078,6 +2081,14 @@ new_owner); } + /* Check restart counter/timer for this resource */ + if (check_restart(svcName) > 0) { + clulog(LOG_NOTICE, "Restart threshold for %s exceeded; " + "attempting to relocate\n", svcName); + return handle_relocate_req(svcName, RG_START_RECOVER, -1, + new_owner); + } + return handle_start_req(svcName, RG_START_RECOVER, new_owner); } --- cluster/rgmanager/src/daemons/test.c 2007/07/31 18:02:49 1.12 +++ cluster/rgmanager/src/daemons/test.c 2007/11/30 20:36:18 1.13 @@ -25,6 +25,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <list.h> +#include <restart_counter.h> #include <reslist.h> #include <pthread.h> #include <depends.h>