Linux-ha-cvs Digest, Vol 29, Issue 113

linux-ha-cvs-request Fri, 21 Apr 2006 10:47:51 -0700

Send Linux-ha-cvs mailing list submissions to
        [email protected]


To subscribe or unsubscribe via the World Wide Web, visit
        http://lists.community.tummy.com/mailman/listinfo/linux-ha-cvs
or, via email, send a message with subject or body 'help' to
        [EMAIL PROTECTED]

You can reach the person managing the list at
        [EMAIL PROTECTED]

When replying, please edit your Subject line so it is more specific
than "Re: Contents of Linux-ha-cvs digest..."


Today's Topics:

   1. Linux-HA CVS: crm by andrew from 
      ([email protected])
   2. Linux-HA CVS: cts by blaschke from 
      ([email protected])
   3. Linux-HA CVS: heartbeat by alan from 
      ([email protected])


----------------------------------------------------------------------

Message: 1
Date: Fri, 21 Apr 2006 03:18:27 -0600 (MDT)
From: [email protected]
Subject: [Linux-ha-cvs] Linux-HA CVS: crm by andrew from 
To: [EMAIL PROTECTED]
Message-ID: <[EMAIL PROTECTED]>

linux-ha CVS committal

Author  : andrew
Host    : 
Project : linux-ha
Module  : crm

Dir     : linux-ha/crm/pengine


Modified Files:
        stages.c utils.c pe_utils.h 


Log Message:


Log a warning if more than one node has the highest score for running a
  given resource.

===================================================================
RCS file: /home/cvs/linux-ha/linux-ha/crm/pengine/stages.c,v
retrieving revision 1.90
retrieving revision 1.91
diff -u -3 -r1.90 -r1.91
--- stages.c    3 Apr 2006 09:51:56 -0000       1.90
+++ stages.c    21 Apr 2006 09:18:26 -0000      1.91
@@ -1,4 +1,4 @@
-/* $Id: stages.c,v 1.90 2006/04/03 09:51:56 andrew Exp $ */
+/* $Id: stages.c,v 1.91 2006/04/21 09:18:26 andrew Exp $ */
 /* 
  * Copyright (C) 2004 Andrew Beekhof <[EMAIL PROTECTED]>
  * 
@@ -511,11 +511,13 @@
        */
        GListPtr nodes = color->details->candidate_nodes;
        node_t *chosen = NULL;
+       int multiple = 0;
 
        crm_debug_2("Choosing node for color %d", color->id);
        color->details->candidate_nodes = g_list_sort(nodes, sort_node_weight);
+       nodes = color->details->candidate_nodes;
 
-       chosen = g_list_nth_data(color->details->candidate_nodes, 0);
+       chosen = g_list_nth_data(nodes, 0);
 
        color->details->chosen_node = NULL;
        color->details->pending = FALSE;
@@ -537,11 +539,38 @@
                return FALSE;
        }
 
+
+       slist_iter(candidate, node_t, nodes, lpc, 
+                  if(chosen->weight > 0
+                     && candidate->details->unclean == FALSE
+                     && candidate->weight == chosen->weight) {
+                          multiple++;
+                  } else {
+                          break;
+                  }
+               );
+
+       if(multiple > 1) {
+               int log_level = LOG_WARNING;
+               char *score = score2char(chosen->weight);
+               
+               crm_warn("%d nodes with equal score (%s) for running the"
+                        " listed resources (chose %s):",
+                        multiple, score, chosen->details->uname);
+               slist_iter(rsc, resource_t,
+                          color->details->allocated_resources, lpc,
+                          rsc->fns->print(
+                                  rsc, "\t", pe_print_log|pe_print_rsconly,
+                                  &log_level);
+                       );
+               crm_free(score);
+       }
+       
        /* todo: update the old node for each resource to reflect its
         * new resource count
         */
 
-       crm_debug_2("assigned %s to color %d", chosen->details->uname, 
color->id);
+       crm_debug_2("assigned %s to color %d",chosen->details->uname,color->id);
        chosen->details->num_resources += color->details->num_resources;
        color->details->chosen_node = node_copy(chosen);
        
===================================================================
RCS file: /home/cvs/linux-ha/linux-ha/crm/pengine/utils.c,v
retrieving revision 1.131
retrieving revision 1.132
diff -u -3 -r1.131 -r1.132
--- utils.c     21 Apr 2006 07:08:04 -0000      1.131
+++ utils.c     21 Apr 2006 09:18:26 -0000      1.132
@@ -1,4 +1,4 @@
-/* $Id: utils.c,v 1.131 2006/04/21 07:08:04 andrew Exp $ */
+/* $Id: utils.c,v 1.132 2006/04/21 09:18:26 andrew Exp $ */
 /* 
  * Copyright (C) 2004 Andrew Beekhof <[EMAIL PROTECTED]>
  * 
@@ -1855,6 +1855,20 @@
        return score_f;
 }
 
+
+char *
+score2char(int score) 
+{
+
+       if(score >= INFINITY) {
+               return crm_strdup("+"INFINITY_S);
+
+       } else if(score <= -INFINITY) {
+               return crm_strdup("-"INFINITY_S);
+       } 
+       return crm_itoa(score);
+}
+
 rsc_to_node_t *
 rsc2node_new(const char *id, resource_t *rsc,
             int node_weight, node_t *foo_node, pe_working_set_t *data_set)
===================================================================
RCS file: /home/cvs/linux-ha/linux-ha/crm/pengine/pe_utils.h,v
retrieving revision 1.41
retrieving revision 1.42
diff -u -3 -r1.41 -r1.42
--- pe_utils.h  31 Mar 2006 12:05:37 -0000      1.41
+++ pe_utils.h  21 Apr 2006 09:18:26 -0000      1.42
@@ -1,4 +1,4 @@
-/* $Id: pe_utils.h,v 1.41 2006/03/31 12:05:37 andrew Exp $ */
+/* $Id: pe_utils.h,v 1.42 2006/04/21 09:18:26 andrew Exp $ */
 /* 
  * Copyright (C) 2004 Andrew Beekhof <[EMAIL PROTECTED]>
  * 
@@ -173,6 +173,7 @@
 extern const char *fail2text(enum action_fail_response fail);
 
 extern int char2score(const char *score);
+extern char *score2char(int score);
 
 
 /* free the various structures */




------------------------------

Message: 2
Date: Fri, 21 Apr 2006 10:04:35 -0600 (MDT)
From: [email protected]
Subject: [Linux-ha-cvs] Linux-HA CVS: cts by blaschke from 
To: [EMAIL PROTECTED]
Message-ID: <[EMAIL PROTECTED]>

linux-ha CVS committal

Author  : blaschke
Host    : 
Project : linux-ha
Module  : cts

Dir     : linux-ha/cts


Modified Files:
        CTSlab.py.in 


Log Message:

fix one missed typo


===================================================================
RCS file: /home/cvs/linux-ha/linux-ha/cts/CTSlab.py.in,v
retrieving revision 1.62
retrieving revision 1.63
diff -u -3 -r1.62 -r1.63
--- CTSlab.py.in        19 Apr 2006 14:52:32 -0000      1.62
+++ CTSlab.py.in        21 Apr 2006 16:04:34 -0000      1.63
@@ -784,7 +784,7 @@
         
     tests.summarize()
     if tests.Stats["failure"] > 0:
-        sys.exit(test.Stats["failure"])
+        sys.exit(tests.Stats["failure"])
 
     elif tests.Stats["success"] != NumIter:
         cm.Env.log("No failure count but success != requested iterations")




------------------------------

Message: 3
Date: Fri, 21 Apr 2006 11:47:46 -0600 (MDT)
From: [email protected]
Subject: [Linux-ha-cvs] Linux-HA CVS: heartbeat by alan from 
To: [EMAIL PROTECTED]
Message-ID: <[EMAIL PROTECTED]>

linux-ha CVS committal

Author  : alan
Host    : 
Project : linux-ha
Module  : heartbeat

Dir     : linux-ha/heartbeat


Modified Files:
        heartbeat.c 


Log Message:
Put in a bug fix which only occurs if we send lots of packets
before initdead expires, and we're the only node up.
It caused problems in BSC with a 10ms heartbeat time.
What triggered the need for this patch was decreasing the window
size of the protocol to "only" 200 packets (instead of 1000).

===================================================================
RCS file: /home/cvs/linux-ha/linux-ha/heartbeat/heartbeat.c,v
retrieving revision 1.503
retrieving revision 1.504
diff -u -3 -r1.503 -r1.504
--- heartbeat.c 20 Apr 2006 17:14:56 -0000      1.503
+++ heartbeat.c 21 Apr 2006 17:47:46 -0000      1.504
@@ -1,8 +1,4 @@
-/*
- * TODO:
- * 1) Man page update
- */
-/* $Id: heartbeat.c,v 1.503 2006/04/20 17:14:56 alan Exp $ */
+/* $Id: heartbeat.c,v 1.504 2006/04/21 17:47:46 alan Exp $ */
 /*
  * heartbeat: Linux-HA heartbeat code
  *
@@ -283,6 +279,8 @@
 
 #define        ALWAYSRESTART_ON_SPLITBRAIN     1
 
+#define        FLOWCONTROL_LIMIT        ((seqno_t)(MAXMSGHIST/2))
+
 
 static char                    hbname []= "heartbeat";
 const char *                   cmdname = hbname;
@@ -334,8 +332,9 @@
 static int                     deadtime_tmpadd_count = 0;
 gboolean                       enable_flow_control = TRUE;
 static int                     send_cluster_msg_level = 0;
+static int                     live_node_count = 1; /* namely us... */
 static void print_a_child_client(gpointer childentry, gpointer unused);
-static seqno_t timer_lowseq = 0;
+static seqno_t                 timer_lowseq = 0;
 static gboolean                init_deadtime_passed = FALSE;           
 static int                     PrintDefaults = FALSE;
 static int                     WikiOutput = FALSE;
@@ -399,6 +398,7 @@
 static void    init_xmit_hist (struct msg_xmit_hist * hist);
 static void    process_rexmit(struct msg_xmit_hist * hist
 ,                      struct ha_msg* msg);
+static void    update_ackseq(seqno_t new_ackseq) ;
 static void    process_clustermsg(struct ha_msg* msg, struct link* lnk);
 extern void    process_registerevent(IPC_Channel* chan,  gpointer user_data);
 static void    nak_rexmit(struct msg_xmit_hist * hist, 
@@ -2148,21 +2148,20 @@
 HBDoMsg_T_ACKMSG(const char * type, struct node_info * fromnode,
              TIME_T msgtime, seqno_t seqno, const char * iface, struct ha_msg 
* msg)
 {
-       const char*     ackseq_str = ha_msg_value(msg, F_ACKSEQ);
-       seqno_t         ackseq;
-       struct msg_xmit_hist* hist = &msghist;  
-       const char*     to =  
-               (const char*)ha_msg_value(msg, F_TO);
-       struct node_info* tonode;
-       seqno_t         old_hist_ackseq;
+       const char*             ackseq_str = ha_msg_value(msg, F_ACKSEQ);
+       seqno_t                 ackseq;
+       struct msg_xmit_hist*   hist = &msghist;        
+       const char*             to =  (const char*)ha_msg_value(msg, F_TO);
+       struct node_info*       tonode;
+       seqno_t                 new_ackseq = hist->ackseq;
        
        if (!to || (tonode = lookup_tables(to, NULL)) == NULL
-           || tonode != curnode){
+       ||      tonode != curnode){
                return;
        }
 
-       if (ackseq_str == NULL||
-           sscanf(ackseq_str, "%lx", &ackseq) != 1){
+       if (ackseq_str == NULL
+       ||      sscanf(ackseq_str, "%lx", &ackseq) != 1){
                goto out;
        }
 
@@ -2172,11 +2171,10 @@
                goto out;
        }
        
-       if (ackseq <= hist->ackseq){
+       if (ackseq <= new_ackseq){
                /* late or dup ack
                 * ignored
                 */
-               
                goto out;
        }else if (ackseq > hist->hiseq){
                cl_log(LOG_ERR, "HBDoMsg_T_ACK"
@@ -2187,7 +2185,7 @@
                goto out;
        }
        
-       if ( ackseq < fromnode->track.ackseq){
+       if (ackseq < fromnode->track.ackseq) {
                /* late or dup ack
                 * ignored
                 */
@@ -2196,20 +2194,18 @@
        
        fromnode->track.ackseq = ackseq;
        
-       if (hist->lowest_acknode != NULL &&
-           STRNCMP_CONST(hist->lowest_acknode->status, 
-                   DEADSTATUS) == 0){
+       if (hist->lowest_acknode != NULL
+       &&      STRNCMP_CONST(hist->lowest_acknode->status,DEADSTATUS)==0){
                /* the lowest acked node is dead
-                * we cannnot count on that node 
+                * we cannot count on that node 
                 * to update our ackseq
                 */
                hist->lowest_acknode = NULL;
        }
 
-       old_hist_ackseq = hist->ackseq;
        
-       if (hist->lowest_acknode == NULL ||
-           hist->lowest_acknode == fromnode){
+       if (hist->lowest_acknode == NULL
+       ||      hist->lowest_acknode == fromnode){
                /*find the new lowest and update hist->ackseq*/
                seqno_t minseq;
                int     minidx;
@@ -2221,79 +2217,103 @@
                for (i = 0; i < config->nodecount; i++){
                        struct node_info* hip = &config->nodes[i];
                        
-                       if (STRNCMP_CONST(hip->status,DEADSTATUS) == 0
-                           || hip->nodetype == PINGNODE_I){
+                       if (hip->nodetype == PINGNODE_I
+                       ||      STRNCMP_CONST(hip->status, DEADSTATUS) == 0) {
                                continue;
                        }
                        
-                       if (minidx == -1 || 
-                           hip->track.ackseq < minseq){
+                       if (minidx == -1
+                       ||      hip->track.ackseq < minseq){
                                minseq = hip->track.ackseq;
                                minidx = i;
                        }
                }
                
-               if (minidx == -1){
-                       /*each node is in either DEASTATUS or INITSTATUS*/
+               if (minidx == -1) {
+                       /* Every node is DEADSTATUS */
                        goto out;
                }
-               if (minidx == config->nodecount){
+               if (live_node_count < 2) {
+                       /*
+                        * Update hist->ackseq so we don't hang onto
+                        * messages indefinitely and flow control clients
+                        */
+                       if ((hist->hiseq - new_ackseq) >= FLOWCONTROL_LIMIT) {
+                               new_ackseq = hist->hiseq - 
(FLOWCONTROL_LIMIT-1);
+                       }
+                       hist->lowest_acknode = NULL;
+                       goto cleanupandout;
+               }
+               if (minidx >= config->nodecount) {
                        cl_log(LOG_ERR, "minidx out of bound"
                               "minidx=%d",minidx );
                        goto out;
                }
 
 
-               if (minseq != 0){                       
-                       hist->ackseq = minseq;
+               if (minseq > 0){
+                       new_ackseq = minseq;
                }
-               
                hist->lowest_acknode = &config->nodes[minidx];
-               
-               if (hist->hiseq - hist->ackseq < MAXMSGHIST/2){
-                       all_clients_resume();
-               }
        }
        
+cleanupandout:
+       update_ackseq(new_ackseq);
+out:
+       return;
+}
+
+static void
+update_ackseq(seqno_t new_ackseq) 
+{
+       struct msg_xmit_hist*   hist = &msghist;        
+       long                    count;
+       seqno_t                 start;
+       seqno_t                 old_ackseq = hist->ackseq;
+
 #if 0  
-       cl_log(LOG_INFO, "hist->ackseq =%ld, old_hist_ackseq=%ld",
-              hist->ackseq, old_hist_ackseq);
+       cl_log(LOG_INFO, "new_ackseq = %ld, old_ackseq=%ld"
+       ,       new_ackseq, old_ackseq);
 #endif
-       if (hist->ackseq > old_hist_ackseq){
-               long count;
-               seqno_t start;
-               count = hist->ackseq - hist->lowseq - send_cluster_msg_level;
-               if (old_hist_ackseq == 0){
-                       start = 0;
-                       count = count - 1;
-               }else{
-                       start = hist->lowseq;
+       if (new_ackseq <= old_ackseq){
+               return;
+       }
+       hist->ackseq = new_ackseq;
+
+       if ((hist->hiseq - hist->ackseq) < FLOWCONTROL_LIMIT){
+               all_clients_resume();
+       }
+
+       count = hist->ackseq - hist->lowseq - send_cluster_msg_level;
+       if (old_ackseq == 0){
+               start = 0;
+               count = count - 1;
+       }else{
+               start = hist->lowseq;
+       }
+       
+       while(count -- > 0){
+               /*
+                * If the seq number is greater than the lowseq number
+                * the timer set, we should not free any more messages
+                */
+               if (start > timer_lowseq){
+                       break;
                }
-               
-               while(count -- > 0){
-                       
-                       /*if the seq number is greater than the lowseq number
-                         the timer set, we should not free any more messages*/
-                       if (start > timer_lowseq){
-                               break;
-                       }
-                       
-                       free_one_hist_slot(hist, start%MAXMSGHIST);
-                       start++;
-                       
-                       if (hist->lowseq > hist->ackseq){
-                               cl_log(LOG_ERR, "lowseq cannnot be greater than 
ackseq");
-                               cl_log(LOG_INFO, "hist->ackseq =%ld, 
old_hist_ackseq=%ld",
-                                      hist->ackseq, old_hist_ackseq);
-                               cl_log(LOG_INFO, "hist->lowseq =%ld, 
hist->hiseq=%ld,"
-                                      "send_cluster_msg_level=%d",
-                                      hist->lowseq, hist->hiseq, 
send_cluster_msg_level);
-                               abort();
-                       }
+       
+               free_one_hist_slot(hist, start%MAXMSGHIST);
+               start++;
 
+               if (hist->lowseq > hist->ackseq){
+                       cl_log(LOG_ERR, "lowseq cannnot be greater than 
ackseq");
+                       cl_log(LOG_INFO, "hist->ackseq =%ld, old_ackseq=%ld"
+                       ,       hist->ackseq, old_ackseq);
+                       cl_log(LOG_INFO, "hist->lowseq =%ld, hist->hiseq=%ld"
+                       ", send_cluster_msg_level=%d"
+                       ,       hist->lowseq, hist->hiseq, 
send_cluster_msg_level);
+                       abort();
                }
        }
-
        (void)dump_missing_pkts_info;
 
 #ifdef DEBUG_FOR_GSHI
@@ -2312,13 +2332,8 @@
        }
 
 #endif 
- out:
-
-       return;
 }
 
-
-
 static int
 getnodes(const char* nodelist, char** nodes, int* num){
        
@@ -2798,9 +2813,30 @@
                        ,       "Status seqno: %ld msgtime: %ld"
                        ,       seqno, msgtime);
                }
+               /*
+                *   IF
+                *      It's from a normal node
+                *      It isn't from us
+                *      The node's old status was dead or init
+                *      The node's new status is up or active
+                *   THEN
+                *      increment the count of live nodes.
+                */
+               if (fromnode->nodetype == NORMALNODE_I
+               &&      fromnode != curnode
+               &&      ( STRNCMP_CONST(fromnode->status, DEADSTATUS) == 0
+               ||        STRNCMP_CONST(fromnode->status, INITSTATUS) == 0)
+               &&      ( STRNCMP_CONST(status, UPSTATUS) == 0
+               ||        STRNCMP_CONST(status, ACTIVESTATUS) == 0)) {
+                       ++live_node_count;
+                       if (live_node_count > config->nodecount) {
+                               cl_log(LOG_ERR
+                               ,       "live_node_count too big (%d)"
+                               ,       live_node_count);
+                       }
+               }
                
-               strncpy(fromnode->status, status
-               ,       sizeof(fromnode->status));
+               strncpy(fromnode->status, status, sizeof(fromnode->status));
                if (!fromnode->status_suppressed) {
                        QueueRemoteRscReq(PerformQueuedNotifyWorld, msg);
                        heartbeat_monitor(msg, KEEPIT, iface);
@@ -4190,6 +4226,11 @@
                return;
        }
 
+       if (hip->nodetype == NORMALNODE_I
+       &&      STRNCMP_CONST(hip->status, DEADSTATUS) != 0
+       &&      STRNCMP_CONST(hip->status, INITSTATUS) != 0) {
+               --live_node_count;
+       }
        strncpy(hip->status, DEADSTATUS, sizeof(hip->status));
        
 
@@ -5687,7 +5728,7 @@
        
        struct msg_xmit_hist* hist = &msghist;
        
-       return hist->hiseq - hist->ackseq > MAXMSGHIST/2;
+       return hist->hiseq - hist->ackseq > FLOWCONTROL_LIMIT;
        
 }
 
@@ -5729,20 +5770,30 @@
        hist->lastrexmit[slot] = 0L;
        hist->lastmsg = slot;
        
-       if (enable_flow_control && hist->hiseq - hist->lowseq > MAXMSGHIST*3 / 
4){
-               cl_log(LOG_ERR, "Message hist queue is filling up (%d messages 
in queue)",
-                      (int)(hist->hiseq - hist->lowseq));
+       if (enable_flow_control
+       &&      live_node_count > 1
+       &&      (hist->hiseq - hist->lowseq) > ((MAXMSGHIST*3)/4)) {
+               cl_log(LOG_ERR
+               ,       "Message hist queue is filling up"
+               " (%d messages in queue)"
+               ,       (int)(hist->hiseq - hist->lowseq));
+               hist_display(hist);
        }
-       
 
        AUDITXMITHIST;
        
        if (enable_flow_control
-           && hist->hiseq - hist->ackseq > MAXMSGHIST/2){
-               all_clients_pause();
-               hist_display(hist);
+       &&      hist->hiseq - hist->ackseq > FLOWCONTROL_LIMIT){
+               if (live_node_count < 2) {
+                       update_ackseq(hist->hiseq - (FLOWCONTROL_LIMIT-1));
+                       all_clients_resume();
+               }else{
+                       cl_log(LOG_ERR, "Flow control engaged with %d live 
nodes"
+                       ,       live_node_count);
+                       all_clients_pause();
+                       hist_display(hist);
+               }
        }
-
 }
 
 
@@ -6132,6 +6183,13 @@
 
 /*
  * $Log: heartbeat.c,v $
+ * Revision 1.504  2006/04/21 17:47:46  alan
+ * Put in a bug fix which only occurs if we send lots of packets
+ * before initdead expires, and we're the only node up.
+ * It caused problems in BSC with a 10ms heartbeat time.
+ * What triggered the need for this patch was decreasing the window
+ * size of the protocol to "only" 200 packets (instead of 1000).
+ *
  * Revision 1.503  2006/04/20 17:14:56  alan
  * Changed some timing code to not be quite a particular as it had been...
  *




------------------------------

_______________________________________________
Linux-ha-cvs mailing list
[email protected]
http://lists.community.tummy.com/mailman/listinfo/linux-ha-cvs


End of Linux-ha-cvs Digest, Vol 29, Issue 113
*********************************************

Linux-ha-cvs Digest, Vol 29, Issue 113

Reply via email to