cman members are queried in response to a callback,
and members sometimes leave and rejoin between queries
(e.g. when they leave and rejoin before corosync
detects that they left).

This means that simply checking if a node is a member
in consecutive queries sometimes misses events.  We
need to compare the incarnation numbers of members
from consecutive queries to avoid this.

bz 663397

Signed-off-by: David Teigland <teigl...@redhat.com>
---
 group/gfs_controld/member_cman.c |   51 +++++++++++++++++++++++++++++++++++---
 1 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/group/gfs_controld/member_cman.c b/group/gfs_controld/member_cman.c
index 277a11e..7f354b1 100644
--- a/group/gfs_controld/member_cman.c
+++ b/group/gfs_controld/member_cman.c
@@ -9,6 +9,7 @@ static cman_node_t      old_nodes[MAX_NODES];
 static int              old_node_count;
 static cman_node_t      cman_nodes[MAX_NODES];
 static int              cman_node_count;
+static uint32_t         cluster_ringid_seq;
 
 void kick_node_from_cluster(int nodeid)
 {
@@ -22,6 +23,17 @@ void kick_node_from_cluster(int nodeid)
        }
 }
 
+static cman_node_t *get_node(cman_node_t *node_list, int count, int nodeid)
+{
+       int i;
+
+       for (i = 0; i < count; i++) {
+               if (node_list[i].cn_nodeid == nodeid)
+                       return &node_list[i];
+       }
+       return NULL;
+}
+
 static int is_member(cman_node_t *node_list, int count, int nodeid)
 {
        int i;
@@ -45,8 +57,18 @@ static int is_cluster_member(int nodeid)
 
 static void statechange(void)
 {
+       cman_cluster_t info;
+       cman_node_t *old;
        int i, rv;
 
+       rv = cman_get_cluster(ch, &info);
+       if (rv < 0) {
+               log_error("cman_get_cluster error %d %d", rv, errno);
+               /* keep going, this is just informational */
+               memset(&info, 0, sizeof(info));
+       }
+       cluster_ringid_seq = info.ci_generation;
+
        old_node_count = cman_node_count;
        memcpy(&old_nodes, &cman_nodes, sizeof(old_nodes));
 
@@ -70,8 +92,8 @@ static void statechange(void)
                if (old_nodes[i].cn_member &&
                    !is_cluster_member(old_nodes[i].cn_nodeid)) {
 
-                       log_debug("cluster node %d removed",
-                                 old_nodes[i].cn_nodeid);
+                       log_debug("cluster node %d removed seq %u",
+                                 old_nodes[i].cn_nodeid, cluster_ringid_seq);
 
                        node_history_cluster_remove(old_nodes[i].cn_nodeid);
                }
@@ -81,10 +103,31 @@ static void statechange(void)
                if (cman_nodes[i].cn_member &&
                    !is_old_member(cman_nodes[i].cn_nodeid)) {
 
-                       log_debug("cluster node %d added",
-                                 cman_nodes[i].cn_nodeid);
+                       log_debug("cluster node %d added seq %u",
+                                 cman_nodes[i].cn_nodeid, cluster_ringid_seq);
 
                        node_history_cluster_add(cman_nodes[i].cn_nodeid);
+               } else {
+                       /* look for any nodes that were members of both
+                        * old and new but have a new incarnation number
+                        * from old to new, indicating they left and rejoined
+                        * in between */
+
+                       old = get_node(old_nodes, old_node_count, cman_nodes[i].cn_nodeid);
+
+                       if (!old)
+                               continue;
+                       if (cman_nodes[i].cn_incarnation == old->cn_incarnation)
+                               continue;
+
+                       log_debug("cluster node %d removed and added seq %u "
+                                 "old %u new %u",
+                                  cman_nodes[i].cn_nodeid, cluster_ringid_seq,
+                                  old->cn_incarnation,
+                                  cman_nodes[i].cn_incarnation);
+
+                       node_history_cluster_remove(cman_nodes[i].cn_nodeid);
+                       node_history_cluster_add(cman_nodes[i].cn_nodeid);
                }
        }
 }
-- 
1.7.6

Reply via email to