CLM and MDS callbacks are delivered to the main thread via different paths.
If a node is restarted quickly, the CLM JOIN is sometimes processed before
the prior MDS down. The node is then unable to join the cluster because it
is no longer in node_id_db (it was deleted during MDS down processing).

This patch ensures that addition to, and removal from, node_id_db are only
done from CLM callbacks, to avoid race conditions such as the one above.
---
 src/amf/amfd/clm.cc    | 10 ++++++++--
 src/amf/amfd/ndfsm.cc  |  1 +
 src/amf/amfd/ndproc.cc |  2 +-
 src/amf/amfd/node.cc   |  1 +
 src/amf/amfd/node.h    |  1 +
 5 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc
index da951d223..b2133b57e 100644
--- a/src/amf/amfd/clm.cc
+++ b/src/amf/amfd/clm.cc
@@ -203,6 +203,7 @@ static void clm_node_exit_complete(SaClmNodeIdT nodeId) {
   }
 
   avd_node_failover(node);
+  avd_node_delete_nodeid(node);
   m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVD_NODE_CONFIG);
   node->clm_change_start_preceded = false;
 
@@ -246,7 +247,7 @@ static void clm_track_cb(
       case SA_CLM_CHANGE_VALIDATE:
         if (notifItem->clusterChange == SA_CLM_NODE_LEFT) {
           node = avd_node_find_nodeid(notifItem->clusterNode.nodeId);
-          if (node == nullptr) {
+          if (node == nullptr || node->node_up == false) {
             LOG_IN("%s: CLM node '%s' is not an AMF cluster member",
                    __FUNCTION__, node_name.c_str());
             goto done;
@@ -262,7 +263,7 @@ static void clm_track_cb(
 
       case SA_CLM_CHANGE_START:
         node = avd_node_find_nodeid(notifItem->clusterNode.nodeId);
-        if (node == nullptr) {
+        if (node == nullptr || node->node_up == false) {
           LOG_IN("%s: CLM node '%s' is not an AMF cluster member", __FUNCTION__,
                  node_name.c_str());
           goto done;
@@ -293,6 +294,11 @@ static void clm_track_cb(
             LOG_IN("%s: CLM node '%s' is not an AMF cluster member",
                    __FUNCTION__, node_name.c_str());
             goto done;
+          } else if (node->node_up == false) {
+            LOG_IN("%s: CLM node '%s' is not an AMF cluster member; MDS down received",
+                   __FUNCTION__, node_name.c_str());
+            avd_node_delete_nodeid(node);
+            goto done;
           }
           TRACE(" Node Left: rootCauseEntity %s for node %u",
                 osaf_extended_name_borrow(rootCauseEntity),
diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc
index ca2e3f698..223f57f20 100644
--- a/src/amf/amfd/ndfsm.cc
+++ b/src/amf/amfd/ndfsm.cc
@@ -247,6 +247,7 @@ void record_node_up_msg_info(AVD_AVND *avnd, const AVD_DND_MSG *n2d_msg) {
   osafassert(avnd != nullptr);
 
   avnd->adest = n2d_msg->msg_info.n2d_node_up.adest_address;
+  avnd->node_up = true;
 
   if (n2d_msg->msg_info.n2d_node_up.msg_id >= avnd->rcv_msg_id) {
     LOG_NO("Received node_up from %x: msg_id %u",
diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc
index e80a0b3b8..2edb9b16e 100644
--- a/src/amf/amfd/ndproc.cc
+++ b/src/amf/amfd/ndproc.cc
@@ -1221,6 +1221,6 @@ void avd_node_failover(AVD_AVND *node) {
   avd_pg_node_csi_del_all(avd_cb, node);
   avd_node_down_mw_susi_failover(avd_cb, node);
   avd_node_down_appl_susi_failover(avd_cb, node);
-  avd_node_delete_nodeid(node);
+  node->node_up = false; // postpone deletion from node_id_db
   TRACE_LEAVE();
 }
diff --git a/src/amf/amfd/node.cc b/src/amf/amfd/node.cc
index 37f6ee389..8390515b4 100644
--- a/src/amf/amfd/node.cc
+++ b/src/amf/amfd/node.cc
@@ -120,6 +120,7 @@ void AVD_AVND::initialize() {
   clm_change_start_preceded = {};
   recvr_fail_sw = {};
   admin_ng = {};
+  node_up = false;
 }
 
 //
diff --git a/src/amf/amfd/node.h b/src/amf/amfd/node.h
index e64bf8c93..4cee956cc 100644
--- a/src/amf/amfd/node.h
+++ b/src/amf/amfd/node.h
@@ -148,6 +148,7 @@ class AVD_AVND {
   bool is_campaign_set_for_all_sus() const;
   // Member functions.
   void node_sus_termstate_set(bool term_state) const;
+  bool node_up; // true if MDS is up, false if MDS is down
 
  private:
   void initialize();
-- 
2.11.0


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to