There is a case where, after AMFD sends a reboot order due to “out of sync window”,
AMFD receives the CLM track callback while the node is not yet an AMF member, and deletes the node.
Later, the AMFND MDS down event does nothing since the node cannot be found.
When the node reboots, AMFD continues to use the old msg_id counter when sending to AMFND,
causing a message ID mismatch in AMFND, and AMFND then orders a reboot of its own node.

Also, if AMFND has already synced its info to the active AMFD after headless,
then node failover actions need to be considered for this AMFND down event.

Add a synced_headless flag to the node and set it to true when SUSIs are recreated.
Then, in the AMFND down handler, if the node is not found by node ID, search for the node_id in node_name_db.
If found, decide whether node failover is needed based on the synced_headless flag.
---
 src/amf/amfd/ndfsm.cc | 21 ++++++++++++++++++++-
 src/amf/amfd/node.cc  |  1 +
 src/amf/amfd/node.h   |  1 +
 src/amf/amfd/siass.cc |  1 +
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc
index 9d54df13d..6323d3a73 100644
--- a/src/amf/amfd/ndfsm.cc
+++ b/src/amf/amfd/ndfsm.cc
@@ -767,6 +767,7 @@ void avd_mds_avnd_up_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
  **************************************************************************/
 
 void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
+  bool node_failover = true;
   AVD_AVND *node = avd_node_find_nodeid(evt->info.node_id);
 
   TRACE_ENTER2("%x, %p", evt->info.node_id, node);
@@ -775,6 +776,20 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
   nds_mds_ver_db.erase(evt->info.node_id);
   amfnd_svc_db->erase(evt->info.node_id);
 
+  if (node == nullptr) {
+    for (const auto &value : *node_name_db) {
+      AVD_AVND *avnd = value.second;
+      if (avnd->node_info.nodeId == evt->info.node_id) {
+        node_failover = false;
+        node = avnd;
+        if (node->synced_headless) {
+          node_failover = true;
+        }
+        break;
+      }
+    }
+  }
+
   if (node != nullptr) {
     // Do nothing if the local node goes down. Most likely due to system
     // shutdown. If node director goes down due to a bug, the AMF watchdog will
@@ -784,7 +799,9 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
     }
 
     if (avd_cb->avail_state_avd == SA_AMF_HA_ACTIVE) {
-      avd_node_failover(node);
+      if (node_failover) {
+        avd_node_failover(node);
+      }
       // Update standby out of sync if standby sc goes down
       if (avd_cb->node_id_avd_other == node->node_info.nodeId) {
         cb->stby_sync_state = AVD_STBY_OUT_OF_SYNC;
@@ -802,6 +819,7 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
       node->recvr_fail_sw = false;
       node->node_info.initialViewNumber = 0;
       node->node_info.member = SA_FALSE;
+      node->synced_headless = false;
     }
   }
 
@@ -1122,6 +1140,7 @@ void avd_node_mark_absent(AVD_AVND *node) {
 
   node->node_info.initialViewNumber = 0;
   node->node_info.member = SA_FALSE;
+  node->synced_headless = false;
 
   /* Increment node failfast counter */
   avd_cb->nodes_exit_cnt++;
diff --git a/src/amf/amfd/node.cc b/src/amf/amfd/node.cc
index 0ffcfb782..f421e68de 100644
--- a/src/amf/amfd/node.cc
+++ b/src/amf/amfd/node.cc
@@ -94,6 +94,7 @@ void AVD_AVND::initialize() {
   node_name = {};
   node_info = {};
   node_info.member = SA_FALSE;
+  synced_headless = false;
   adest = {};
   saAmfNodeClmNode = {};
   saAmfNodeCapacity = {};
diff --git a/src/amf/amfd/node.h b/src/amf/amfd/node.h
index e64bf8c93..02b15bca8 100644
--- a/src/amf/amfd/node.h
+++ b/src/amf/amfd/node.h
@@ -145,6 +145,7 @@ class AVD_AVND {
   uint16_t node_up_msg_count;    /* to count of node_up msg that director had
                                     received from this node */
   bool reboot;
+  bool synced_headless;
   bool is_campaign_set_for_all_sus() const;
   // Member functions.
   void node_sus_termstate_set(bool term_state) const;
diff --git a/src/amf/amfd/siass.cc b/src/amf/amfd/siass.cc
index 267c55c07..f23c5510e 100644
--- a/src/amf/amfd/siass.cc
+++ b/src/amf/amfd/siass.cc
@@ -1136,6 +1136,7 @@ SaAisErrorT avd_susi_recreate(AVSV_N2D_ND_SISU_STATE_MSG_INFO *info) {
     return SA_AIS_ERR_NOT_EXIST;
   }
 
+  node->synced_headless = true;
   for (su_state = info->su_list; su_state != nullptr;
        su_state = su_state->next) {
     AVD_SU *su = su_db->find(Amf::to_string(&su_state->safSU));
-- 
2.18.0


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to