OpenSAF has traditionally depended on reliable, redundant links between the
nodes in a cluster. This can no longer be assumed in virtualised environments.

To avoid duplicate assignments, node failover therefore needs to be delayed
in environments where temporary network partitioning is expected.

When delayed node failover is enabled, AMF will not perform a node
failover until the node has been fenced (if remote fencing is available),
or until the period specified by osafAmfDelayNodeFailoverTimeout has elapsed.
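
Both attributes live on the AMF configuration object and can be set at
runtime with immcfg. A minimal sketch only; the values and the object DN
below are assumptions to adjust per deployment, and
osafAmfDelayNodeFailoverNodeUpWait is described further down:

  immcfg -a osafAmfDelayNodeFailoverTimeout=30 \
         -a osafAmfDelayNodeFailoverNodeUpWait=10 \
         amfConfig=1,safApp=safAmfService

Setting osafAmfDelayNodeFailoverTimeout to 0 (or leaving it unset) keeps
the current behaviour and fails over the node immediately.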

If MDS connectivity is re-established while waiting, AMF will wait up to
osafAmfDelayNodeFailoverNodeUpWait seconds for a node up message
(with leds_set == false) indicating that the node has already been
rebooted, and then finish the node failover.

Otherwise, AMF will send a message to the node asking it to reboot
itself. When AMF sees that MDS connectivity is lost again, or after
osafAmfDelayNodeFailoverNodeUpWait seconds, it considers the fencing
complete and finishes the node failover.
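
Internally, each lost node is tracked by a small state machine (see the new
node_state.cc / node_state_machine.cc below). A rough, non-exhaustive summary
of the transitions on the active controller (the standby follows the active
via checkpointing and only restarts its local timers):

  Start:         MDS down      -> Lost, or fence + failover + End when
                                  remote fencing is enabled
  Lost:          timer1 expiry -> failover, Failed
  Lost:          MDS up        -> LostFound
  LostFound:     node up       -> failover, End (leds_set == false)
  LostFound:     timer2 expiry -> send reboot order, LostRebooting
  LostRebooting: MDS down or
                 timer2 expiry -> failover, End
  Failed:        MDS up or
                 node up       -> FailedFound
  FailedFound:   node up       -> End
  FailedFound:   timer2 expiry -> send reboot order, End

where timer1 is osafAmfDelayNodeFailoverTimeout and timer2 is
osafAmfDelayNodeFailoverNodeUpWait (see cb.h).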
---
 src/amf/Makefile.am                |   6 +
 src/amf/amfd/cb.h                  |  24 +-
 src/amf/amfd/clm.cc                |  12 +-
 src/amf/amfd/cluster.cc            |  18 ++
 src/amf/amfd/cluster.h             |   1 +
 src/amf/amfd/config.cc             |  35 ++-
 src/amf/amfd/evt.h                 |   1 +
 src/amf/amfd/main.cc               |  13 +-
 src/amf/amfd/ndfsm.cc              |  70 +++++-
 src/amf/amfd/ndproc.cc             |  14 +-
 src/amf/amfd/node.cc               |   2 +
 src/amf/amfd/node_state.cc         | 338 +++++++++++++++++++++++++++++
 src/amf/amfd/node_state.h          | 101 +++++++++
 src/amf/amfd/node_state_machine.cc |  98 +++++++++
 src/amf/amfd/node_state_machine.h  |  39 ++++
 src/amf/amfd/proc.h                |   2 +-
 src/amf/amfd/role.cc               |   9 +-
 src/amf/amfd/timer.cc              |   6 +-
 src/amf/amfd/timer.h               |   1 +
 19 files changed, 761 insertions(+), 29 deletions(-)
 create mode 100644 src/amf/amfd/node_state.cc
 create mode 100644 src/amf/amfd/node_state.h
 create mode 100644 src/amf/amfd/node_state_machine.cc
 create mode 100644 src/amf/amfd/node_state_machine.h

diff --git a/src/amf/Makefile.am b/src/amf/Makefile.am
index 413571a52..8544effd4 100644
--- a/src/amf/Makefile.am
+++ b/src/amf/Makefile.am
@@ -107,6 +107,8 @@ noinst_HEADERS += \
        src/amf/amfd/mds.h \
        src/amf/amfd/msg.h \
        src/amf/amfd/node.h \
+       src/amf/amfd/node_state.h \
+       src/amf/amfd/node_state_machine.h \
        src/amf/amfd/ntf.h \
        src/amf/amfd/pg.h \
        src/amf/amfd/proc.h \
@@ -225,6 +227,8 @@ bin_testamfd_LDFLAGS = \
        src/amf/amfd/bin_osafamfd-ndmsg.o \
        src/amf/amfd/bin_osafamfd-ndproc.o \
        src/amf/amfd/bin_osafamfd-node.o \
+       src/amf/amfd/bin_osafamfd-node_state.o \
+       src/amf/amfd/bin_osafamfd-node_state_machine.o \
        src/amf/amfd/bin_osafamfd-nodegroup.o \
        src/amf/amfd/bin_osafamfd-nodeswbundle.o \
        src/amf/amfd/bin_osafamfd-ntf.o \
@@ -327,6 +331,8 @@ bin_osafamfd_SOURCES = \
        src/amf/amfd/ndmsg.cc \
        src/amf/amfd/ndproc.cc \
        src/amf/amfd/node.cc \
+       src/amf/amfd/node_state.cc \
+       src/amf/amfd/node_state_machine.cc \
        src/amf/amfd/nodegroup.cc \
        src/amf/amfd/nodeswbundle.cc \
        src/amf/amfd/ntf.cc \
diff --git a/src/amf/amfd/cb.h b/src/amf/amfd/cb.h
index 3b7e6d13f..d3d88c1ed 100644
--- a/src/amf/amfd/cb.h
+++ b/src/amf/amfd/cb.h
@@ -37,18 +37,21 @@
 #include <saImmOi.h>
 #include <saClm.h>
 
+#include <atomic>
+#include <list>
+#include <map>
+#include <memory>
+#include <queue>
+
 #include "base/ncssysf_lck.h"
 #include "mds/mds_papi.h"
 #include "mbc/mbcsv_papi.h"
 #include "base/ncs_edu_pub.h"
 
 #include "amf/amfd/ckpt.h"
+#include "amf/amfd/node_state_machine.h"
 #include "amf/amfd/timer.h"
 
-#include <list>
-#include <queue>
-#include <atomic>
-
 class AVD_SI;
 class AVD_AVND;
 
@@ -248,6 +251,19 @@ typedef struct cl_cb_tag {
   /* The duration that amfd should tolerate the absence of SCs */
   uint32_t scs_absence_max_duration;
   AVD_IMM_INIT_STATUS avd_imm_status;
+
+  // MDS_DOWN received for node, we are delaying node failover by this
+  // number of seconds (timer1)
+  SaTimeT node_failover_delay;
+
+  // after receiving MDS_UP, we will wait for NODE_UP up to this number
+  // of seconds (timer2)
+  SaTimeT node_failover_nodeup_wait;
+
+  using FailedNodeMap = std::map<SaClmNodeIdT, std::shared_ptr<NodeStateMachine>>;
+  // We received amfnd down for these nodes
+  FailedNodeMap failover_list;
+
 } AVD_CL_CB;
 
 extern AVD_CL_CB *avd_cb;
diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc
index 1e67ff389..aeae93931 100644
--- a/src/amf/amfd/clm.cc
+++ b/src/amf/amfd/clm.cc
@@ -202,8 +202,11 @@ static void clm_node_exit_complete(SaClmNodeIdT nodeId) {
     goto done;
   }
 
-  avd_node_failover(node);
-  avd_node_delete_nodeid(node);
+  if (avd_cb->failover_list.count(node->node_info.nodeId) == 0 &&
+    avd_cb->node_failover_delay == 0) {
+    avd_node_failover(node);
+    avd_node_delete_nodeid(node);
+  }
   m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVD_NODE_CONFIG);
   node->clm_change_start_preceded = false;
 
@@ -318,7 +321,10 @@ static void clm_track_cb(
           } else if (node->node_state == AVD_AVND_STATE_ABSENT) {
             LOG_IN("%s: CLM node '%s' is not an AMF cluster member; MDS down received",
                    __FUNCTION__, node_name.c_str());
-            avd_node_delete_nodeid(node);
+            if (avd_cb->failover_list.count(node->node_info.nodeId) == 0 &&
+              avd_cb->node_failover_delay == 0) {
+              avd_node_delete_nodeid(node);
+            }
             goto done;
           }
           TRACE(" Node Left: rootCauseEntity %s for node %u",
diff --git a/src/amf/amfd/cluster.cc b/src/amf/amfd/cluster.cc
index 07d9b5a33..456b9c03e 100644
--- a/src/amf/amfd/cluster.cc
+++ b/src/amf/amfd/cluster.cc
@@ -155,6 +155,24 @@ void avd_node_sync_tmr_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
   TRACE_LEAVE();
 }
 
+void avd_node_failover_tmr_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
+  TRACE_ENTER();
+
+  osafassert(evt->info.tmr.is_active == false);
+  osafassert(evt->info.tmr.type == AVD_TMR_NODE_FAILOVER);
+
+  const SaClmNodeIdT node_id = evt->info.tmr.node_id;
+
+  LOG_NO("Node failover timeout");
+
+  if (cb->failover_list.count(node_id) > 0) {
+    std::shared_ptr<NodeStateMachine> failed_node = cb->failover_list.at(node_id);
+    failed_node->TimerExpired();
+  } else {
+    LOG_WA("Node '%x' is not in failover_list", node_id);
+  }
+}
+
 static void ccb_apply_modify_hdlr(struct CcbUtilOperationData *opdata) {
   const SaImmAttrModificationT_2 *attr_mod;
   int i = 0;
diff --git a/src/amf/amfd/cluster.h b/src/amf/amfd/cluster.h
index 88dd899b3..aaa6fc41c 100644
--- a/src/amf/amfd/cluster.h
+++ b/src/amf/amfd/cluster.h
@@ -39,6 +39,7 @@ extern AVD_CLUSTER *avd_cluster;
 extern SaAisErrorT avd_cluster_config_get(void);
 extern void avd_cluster_tmr_init_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
 extern void avd_node_sync_tmr_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
+extern void avd_node_failover_tmr_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
 extern void avd_cluster_constructor(void);
 
 #endif  // AMF_AMFD_CLUSTER_H_
diff --git a/src/amf/amfd/config.cc b/src/amf/amfd/config.cc
index 921d20413..872a9d28a 100644
--- a/src/amf/amfd/config.cc
+++ b/src/amf/amfd/config.cc
@@ -41,6 +41,26 @@ static void ccb_apply_modify_hdlr(struct CcbUtilOperationData *opdata) {
       }
       TRACE("osafAmfRestrictAutoRepairEnable changed to '%d'", enabled);
       configuration->restrict_auto_repair(enabled);
+    } else if (!strcmp(attr_mod->modAttr.attrName,
+                "osafAmfDelayNodeFailoverTimeout")) {
+      uint32_t delay = 0;  // default to 0 if attribute is blank
+      if (attr_mod->modType != SA_IMM_ATTR_VALUES_DELETE &&
+          attr_mod->modAttr.attrValues != nullptr) {
+        delay = (*((SaUint32T *)attr_mod->modAttr.attrValues[0]));
+      }
+      avd_cb->node_failover_delay = delay;
+      TRACE("osafAmfDelayNodeFailoverTimeout changed to '%llu'",
+             avd_cb->node_failover_delay);
+    } else if (!strcmp(attr_mod->modAttr.attrName,
+                "osafAmfDelayNodeFailoverNodeUpWait")) {
+      uint32_t delay = 0;  // default to 0 if attribute is blank
+      if (attr_mod->modType != SA_IMM_ATTR_VALUES_DELETE &&
+          attr_mod->modAttr.attrValues != nullptr) {
+        delay = (*((SaUint32T *)attr_mod->modAttr.attrValues[0]));
+      }
+      avd_cb->node_failover_nodeup_wait = delay;
+      TRACE("osafAmfDelayNodeFailoverNodeUpWait changed to '%llu'",
+             avd_cb->node_failover_nodeup_wait);
     }
   }
   TRACE_LEAVE();
@@ -151,11 +171,24 @@ SaAisErrorT Configuration::get_config(void) {
                         &value) == SA_AIS_OK) {
       configuration->restrict_auto_repair(static_cast<bool>(value));
     }
+    if (immutil_getAttr("osafAmfDelayNodeFailoverTimeout", attributes, 0,
+                        &value) == SA_AIS_OK) {
+      avd_cb->node_failover_delay = value;
+    }
+    if (immutil_getAttr("osafAmfDelayNodeFailoverNodeUpWait", attributes, 0,
+                        &value) == SA_AIS_OK) {
+      avd_cb->node_failover_nodeup_wait = value;
+    }
   }
 
   error = SA_AIS_OK;
-  TRACE("osafAmfRestrictAutoRepairEnable set to '%d'",
+
+  LOG_NO("osafAmfRestrictAutoRepairEnable set to '%d'",
         restrict_auto_repair_enabled());
+  LOG_NO("osafAmfDelayNodeFailoverTimeout set to '%llu'",
+        avd_cb->node_failover_delay);
+  LOG_NO("osafAmfDelayNodeFailoverNodeUpWait set to '%llu'",
+        avd_cb->node_failover_nodeup_wait);
 
   (void)immutil_saImmOmSearchFinalize(searchHandle);
 done1:
diff --git a/src/amf/amfd/evt.h b/src/amf/amfd/evt.h
index 104ceb640..a9028cde3 100644
--- a/src/amf/amfd/evt.h
+++ b/src/amf/amfd/evt.h
@@ -58,6 +58,7 @@ typedef enum avd_evt_type {
   AVD_EVT_TMR_CL_INIT,
   AVD_EVT_TMR_SI_DEP_TOL,
   AVD_EVT_TMR_NODE_SYNC,
+  AVD_EVT_TMR_NODE_FAILOVER,
   AVD_EVT_TMR_MAX,
   AVD_EVT_MDS_AVD_UP = AVD_EVT_TMR_MAX,
   AVD_EVT_MDS_AVD_DOWN,
diff --git a/src/amf/amfd/main.cc b/src/amf/amfd/main.cc
index d42fd4ea9..b8f5c224d 100644
--- a/src/amf/amfd/main.cc
+++ b/src/amf/amfd/main.cc
@@ -113,6 +113,7 @@ static const AVD_EVT_HDLR g_actv_list[AVD_EVT_MAX] = {
     avd_cluster_tmr_init_evh, /* AVD_EVT_TMR_CL_INIT */
     avd_sidep_tol_tmr_evh,    /* AVD_EVT_TMR_SI_DEP_TOL */
     avd_node_sync_tmr_evh,    /* AVD_EVT_TMR_ALL_NODE_UP */
+    avd_node_failover_tmr_evh,/* AVD_TMR_NODE_FAILOVER */
 
     /* active AvD MDS events processing */
     avd_mds_avd_up_evh,    /* AVD_EVT_MDS_AVD_UP */
@@ -156,6 +157,7 @@ static const AVD_EVT_HDLR g_stndby_list[AVD_EVT_MAX] = {
     standby_invalid_evh,   /* AVD_EVT_TMR_CL_INIT */
     avd_sidep_tol_tmr_evh, /* AVD_EVT_TMR_SI_DEP_TOL */
     standby_invalid_evh,   /* AVD_EVT_TMR_ALL_NODE_UP */
+    avd_node_failover_tmr_evh,/* AVD_TMR_NODE_FAILOVER */
 
     /* standby AvD MDS events processing */
     avd_mds_avd_up_evh,       /* AVD_EVT_MDS_AVD_UP */
@@ -200,6 +202,7 @@ static const AVD_EVT_HDLR g_quiesc_list[AVD_EVT_MAX] = {
     qsd_ignore_evh,        /* AVD_EVT_TMR_CL_INIT */
     avd_sidep_tol_tmr_evh, /* AVD_EVT_TMR_SI_DEP_TOL */
     qsd_ignore_evh,        /* AVD_EVT_TMR_ALL_NODE_UP */
+    avd_node_failover_tmr_evh,/* AVD_TMR_NODE_FAILOVER */
 
     /* active AvD MDS events processing */
     avd_mds_avd_up_evh,    /* AVD_EVT_MDS_AVD_UP */
@@ -381,7 +384,8 @@ static void handle_event_in_failover_state(AVD_EVT *evt) {
    */
   if ((evt->rcv_evt == AVD_EVT_VERIFY_ACK_NACK_MSG) ||
       (evt->rcv_evt == AVD_EVT_MDS_AVND_DOWN) ||
-      (evt->rcv_evt == AVD_EVT_TMR_SND_HB)) {
+      (evt->rcv_evt == AVD_EVT_TMR_SND_HB) ||
+      (evt->rcv_evt == AVD_EVT_TMR_NODE_FAILOVER)) {
     process_event(cb, evt);
   } else {
     AVD_EVT_QUEUE *queue_evt;
@@ -417,7 +421,8 @@ static void handle_event_in_failover_state(AVD_EVT *evt) {
          it != node_id_db->end();) {
       AVD_AVND *node = it->second;
       ++it;
-      if (AVD_AVND_STATE_ABSENT == node->node_state) {
+      if (AVD_AVND_STATE_ABSENT == node->node_state &&
+          cb->failover_list.find(node->node_info.nodeId) == cb->failover_list.end()) {
         bool fover_done = false;
         /* Check whether this node failover has been
            performed or not. */
@@ -429,7 +434,9 @@ static void handle_event_in_failover_state(AVD_EVT *evt) {
             break;
           }
         }
-        if (fover_done == false) avd_node_failover(node);
+        if (fover_done == false) {
+          avd_node_failover(node);
+        }
       }
     }
     /* Since we are sending lots of async update to its peer from
diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc
index edc993988..c460d8fd4 100644
--- a/src/amf/amfd/ndfsm.cc
+++ b/src/amf/amfd/ndfsm.cc
@@ -32,6 +32,7 @@
 #include "amf/amfd/cluster.h"
 #include "base/daemon.h"
 #include <algorithm>
+#include <memory>
 
 AmfDb<uint32_t, AVD_FAIL_OVER_NODE> *node_list_db = 0; /* SaClmNodeIdT index */
 
@@ -290,6 +291,7 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
   uint32_t rc = NCSCC_RC_SUCCESS;
   uint32_t sync_nd_size = avd_count_sync_node_size(cb);
   bool act_nd;
+  AVD_CL_CB::FailedNodeMap::iterator failed_node;
 
   TRACE_ENTER2(
       "from %x, %s", n2d_msg->msg_info.n2d_node_up.node_id,
@@ -303,6 +305,32 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
     goto done;
   }
 
+  /* Cannot use avd_msg_sanity_chk here since this is a special case */
+  if ((avnd = avd_node_find_nodeid(n2d_msg->msg_info.n2d_node_up.node_id)) ==
+      nullptr) {
+    TRACE("invalid node ID (%x)", n2d_msg->msg_info.n2d_node_up.node_id);
+    goto done;
+  }
+
+  TRACE("leds_set %d", n2d_msg->msg_info.n2d_node_up.leds_set);
+
+  failed_node = cb->failover_list.find(
+                  n2d_msg->msg_info.n2d_node_up.node_id);
+  if (failed_node != cb->failover_list.end()) {
+    if (n2d_msg->msg_info.n2d_node_up.leds_set == false) {
+      // if set_leds is false indicating the node has rebooted and not
+      // in headless sync state
+      failed_node->second->NodeUp();
+    } else {
+      // split network partition probably occurred, failover and reboot node
+      failed_node->second->TimerExpired();
+      goto done;
+    }
+  } else {
+    TRACE("node_id '%x' not in failover_list.",
+           n2d_msg->msg_info.n2d_node_up.node_id);
+  }
+
   act_nd = n2d_msg->msg_info.n2d_node_up.node_id == cb->node_id_avd;
   if (cb->scs_absence_max_duration > 0 && cb->all_nodes_synced == false &&
       cb->node_sync_window_closed == false) {
@@ -357,13 +385,6 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
     }
   }
 
-  /* Cannot use avd_msg_sanity_chk here since this is a special case */
-  if ((avnd = avd_node_find_nodeid(n2d_msg->msg_info.n2d_node_up.node_id)) ==
-      nullptr) {
-    TRACE("invalid node ID (%x)", n2d_msg->msg_info.n2d_node_up.node_id);
-    goto done;
-  }
-
   /* Retrieve the information from the message */
   record_node_up_msg_info(avnd, n2d_msg);
 
@@ -741,10 +762,18 @@ void avd_nd_ncs_su_failed(AVD_CL_CB *cb, AVD_AVND *avnd) {
  **************************************************************************/
 
 void avd_mds_avnd_up_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
+  TRACE_ENTER();
+
   if (evt->info.node_id == cb->node_id_avd) {
     TRACE("Local node director is up, start sending heart beats to %" PRIx64,
           cb->local_avnd_adest);
     avd_tmr_snd_hb_evh(cb, evt);
+  } else {
+    auto search = cb->failover_list.find(evt->info.node_id);
+    if (search != cb->failover_list.end()) {
+      std::shared_ptr<NodeStateMachine> failed_node = search->second;
+      failed_node->MdsUp();
+    }
   }
 
   TRACE("amfnd on %x is up", evt->info.node_id);
@@ -793,13 +822,29 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
       daemon_exit();
     }
 
+    if (cb->failover_list.find(evt->info.node_id) != cb->failover_list.end()) {
+      std::shared_ptr<NodeStateMachine> failed_node =
+        cb->failover_list.at(evt->info.node_id);
+      failed_node->MdsDown();
+    } else if (cb->node_failover_delay > 0) {
+      LOG_NO("Node '%s' is down. Start failover delay timer",
+              node->node_name.c_str());
+
+      auto missing_node = std::make_shared<NodeStateMachine>(cb, evt->info.node_id);
+      cb->failover_list[evt->info.node_id] = missing_node;
+      missing_node->MdsDown();
+    }
+
     if (avd_cb->avail_state_avd == SA_AMF_HA_ACTIVE) {
-      avd_node_failover(node);
+      if (cb->node_failover_delay == 0) {
+        avd_node_failover(node);
+      }
+      node->node_info.member = SA_FALSE;
       // Update standby out of sync if standby sc goes down
       if (avd_cb->node_id_avd_other == node->node_info.nodeId) {
         cb->stby_sync_state = AVD_STBY_OUT_OF_SYNC;
       }
-    } else {
+    } else if (cb->node_failover_delay == 0) {
       /* Remove dynamic info for node but keep in nodeid tree.
        * Possibly used at the end of controller failover to
        * to failover payload nodes.
@@ -871,6 +916,12 @@ void avd_fail_over_event(AVD_CL_CB *cb) {
     if ((AVD_AVND_STATE_PRESENT == avnd->node_state) ||
         (AVD_AVND_STATE_NO_CONFIG == avnd->node_state) ||
         (AVD_AVND_STATE_NCS_INIT == avnd->node_state)) {
+      // this node is in delayed failover state, do not send data verify to it
+      if (cb->failover_list.find(avnd->node_info.nodeId) !=
+            cb->failover_list.end()) {
+        continue;
+      }
+
       /*
        * Send verify message to this node.
        */
@@ -1129,7 +1180,6 @@ void avd_node_mark_absent(AVD_AVND *node) {
   node->recvr_fail_sw = false;
 
   node->node_info.initialViewNumber = 0;
-  node->node_info.member = SA_FALSE;
 
   /* Increment node failfast counter */
   avd_cb->nodes_exit_cnt++;
diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc
index 31d2263d2..853a68b6e 100644
--- a/src/amf/amfd/ndproc.cc
+++ b/src/amf/amfd/ndproc.cc
@@ -1231,12 +1231,16 @@ done:
  * services to other nodes.
  * @param node
  */
-void avd_node_failover(AVD_AVND *node) {
+void avd_node_failover(AVD_AVND *node, const bool mw_only) {
   TRACE_ENTER2("'%s'", node->name.c_str());
-  avd_node_mark_absent(node);
-  avd_pg_node_csi_del_all(avd_cb, node);
-  avd_node_down_mw_susi_failover(avd_cb, node);
-  avd_node_down_appl_susi_failover(avd_cb, node);
+  if (mw_only == true) {
+    avd_node_down_mw_susi_failover(avd_cb, node);
+  } else {
+    avd_node_mark_absent(node);
+    avd_pg_node_csi_del_all(avd_cb, node);
+    avd_node_down_mw_susi_failover(avd_cb, node);
+    avd_node_down_appl_susi_failover(avd_cb, node);
+  }
 
   Consensus consensus_service;
   if (consensus_service.IsRemoteFencingEnabled() == false &&
diff --git a/src/amf/amfd/node.cc b/src/amf/amfd/node.cc
index 0ffcfb782..201f1fc41 100644
--- a/src/amf/amfd/node.cc
+++ b/src/amf/amfd/node.cc
@@ -58,6 +58,7 @@ uint32_t avd_node_add_nodeid(AVD_AVND *node) {
 }
 
 void avd_node_delete_nodeid(AVD_AVND *node) {
+  TRACE_ENTER2("%s", node->node_name.c_str());
   node_id_db->erase(node->node_info.nodeId);
 }
 
@@ -793,6 +794,7 @@ static void node_ccb_apply_delete_hdlr(AVD_AVND *node) {
     return;
   }
   TRACE_ENTER2("'%s'", node->name.c_str());
+  avd_cb->failover_list.erase(node->node_info.nodeId);
   avd_node_delete_nodeid(node);
   avd_node_delete(node);
   TRACE_LEAVE();
diff --git a/src/amf/amfd/node_state.cc b/src/amf/amfd/node_state.cc
new file mode 100644
index 000000000..c4d65c915
--- /dev/null
+++ b/src/amf/amfd/node_state.cc
@@ -0,0 +1,338 @@
+#include "base/logtrace.h"
+#include "base/ncssysf_def.h"
+#include "osaf/consensus/consensus.h"
+#include "amf/amfd/node.h"
+#include "amf/amfd/node_state.h"
+#include "amf/amfd/node_state_machine.h"
+#include "amf/amfd/proc.h"
+
+NodeState::NodeState(NodeStateMachine *fsm) :
+  fsm_(fsm) {
+}
+
+// state 'Start'
+Start::Start(NodeStateMachine *fsm) :
+  NodeState(fsm) {
+  avd_stop_tmr(fsm_->cb_, fsm_->timer_.get());
+}
+
+void Start::TimerExpired() {
+  LOG_ER("unexpected timer event");
+}
+
+void Start::MdsUp() {
+  TRACE_ENTER();
+}
+
+void Start::MdsDown() {
+  TRACE_ENTER();
+
+  if (fsm_->Active() == true) {
+    Consensus consensus_service;
+    if (consensus_service.IsRemoteFencingEnabled() == true) {
+      // get CLM node name
+      AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+      std::string hostname = osaf_extended_name_borrow(
+        &node->node_info.nodeName);
+      size_t first = hostname.find_first_of("=") + 1;
+      size_t end = hostname.find_first_of(",");
+      hostname = hostname.substr(first, end - first);
+
+      // fence the lost node
+      opensaf_reboot(fsm_->node_id_, hostname.c_str(), "Fencing remote node");
+
+      // failover node
+      avd_node_failover(node);
+
+      fsm_->SetState(std::make_shared<End>(fsm_));
+      return;
+    }
+  }
+
+  // transition to 'Lost' state
+  fsm_->SetState(std::make_shared<Lost>(fsm_));
+}
+
+void Start::NodeUp() {
+  TRACE_ENTER();
+}
+
+// state 'Lost'
+Lost::Lost(NodeStateMachine *fsm) :
+  NodeState(fsm) {
+  avd_stop_tmr(fsm_->cb_, fsm_->timer_.get());
+  LOG_NO("Start timer for '%x'", fsm_->node_id_);
+  avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                fsm_->cb_->node_failover_delay * SA_TIME_ONE_SECOND);
+}
+
+void Lost::TimerExpired() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+  if (fsm_->Active() == true) {
+    AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+    osafassert(node != nullptr);
+
+    LOG_NO("Completing delayed node failover for '%s'",
+            node->node_name.c_str());
+    avd_node_failover(node);
+
+    // transition to 'Failed' state
+    fsm_->SetState(std::make_shared<Failed>(fsm_));
+  } else {
+    TRACE("Timer expired in 'Lost' state for '%x'  on standby. Restart timer",
+            fsm_->node_id_);
+
+    // wait for checkpoint to transition state
+    // meanwhile, restart timer in case a SC failover to this node occurs
+    avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                  fsm_->cb_->node_failover_delay * SA_TIME_ONE_SECOND);
+  }
+}
+
+void Lost::MdsUp() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  // transition to 'LostFound' state
+  fsm_->SetState(std::make_shared<LostFound>(fsm_));
+}
+
+void Lost::MdsDown() {
+  if (fsm_->Active() == true) {
+    LOG_ER("unexpected MDS down event");
+  }
+}
+
+void Lost::NodeUp() {
+  LOG_ER("unexpected node up event");
+}
+
+// state 'LostFound'
+LostFound::LostFound(NodeStateMachine *fsm) :
+  NodeState(fsm) {
+  avd_stop_tmr(fsm_->cb_, fsm_->timer_.get());
+  avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                fsm_->cb_->node_failover_nodeup_wait * SA_TIME_ONE_SECOND);
+}
+
+void LostFound::TimerExpired() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+  osafassert(node != nullptr);
+
+  // reboot node if it hasn't sent node_up within timer duration,
+  // meaning it hasn't rebooted since we lost contact
+  LOG_WA("Lost node '%s' has reappeared after network separation",
+          node->node_name.c_str());
+
+  if (fsm_->Active() == true) {
+    LOG_WA("Sending node reboot order");
+    avd_d2n_reboot_snd(node);
+
+    // transition to 'LostRebooting' state
+    fsm_->SetState(std::make_shared<LostRebooting>(fsm_));
+  } else {
+    TRACE("Timer expired in 'LostFound' state for '%x'  on standby. Restart 
timer",
+            fsm_->node_id_);
+
+    // wait for checkpoint to transition state
+    // meanwhile, restart timer in case a SC failover to this node occurs
+    avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                  fsm_->cb_->node_failover_nodeup_wait * SA_TIME_ONE_SECOND);
+  }
+}
+
+void LostFound::MdsUp() {
+  if (fsm_->Active() == true) {
+    LOG_ER("unexpected MDS up event");
+  }
+}
+
+void LostFound::MdsDown() {
+  LOG_WA("unexpected MDS down event");
+}
+
+void LostFound::NodeUp() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  if (fsm_->Active() == true) {
+    AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+    osafassert(node != nullptr);
+
+    // don't call avd_node_delete_nodeid as the node is already up
+    avd_node_failover(node);
+    node->node_info.member = SA_TRUE;
+
+    fsm_->SetState(std::make_shared<End>(fsm_));
+  } else {
+    // wait for checkpoint to transition state
+    // we are standby and shouldn't get node up
+    LOG_ER("unexpected node up event");
+  }
+}
+
+// state 'LostRebooting'
+LostRebooting::LostRebooting(NodeStateMachine *fsm) :
+  NodeState(fsm) {
+  avd_stop_tmr(fsm_->cb_, fsm_->timer_.get());
+  avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                fsm_->cb_->node_failover_nodeup_wait * SA_TIME_ONE_SECOND);
+}
+
+void LostRebooting::TimerExpired() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  if (fsm_->Active() == true) {
+    AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+    osafassert(node != nullptr);
+
+    LOG_NO("Completing delayed node failover for '%s'",
+            node->node_name.c_str());
+    avd_node_failover(node);
+
+    fsm_->SetState(std::make_shared<End>(fsm_));
+  } else {
+    TRACE("Timer expired in 'LostRebooting' state for '%x'  on standby. 
Restart timer",
+            fsm_->node_id_);
+
+    // wait for checkpoint to transition state
+    // meanwhile, restart timer in case a SC failover to this node occurs
+    avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                  fsm_->cb_->node_failover_nodeup_wait * SA_TIME_ONE_SECOND);
+  }
+}
+
+void LostRebooting::MdsUp() {
+  if (fsm_->Active() == true) {
+    LOG_ER("unexpected MDS up event");
+  }
+}
+
+void LostRebooting::MdsDown() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+  osafassert(node != nullptr);
+
+  if (fsm_->Active() == true) {
+    LOG_WA("Node '%s' is down. Failover its previous assignments",
+            node->node_name.c_str());
+    avd_node_failover(node);
+
+    AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+    osafassert(node != nullptr);
+
+    fsm_->SetState(std::make_shared<End>(fsm_));
+  } else {
+    // wait for checkpoint to transition state
+  }
+}
+
+void LostRebooting::NodeUp() {
+  LOG_ER("unexpected node up event");
+}
+
+// state 'Failed'
+
+Failed::Failed(NodeStateMachine *fsm) :
+  NodeState(fsm) {
+  avd_stop_tmr(fsm_->cb_, fsm_->timer_.get());
+}
+
+void Failed::TimerExpired() {
+  LOG_ER("unexpected timer event");
+}
+
+void Failed::MdsUp() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  // transition to 'FailedFound' state
+  fsm_->SetState(std::make_shared<FailedFound>(fsm_));
+}
+
+void Failed::MdsDown() {
+  LOG_WA("unexpected MDS down event");
+}
+
+void Failed::NodeUp() {
+  LOG_WA("unexpected node up event");
+
+  // transition to 'FailedFound' state anyway, as it's evident
+  // the node is back up even though we missed MDS up
+  fsm_->SetState(std::make_shared<FailedFound>(fsm_));
+}
+
+// state 'FailedFound'
+
+FailedFound::FailedFound(NodeStateMachine *fsm) :
+  NodeState(fsm) {
+  avd_stop_tmr(fsm_->cb_, fsm_->timer_.get());
+
+  // start timer2, wait for node up
+  avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                fsm_->cb_->node_failover_nodeup_wait * SA_TIME_ONE_SECOND);
+}
+
+void FailedFound::TimerExpired() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  AVD_AVND *node = avd_node_find_nodeid(fsm_->node_id_);
+  osafassert(node != nullptr);
+
+  LOG_WA("Failed node '%s' has reappeared after network separation",
+          node->node_name.c_str());
+
+  if (fsm_->Active() == true) {
+    LOG_WA("Sending node reboot order");
+    avd_d2n_reboot_snd(node);
+
+    fsm_->SetState(std::make_shared<End>(fsm_));
+  } else {
+    TRACE("Timer expired in 'FailedFound' state for '%x'  on standby. Restart 
timer",
+            fsm_->node_id_);
+
+    // wait for checkpoint to transition state
+    // meanwhile, restart timer in case a SC failover to this node occurs
+    avd_start_tmr(fsm_->cb_, fsm_->timer_.get(),
+                  fsm_->cb_->node_failover_nodeup_wait * SA_TIME_ONE_SECOND);
+  }
+}
+
+void FailedFound::MdsUp() {
+  if (fsm_->Active() == true) {
+    LOG_ER("unexpected MDS up event");
+  }
+}
+
+void FailedFound::MdsDown() {
+  LOG_WA("unexpected MDS down event");
+}
+
+void FailedFound::NodeUp() {
+  TRACE_ENTER2("node_id %x", fsm_->node_id_);
+
+  fsm_->SetState(std::make_shared<End>(fsm_));
+}
+
+// state 'End'
+
+End::End(NodeStateMachine *fsm) :
+  NodeState(fsm) {
+  avd_stop_tmr(fsm_->cb_, fsm_->timer_.get());
+}
+
+void End::TimerExpired() {
+  osafassert(false);
+}
+
+void End::MdsUp() {
+  osafassert(false);
+}
+
+void End::MdsDown() {
+  osafassert(false);
+}
+
+void End::NodeUp() {
+  osafassert(false);
+}
diff --git a/src/amf/amfd/node_state.h b/src/amf/amfd/node_state.h
new file mode 100644
index 000000000..7d2f021fa
--- /dev/null
+++ b/src/amf/amfd/node_state.h
@@ -0,0 +1,101 @@
+#ifndef AMF_AMFD_NODE_STATE_H_
+#define AMF_AMFD_NODE_STATE_H_
+
+#include <stdint.h>
+#include "base/macros.h"
+
+class NodeStateMachine;
+
+class NodeState {
+ public:
+  enum NodeStates : uint32_t {kUndefined = 0, kStart, kLost,
+                              kLostFound, kLostRebooting,
+                              kFailed, kFailedFound, kEnd};
+
+  virtual void TimerExpired() =0;
+  virtual void MdsUp() =0;
+  virtual void MdsDown() =0;
+  virtual void NodeUp() =0;
+  virtual uint32_t GetInt() =0;
+  NodeState(NodeStateMachine *fsm);
+  virtual ~NodeState() {};
+
+ protected:
+  NodeStateMachine *fsm_;
+
+ private:
+  NodeState();
+  DELETE_COPY_AND_MOVE_OPERATORS(NodeState);
+};
+
+class Start : public NodeState {
+ public:
+  Start(NodeStateMachine *fsm);
+  virtual void TimerExpired();
+  virtual void MdsUp();
+  virtual void MdsDown();
+  virtual void NodeUp();
+  virtual uint32_t GetInt() {return kStart;}
+};
+
+class Lost : public NodeState {
+ public:
+  Lost(NodeStateMachine *fsm);
+  virtual void TimerExpired();
+  virtual void MdsUp();
+  virtual void MdsDown();
+  virtual void NodeUp();
+  virtual uint32_t GetInt() {return kLost;}
+};
+
+class LostFound : public NodeState {
+ public:
+  LostFound(NodeStateMachine *fsm);
+  virtual void TimerExpired();
+  virtual void MdsUp();
+  virtual void MdsDown();
+  virtual void NodeUp();
+  virtual uint32_t GetInt() {return kLostFound;}
+};
+
+class LostRebooting : public NodeState {
+ public:
+  LostRebooting(NodeStateMachine *fsm);
+  virtual void TimerExpired();
+  virtual void MdsUp();
+  virtual void MdsDown();
+  virtual void NodeUp();
+  virtual uint32_t GetInt() {return kLostRebooting;}
+};
+
+class Failed : public NodeState {
+ public:
+  Failed(NodeStateMachine *fsm);
+  virtual void TimerExpired();
+  virtual void MdsUp();
+  virtual void MdsDown();
+  virtual void NodeUp();
+  virtual uint32_t GetInt() {return kFailed;}
+};
+
+class FailedFound : public NodeState {
+ public:
+  FailedFound(NodeStateMachine *fsm);
+  virtual void TimerExpired();
+  virtual void MdsUp();
+  virtual void MdsDown();
+  virtual void NodeUp();
+  virtual uint32_t GetInt() {return kFailedFound;}
+};
+
+class End : public NodeState {
+ public:
+  End(NodeStateMachine *fsm);
+  virtual void TimerExpired();
+  virtual void MdsUp();
+  virtual void MdsDown();
+  virtual void NodeUp();
+  virtual uint32_t GetInt() {return kEnd;}
+};
+
+#endif // AMF_AMFD_NODE_STATE_H_
diff --git a/src/amf/amfd/node_state_machine.cc b/src/amf/amfd/node_state_machine.cc
new file mode 100644
index 000000000..478ad2a48
--- /dev/null
+++ b/src/amf/amfd/node_state_machine.cc
@@ -0,0 +1,98 @@
+#include "base/logtrace.h"
+#include "amf/amfd/amfd.h"
+#include "amf/amfd/node_state_machine.h"
+
+NodeStateMachine::NodeStateMachine(struct cl_cb_tag *cb,
+                                  const SaClmNodeIdT node_id) :
+  node_id_(node_id),
+  cb_(cb) {
+  timer_ = std::make_shared<AVD_TMR>();
+  timer_->node_id = node_id;
+  timer_->type = AVD_TMR_NODE_FAILOVER;
+  timer_->is_active = false;
+  timer_->tmr_id = TMR_T_NULL;
+  state_ = std::make_shared<Start>(this);
+}
+
+NodeStateMachine::~NodeStateMachine() {
+  avd_stop_tmr(cb_, timer_.get());
+}
+
+void NodeStateMachine::TimerExpired() {
+  // in case this was triggered manually, stop the timer
+  avd_stop_tmr(cb_, timer_.get());
+  state_->TimerExpired();
+}
+
+void NodeStateMachine::MdsUp() {
+  state_->MdsUp();
+}
+
+void NodeStateMachine::MdsDown() {
+  state_->MdsDown();
+}
+
+void NodeStateMachine::NodeUp() {
+  state_->NodeUp();
+}
+
+void NodeStateMachine::SetState(std::shared_ptr<NodeState> state) {
+  TRACE_ENTER2("'%x'", node_id_);
+  state_ = state;
+  AVD_AVND *node = avd_node_find_nodeid(node_id_);
+  osafassert(node != nullptr);
+  m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb_, node, AVSV_CKPT_NODE_FAILOVER_STATE);
+
+  if (state->GetInt() == NodeState::kEnd) {
+    cb_->failover_list.erase(node_id_);
+  }
+}
+
+// this is called as a result of check pointing from the active
+void NodeStateMachine::SetState(uint32_t state) {
+  TRACE_ENTER2("'%x', state '%u', current state '%u'",
+                node_id_, state, state_->GetInt());
+
+  if (state == state_->GetInt()) {
+    LOG_NO("Node state unchanged");
+    return;
+  } else {
+    LOG_NO("New state '%u'", state);
+  }
+
+  switch (state) {
+    case NodeState::kStart:
+      state_ = std::make_shared<Start>(this);
+      break;
+    case NodeState::kLost:
+      state_ = std::make_shared<Lost>(this);
+      break;
+    case NodeState::kLostFound:
+      state_ = std::make_shared<LostFound>(this);
+      break;
+    case NodeState::kLostRebooting:
+      state_ = std::make_shared<LostRebooting>(this);
+      break;
+    case NodeState::kFailed:
+      state_ = std::make_shared<Failed>(this);
+      break;
+    case NodeState::kFailedFound:
+      state_ = std::make_shared<FailedFound>(this);
+      break;
+    case NodeState::kEnd:
+      state_ = std::make_shared<FailedFound>(this);
+      cb_->failover_list.erase(node_id_);
+      break;
+    default:
+      LOG_ER("undefined state '%u'", state);
+      break;
+  }
+}
+
+uint32_t NodeStateMachine::GetState() {
+  return state_->GetInt();
+}
+
+bool NodeStateMachine::Active() {
+  return cb_->avail_state_avd == SA_AMF_HA_ACTIVE;
+}
diff --git a/src/amf/amfd/node_state_machine.h b/src/amf/amfd/node_state_machine.h
new file mode 100644
index 000000000..3bfabd09f
--- /dev/null
+++ b/src/amf/amfd/node_state_machine.h
@@ -0,0 +1,39 @@
+#ifndef AMF_AMFD_NODE_STATE_MACHINE_H_
+#define AMF_AMFD_NODE_STATE_MACHINE_H_
+
+#include <memory>
+#include <saClm.h>
+#include "base/macros.h"
+#include "amf/amfd/node_state.h"
+#include "amf/amfd/timer.h"
+
+class NodeStateMachine {
+ public:
+  void TimerExpired();
+  void MdsUp();
+  void MdsDown();
+  void NodeUp();
+  void SetState(std::shared_ptr<NodeState> state);
+
+  // is this the active controller
+  bool Active();
+
+  // use for ckpt encode/decode only
+  void SetState(uint32_t state);
+  uint32_t GetState();
+
+  std::shared_ptr<AVD_TMR> timer_;
+  std::shared_ptr<NodeState> state_;
+
+  SaClmNodeIdT node_id_;
+  struct cl_cb_tag *cb_;
+
+  NodeStateMachine(struct cl_cb_tag *cb, const SaClmNodeIdT node_id);
+  ~NodeStateMachine();
+
+ private:
+ NodeStateMachine();
+ DELETE_COPY_AND_MOVE_OPERATORS(NodeStateMachine);
+};
+
+#endif // AMF_AMFD_NODE_STATE_MACHINE_H_
diff --git a/src/amf/amfd/proc.h b/src/amf/amfd/proc.h
index 7d461e82e..99d1cbfc2 100644
--- a/src/amf/amfd/proc.h
+++ b/src/amf/amfd/proc.h
@@ -95,7 +95,7 @@ void avd_rcv_hb_d_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
 void avd_process_hb_event(AVD_CL_CB *cb_now, struct AVD_EVT *evt);
 extern void avd_node_mark_absent(AVD_AVND *node);
 extern void avd_tmr_snd_hb_evh(AVD_CL_CB *cb, AVD_EVT *evt);
-extern void avd_node_failover(AVD_AVND *node);
+extern void avd_node_failover(AVD_AVND *node, const bool mw_only = false);
 extern AVD_SU *get_other_su_from_oper_list(AVD_SU *su);
 extern void su_complete_admin_op(AVD_SU *su, SaAisErrorT result);
 extern void comp_complete_admin_op(AVD_COMP *comp, SaAisErrorT result);
diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc
index dc7166516..42f77f817 100644
--- a/src/amf/amfd/role.cc
+++ b/src/amf/amfd/role.cc
@@ -490,7 +490,14 @@ static uint32_t avd_role_failover(AVD_CL_CB *cb, SaAmfHAStateT role) {
 
   avd_cb->is_implementer = true;
 
-  avd_node_failover(failed_node);
+  if (cb->failover_list.find(failed_node->node_info.nodeId) != cb->failover_list.end()) {
+    // triggered by FM, in which case we should failover OpenSAF SUs only
+    // application SUs will be handled via the node failover delay timer
+    LOG_NO("Failing over OpenSAF components only");
+    avd_node_failover(failed_node, true);
+  } else {
+    avd_node_failover(failed_node);
+  }
   avd_act_on_sis_in_tol_timer_state();
 
   LOG_NO("FAILOVER StandBy --> Active DONE!");
diff --git a/src/amf/amfd/timer.cc b/src/amf/amfd/timer.cc
index 0c448fa1c..cb7d52de9 100644
--- a/src/amf/amfd/timer.cc
+++ b/src/amf/amfd/timer.cc
@@ -118,6 +118,10 @@ void avd_stop_tmr(AVD_CL_CB *cb, AVD_TMR *tmr) {
     return;
   }
 
+  if (tmr->tmr_id == TMR_T_NULL) {
+    return;
+  }
+
   /* Stop the timer if it is active... */
   if (tmr->is_active == true) {
     tmr->is_active = false;
@@ -149,7 +153,7 @@ void avd_tmr_exp(void *uarg) {
   AVD_TMR *tmr = (AVD_TMR *)uarg;
   AVD_EVT *evt = AVD_EVT_NULL;
 
-  TRACE_ENTER();
+  TRACE_ENTER2("%u", tmr->type);
 
   /*
    * Check if this timer was stopped after "avd_tmr_exp" was called
diff --git a/src/amf/amfd/timer.h b/src/amf/amfd/timer.h
index ee6c4d562..53168797f 100644
--- a/src/amf/amfd/timer.h
+++ b/src/amf/amfd/timer.h
@@ -46,6 +46,7 @@ typedef enum avd_tmr_type {
 
   AVD_TMR_SI_DEP_TOL, /* SI_SI dependency tolerance timer */
   AVD_TMR_NODE_SYNC,  /* node sync timer for all PLs from headless */
+  AVD_TMR_NODE_FAILOVER,
   AVD_TMR_MAX
 } AVD_TMR_TYPE;
 
-- 
2.17.1



