- amfd reset msg id counter for node that ignore amfnd down
event to avoid nodes reboot once more due to mismatch msg id after
reboot up from reboot order for sending node_up after sync window.

- amfd active order reboot its standby if it detect another
active amfd (multi partition cluster rejoin). Two actives will be
handled by RDE detect split-brain.

- amfd standby should reboot itself if see two active peers to
avoid standby do cold-sync or be updated with wrong active.
Two actives will be handled by RDE detect split-brain.

- amfd just become standby (out of sync) but see active down
should reboot itself.
---
 src/amf/amfd/dmsg.cc   |  8 ++++++++
 src/amf/amfd/evt.h     |  1 +
 src/amf/amfd/main.cc   |  3 +++
 src/amf/amfd/mds.cc    | 36 ++++++++++++++++++++++++++++++++++--
 src/amf/amfd/msg.h     |  1 +
 src/amf/amfd/ndfsm.cc  |  2 ++
 src/amf/amfd/proc.h    |  1 +
 src/amf/amfd/role.cc   | 27 +++++++++++++++++++++++++++
 src/amf/amfd/util.cc   |  2 +-
 src/amf/amfnd/amfnd.cc |  2 +-
 10 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/src/amf/amfd/dmsg.cc b/src/amf/amfd/dmsg.cc
index cf4019d8a..c450b649e 100644
--- a/src/amf/amfd/dmsg.cc
+++ b/src/amf/amfd/dmsg.cc
@@ -75,6 +75,8 @@ void avd_mds_d_enc(MDS_CALLBACK_ENC_INFO *enc_info) {
       ncs_encode_32bit(&data, msg->msg_info.d2d_chg_role_rsp.role);
       ncs_encode_32bit(&data, msg->msg_info.d2d_chg_role_rsp.status);
       break;
+    case AVD_D2D_REBOOT:
+      break;
     default:
       LOG_ER("%s: unknown msg %u", __FUNCTION__, msg->msg_type);
       break;
@@ -120,6 +122,8 @@ void avd_mds_d_dec(MDS_CALLBACK_DEC_INFO *dec_info) {
           static_cast<SaAmfHAStateT>(ncs_decode_32bit(&data));
       d2d_msg->msg_info.d2d_chg_role_rsp.status = ncs_decode_32bit(&data);
       break;
+    case AVD_D2D_REBOOT:
+      break;
     default:
       LOG_ER("%s: unknown msg %u", __FUNCTION__, d2d_msg->msg_type);
       break;
@@ -210,6 +214,10 @@ uint32_t avd_d2d_msg_rcv(AVD_D2D_MSG *rcv_msg) {
         osafassert(0);
       }
       break;
+    case AVD_D2D_REBOOT:
+      LOG_ER("Reboot order from Active as roaming SC split-brain detected");
+      opensaf_quick_reboot("Split-brain detected");
+      break;
     default:
       LOG_ER("%s: unknown msg %u", __FUNCTION__, rcv_msg->msg_type);
       break;
diff --git a/src/amf/amfd/evt.h b/src/amf/amfd/evt.h
index a9028cde3..a08dccebb 100644
--- a/src/amf/amfd/evt.h
+++ b/src/amf/amfd/evt.h
@@ -72,6 +72,7 @@ typedef enum avd_evt_type {
   AVD_IMM_REINITIALIZED,
   AVD_EVT_UNASSIGN_SI_DEP_STATE,
   AVD_EVT_ND_MDS_VER_INFO,
+  AVD_EVT_ROAMING_SC_SPLITBRAIN,
   AVD_EVT_MAX
 } AVD_EVT_TYPE;
 
diff --git a/src/amf/amfd/main.cc b/src/amf/amfd/main.cc
index 3b1536721..3cc0d9741 100644
--- a/src/amf/amfd/main.cc
+++ b/src/amf/amfd/main.cc
@@ -132,6 +132,9 @@ static const AVD_EVT_HDLR g_actv_list[AVD_EVT_MAX] = {
     invalid_evh,                /* AVD_EVT_INVALID */
     avd_sidep_unassign_evh,     /* AVD_EVT_UNASSIGN_SI_DEP_STATE */
     avd_avnd_mds_info_evh,      /* AVD_EVT_ND_MDS_VER_INFO*/
+
+    /* Roaming SC split-brain processing */
+    avd_roaming_sc_split_brain_evh, /* AVD_EVT_ROAMING_SC_SPLITBRAIN */
 };
 
 /* list of all the function pointers related to handling the events
diff --git a/src/amf/amfd/mds.cc b/src/amf/amfd/mds.cc
index 108f9b8bd..34ed96586 100644
--- a/src/amf/amfd/mds.cc
+++ b/src/amf/amfd/mds.cc
@@ -400,9 +400,11 @@ static uint32_t 
avd_mds_svc_evt(MDS_CALLBACK_SVC_EVENT_INFO *evt_info) {
     case NCSMDS_UP:
       switch (evt_info->i_svc_id) {
         case NCSMDS_SVC_ID_AVD:
+          TRACE("NCSMDS_UP AVD %x", evt_info->i_node_id);
           /* if((Is this up from other node) && (Is this Up from an Adest)) */
           if ((evt_info->i_node_id != cb->node_id_avd) &&
-              (m_MDS_DEST_IS_AN_ADEST(evt_info->i_dest))) {
+              (m_MDS_DEST_IS_AN_ADEST(evt_info->i_dest)) &&
+              (cb->other_avd_adest == 0)) {
             cb->node_id_avd_other = evt_info->i_node_id;
             cb->other_avd_adest = evt_info->i_dest;
             cb->stby_sync_state = AVD_STBY_OUT_OF_SYNC;
@@ -441,14 +443,44 @@ static uint32_t 
avd_mds_svc_evt(MDS_CALLBACK_SVC_EVENT_INFO *evt_info) {
       }
       break;
 
+    case NCSMDS_RED_UP:
+      if ((evt_info->i_svc_id == NCSMDS_SVC_ID_AVD) &&
+          (evt_info->i_role == V_DEST_RL_ACTIVE) &&
+          (cb->node_id_avd != evt_info->i_node_id) &&
+          (cb->other_avd_adest) &&
+          (cb->node_id_avd_other != evt_info->i_node_id)) {
+        if (cb->avail_state_avd == SA_AMF_HA_STANDBY) {
+          LOG_ER("Standby peer see two peers: %x and %x",
+            cb->node_id_avd_other, evt_info->i_node_id);
+          opensaf_reboot(0, NULL, "Standby peer see two peers");
+        } else if (cb->avail_state_avd == SA_AMF_HA_ACTIVE) {
+          // Send reboot order to known standby (multi clusters rejoin)
+          AVD_EVT *evt = new AVD_EVT();
+          evt->rcv_evt = AVD_EVT_ROAMING_SC_SPLITBRAIN;
+          if (m_NCS_IPC_SEND(&cb->avd_mbx, evt, NCS_IPC_PRIORITY_HIGH) !=
+              NCSCC_RC_SUCCESS) {
+            LOG_ER("%s: ncs_ipc_send failed", __FUNCTION__);
+            delete evt;
+          }
+        }
+      }
+      break;
+
     case NCSMDS_DOWN:
       switch (evt_info->i_svc_id) {
         case NCSMDS_SVC_ID_AVD:
+          TRACE("NCSMDS_DOWN AVD %x", evt_info->i_node_id);
           /* if(Is this down from an Adest) && (Is this adest same as Adest in
            * CB) */
           if (m_MDS_DEST_IS_AN_ADEST(evt_info->i_dest) &&
               m_NCS_MDS_DEST_EQUAL(&evt_info->i_dest, &cb->other_avd_adest)) {
-            memset(&cb->other_avd_adest, '\0', sizeof(MDS_DEST));
+            cb->other_avd_adest = 0;
+            if ((cb->avail_state_avd == SA_AMF_HA_STANDBY) &&
+                (cb->stby_sync_state == AVD_STBY_OUT_OF_SYNC)) {
+              LOG_ER("FAILOVER StandBy --> Active FAILED,"
+                " Standby OUT OF SYNC");
+              opensaf_quick_reboot("FAILOVER failed");
+            }
           }
           break;
 
diff --git a/src/amf/amfd/msg.h b/src/amf/amfd/msg.h
index 4b699db71..c87736dd6 100644
--- a/src/amf/amfd/msg.h
+++ b/src/amf/amfd/msg.h
@@ -40,6 +40,7 @@
 typedef enum {
   AVD_D2D_CHANGE_ROLE_REQ = AVSV_D2D_CHANGE_ROLE_REQ,
   AVD_D2D_CHANGE_ROLE_RSP,
+  AVD_D2D_REBOOT,
   AVD_D2D_MSG_MAX,
 } AVD_D2D_MSG_TYPE;
 
diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc
index 674ef863a..089c41971 100644
--- a/src/amf/amfd/ndfsm.cc
+++ b/src/amf/amfd/ndfsm.cc
@@ -812,6 +812,8 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
         // Ignore amfnd down event in late after clm cb node left then joined
         // But not ignore if after headless
         LOG_WA("Ignore '%s' amfnd down event", node->node_name.c_str());
+        node->rcv_msg_id = 0;
+        node->snd_msg_id = 0;
         TRACE_LEAVE();
         return;
       }
diff --git a/src/amf/amfd/proc.h b/src/amf/amfd/proc.h
index 4052aecac..188aa4860 100644
--- a/src/amf/amfd/proc.h
+++ b/src/amf/amfd/proc.h
@@ -86,6 +86,7 @@ void avd_node_down_func(AVD_CL_CB *cb, AVD_AVND *avnd);
 void avd_nd_sisu_state_info_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
 void avd_nd_compcsi_state_info_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
 void avd_avnd_mds_info_evh(AVD_CL_CB *cb, AVD_EVT *evt);
+void avd_roaming_sc_split_brain_evh(AVD_CL_CB *cb, AVD_EVT *evt);
 uint32_t avd_node_down(AVD_CL_CB *cb, SaClmNodeIdT node_id);
 AVD_AVND *avd_msg_sanity_chk(AVD_EVT *evt, SaClmNodeIdT node_id,
                              AVSV_DND_MSG_TYPE msg_typ, uint32_t msg_id);
diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc
index 24374de7c..1cf597b0e 100644
--- a/src/amf/amfd/role.cc
+++ b/src/amf/amfd/role.cc
@@ -1394,3 +1394,30 @@ uint32_t avd_rde_set_role(SaAmfHAStateT role) {
   }
   return rc;
 }
+
+/****************************************************************************\
+ * Function: avd_roaming_sc_split_brain_evh
+ *
+ * Purpose:  Send reboot order to standby.
+ *
+ * Input: cb - the AVD control block
+ *        evt - The event information.
+ *
+ * Returns: None.
+ *
+\**************************************************************************/
+void avd_roaming_sc_split_brain_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
+  (void)evt;
+  AVD_D2D_MSG d2d_msg;
+  TRACE_ENTER();
+
+  if ((cb->node_id_avd_other == 0) || (cb->other_avd_adest == 0)) {
+    return;
+  }
+  d2d_msg.msg_type = AVD_D2D_REBOOT;
+  if (avd_d2d_msg_snd(cb, &d2d_msg) != NCSCC_RC_SUCCESS) {
+    LOG_ER("Reboot order send failed for Standby AMFD");
+  }
+
+  TRACE_LEAVE();
+}
diff --git a/src/amf/amfd/util.cc b/src/amf/amfd/util.cc
index f2b3c5f2a..6915bddb6 100644
--- a/src/amf/amfd/util.cc
+++ b/src/amf/amfd/util.cc
@@ -1823,7 +1823,7 @@ void avd_d2n_reboot_snd(AVD_AVND *node) {
   AVD_DND_MSG *d2n_msg = new AVD_DND_MSG();
 
   d2n_msg->msg_type = AVSV_D2N_REBOOT_MSG;
-  d2n_msg->msg_info.d2n_reboot_info.node_id = node->node_info.nodeId;
+  d2n_msg->msg_info.d2n_reboot_info.node_id = avd_cb->node_id_avd;
   d2n_msg->msg_info.d2n_reboot_info.msg_id = ++(node->snd_msg_id);
 
   if (avd_d2n_msg_snd(avd_cb, node, d2n_msg) != NCSCC_RC_SUCCESS) {
diff --git a/src/amf/amfnd/amfnd.cc b/src/amf/amfnd/amfnd.cc
index 9e8739bee..54a3d75fa 100644
--- a/src/amf/amfnd/amfnd.cc
+++ b/src/amf/amfnd/amfnd.cc
@@ -432,7 +432,7 @@ uint32_t avnd_evt_avd_reboot_evh(AVND_CB *cb, AVND_EVT 
*evt) {
     }
   }
 
-  LOG_NO("Received reboot order, ordering reboot now!");
+  LOG_NO("Received reboot order from %x, ordering reboot now!", info->node_id);
   opensaf_reboot(cb->node_info.nodeId,
                  
osaf_extended_name_borrow(&cb->node_info.executionEnvironment),
                  "Received reboot order");
-- 
2.17.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to