- IMMND coordinator takes longer to sync because it incorrectly
postpones sync to wait for an incorrect number of down nodes.
- IMMND should restart after its re-intro is accepted if it is not
the new coordinator, so that it syncs again with the new coordinator.
- Active IMMD only updates ex-IMMD from the coordinator if the info
exists. Update ex-IMMD to the node's own id when the new coord
announces sync.
- Update #3228 solution: active IMMD should not drop re-intro
from the local IMMND; doing so causes an unexpected IMMND coord to be
selected, and the local IMMND then restarts unexpectedly later.
- IMMND on the active IMMD node will start a split-brain detection
timer to reboot the node if it sees another active IMMD, rather than
rebooting immediately, to avoid interfering with the RDE split-brain
detection mechanism.
- Quick reboot is sometimes not quick, so the active IMMD on the node
may impact the newly promoted Active node. Stop AMFND and kill
AMFD/IMMD to avoid any impact.
---
 scripts/opensaf_reboot     |   5 +-
 src/imm/immd/immd_evt.c    |  19 ++++--
 src/imm/immd/immd_mds.c    |   1 +
 src/imm/immnd/immnd.h      |   1 +
 src/imm/immnd/immnd_cb.h   |   2 +
 src/imm/immnd/immnd_evt.c  | 122 ++++++++++++++++++++++---------------
 src/imm/immnd/immnd_main.c |   2 +
 7 files changed, 98 insertions(+), 54 deletions(-)

diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
index e2a0ca944..8e5bd8c40 100644
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -143,8 +143,9 @@ unset tipc
 # If clm cluster reboot requested argument one and two are set but not used,
 # argument 3 is set to 1, "safe reboot" request.
 if [ "$#" = 0 ]; then
-       $icmd pkill -STOP osafamfd
-       $icmd pkill -STOP osafimmd
+       $icmd pkill -STOP osafamfnd
+       $icmd pkill -KILL osafamfd
+       $icmd pkill -KILL osafimmd
        quick_local_node_reboot
 elif [ "$safe_reboot" = 1 ]; then
        opensaf_safe_reboot
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 51cc8e4f7..297761d13 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -897,7 +897,8 @@ static void immd_accept_node(IMMD_CB *cb, 
IMMD_IMMND_INFO_NODE *node_info,
                LOG_NO(
                    "IMMND coord at %x with ex-IMMD %x",
                    node_info->immnd_key, node_info->ex_immd_node_id);
-               cb->ex_immd_node_id = node_info->ex_immd_node_id;
+               if (check_ex_immd_node_id && node_info->ex_immd_node_id)
+                       cb->ex_immd_node_id = node_info->ex_immd_node_id;
        }
 
        mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP; /* Mbcp intro to SBY. */
@@ -1253,6 +1254,7 @@ static uint32_t immd_evt_proc_immnd_announce_sync(IMMD_CB 
*cb, IMMD_EVT *evt,
                           Loop through all nodes */
 
                        cb->mRulingEpoch++;
+                       cb->ex_immd_node_id = cb->node_id;
 
                        /*Only updates epoch for coord. */
                        /*node_info->epoch = cb->mRulingEpoch; */
@@ -1691,8 +1693,9 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, 
IMMD_EVT *evt,
 
        immd_immnd_info_node_get(&cb->immnd_tree, &sinfo->dest, &node_info);
        if (!node_info) {
-               if (evt->info.ctrl_msg.refresh == 3) {
-                       LOG_WA("Drop re-intro from old IMMND dest %" PRIu64, 
sinfo->dest);
+               if ((evt->info.ctrl_msg.refresh == 3) &&
+                   (sinfo->node_id != cb->node_id)) {
+                       TRACE("Drop re-intro from old IMMND %x", 
sinfo->node_id);
                        goto done;
                }
                LOG_WA("Node not found dest %" PRIu64
@@ -3308,7 +3311,15 @@ static uint32_t immd_evt_proc_mds_evt(IMMD_CB *cb, 
IMMD_EVT *evt)
                                    mds_info->dest);
                                goto done;
                        } else {
-                               TRACE_5("IMMND DOWN PROCESS detected by IMMD");
+                               if (node_info->immnd_execPid == 0) {
+                                       TRACE_5(
+                                           "Ignore IMMND %x DOWN not yet 
accepted intro",
+                                           node_info->immnd_key);
+                                       immd_immnd_info_node_delete(cb, 
node_info);
+                                       goto done;
+                               }
+                               TRACE_5("IMMND %x DOWN PROCESS detected by 
IMMD",
+                                   node_info->immnd_key);
                                immd_process_immnd_down(cb, node_info, true);
                        }
                }
diff --git a/src/imm/immd/immd_mds.c b/src/imm/immd/immd_mds.c
index 7610a45fa..9688b49ad 100644
--- a/src/imm/immd/immd_mds.c
+++ b/src/imm/immd/immd_mds.c
@@ -495,6 +495,7 @@ static uint32_t immd_mds_rcv(IMMD_CB *cb, 
MDS_CALLBACK_RECEIVE_INFO *rcv_info)
        pEvt->sinfo.ctxt = rcv_info->i_msg_ctxt;
        pEvt->sinfo.dest = rcv_info->i_fr_dest;
        pEvt->sinfo.to_svc = rcv_info->i_fr_svc_id;
+       pEvt->sinfo.node_id = rcv_info->i_node_id;
        if (rcv_info->i_rsp_reqd) {
                pEvt->sinfo.stype = MDS_SENDTYPE_RSP;
        }
diff --git a/src/imm/immnd/immnd.h b/src/imm/immnd/immnd.h
index 7b0818de7..23edf004b 100644
--- a/src/imm/immnd/immnd.h
+++ b/src/imm/immnd/immnd.h
@@ -33,6 +33,7 @@
 #endif
 
 #include "imm/common/immsv.h"
+#include "base/ncssysf_tmr.h"
 #include "immnd_cb.h"
 #include "immnd_init.h"
 
diff --git a/src/imm/immnd/immnd_cb.h b/src/imm/immnd/immnd_cb.h
index 3dc03d88b..bb3bb8493 100644
--- a/src/imm/immnd/immnd_cb.h
+++ b/src/imm/immnd/immnd_cb.h
@@ -207,6 +207,8 @@ typedef struct immnd_cb_tag {
       clm_init_sel_obj; /* Selection object wait for  clms intialization*/
   bool isClmNodeJoined; /* True => If clm joined the cluster*/
   NCS_PATRICIA_TREE immnd_clm_list; /* IMMND_IMM_CLIENT_NODE - node */
+  tmr_t splitbrain_tmr;
+  bool splitbrain_tmr_run;
 } IMMND_CB;
 
 /* CB prototypes */
diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c
index e405d3ce4..670823a45 100644
--- a/src/imm/immnd/immnd_evt.c
+++ b/src/imm/immnd/immnd_evt.c
@@ -10541,6 +10541,10 @@ static uint32_t immnd_evt_proc_intro_rsp(IMMND_CB *cb, 
IMMND_EVT *evt,
                        LOG_IN("2PBE SYNC CASE CAUGHT oldCanBeCoord:%u",
                               oldCanBeCoord);
                }
+               if ((cb->mIntroduced == 2) && (!evt->info.ctrl.isCoord)) {
+                       LOG_WA("Restart to sync with Coord! Exit");
+                       exit(EXIT_SUCCESS);
+               }
                cb->mIntroduced = 1;
                cb->mCanBeCoord = evt->info.ctrl.canBeCoord;
                if ((cb->mCanBeCoord == IMMSV_2PBE_PRELOAD) && (cb->m2Pbe < 2) 
&&
@@ -12202,6 +12206,12 @@ void immnd_evt_ccb_augment_admo(IMMND_CB *cb, 
IMMND_EVT *evt,
        TRACE_LEAVE();
 }
 
+void splitbrain_tmr_exp(void *arg)
+{
+       (void)arg;
+       LOG_ER("Split-brain detected! Rebooting...");
+       opensaf_quick_reboot("Split-brain detected! Rebooting...");
+}
 /****************************************************************************
  * Name          : immnd_evt_proc_mds_evt
  *
@@ -12219,52 +12229,69 @@ static uint32_t immnd_evt_proc_mds_evt(IMMND_CB *cb, 
IMMND_EVT *evt)
        /*TRACE_ENTER(); */
        uint32_t rc = NCSCC_RC_SUCCESS;
        bool is_headless = false;
+       IMMSV_MDS_INFO *mdsInfo = &evt->info.mds_info;
 
-       if ((evt->info.mds_info.change == NCSMDS_DOWN) &&
-           (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OM ||
-            evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OI)) {
+       if ((mdsInfo->change == NCSMDS_DOWN) &&
+           (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OM ||
+            mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OI)) {
                TRACE_2("IMMA DOWN EVENT");
-               immnd_proc_imma_down(cb, evt->info.mds_info.dest,
-                                    evt->info.mds_info.svc_id);
+               immnd_proc_imma_down(cb, mdsInfo->dest,
+                                    mdsInfo->svc_id);
        }
 
        /* In multi partitioned clusters rejoin, IMMND may not realize
         * headless due to see IMMDs from different partitions */
-       if ((evt->info.mds_info.change == NCSMDS_DOWN) &&
-           (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
-               is_headless = true;
-               cb->immd_node_id = 0;
-               cb->other_immd_id = 0;
-       } else if ((evt->info.mds_info.change == NCSMDS_RED_UP) &&
-           (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD) &&
-           (evt->info.mds_info.node_id != cb->immd_node_id)) {
-               if ((evt->info.mds_info.role == V_DEST_RL_STANDBY) &&
-                   (cb->other_immd_id == 0)) {
-                       cb->other_immd_id = evt->info.mds_info.node_id;
-                       TRACE_2("IMMD RED_UP EVENT %x role=%d ==> ACT:%x 
SBY:%x",
-                           evt->info.mds_info.node_id, evt->info.mds_info.role,
-                           cb->immd_node_id, cb->other_immd_id);
-               } else if ((evt->info.mds_info.role == V_DEST_RL_ACTIVE) &&
-                   (cb->immd_node_id != 0) &&
-                   (cb->node_id != cb->immd_node_id)) {
-                       LOG_WA("See two Active IMMD: %x %x, going to headless",
-                           cb->immd_node_id, evt->info.mds_info.node_id);
+       if (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMD) {
+               switch (mdsInfo->change) {
+               case NCSMDS_DOWN:
                        is_headless = true;
                        cb->immd_node_id = 0;
                        cb->other_immd_id = 0;
-               }
-       } else if ((evt->info.mds_info.change == NCSMDS_RED_DOWN) &&
-                  (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
-               if (cb->immd_node_id == evt->info.mds_info.node_id)
-                       cb->immd_node_id = 0;
-               if (cb->other_immd_id == evt->info.mds_info.node_id)
-                       cb->other_immd_id = 0;
-               TRACE_2("IMMD RED_DOWN EVENT %x role=%d ==> ACT:%x SBY:%x",
-                   evt->info.mds_info.node_id, evt->info.mds_info.role,
-                   cb->immd_node_id, cb->other_immd_id);
-               if ((cb->immd_node_id == 0) && (cb->other_immd_id == 0)) {
-                       LOG_WA("Both Active & Standby DOWN, going to headless");
-                       is_headless = true;
+                       break;
+               case NCSMDS_RED_UP:
+                       if ((mdsInfo->role == V_DEST_RL_STANDBY) &&
+                           (cb->other_immd_id == 0)) {
+                               cb->other_immd_id = mdsInfo->node_id;
+                       } else if (mdsInfo->role == V_DEST_RL_ACTIVE) {
+                               if ((cb->immd_node_id != 0) &&
+                                   (cb->immd_node_id != mdsInfo->node_id)) {
+                                       if (cb->node_id != cb->immd_node_id) {
+                                               LOG_WA(
+                                                   "See two Active IMMD: %x 
%x, going to headless",
+                                                   cb->immd_node_id, 
mdsInfo->node_id);
+                                               is_headless = true;
+                                               cb->immd_node_id = 0;
+                                               cb->other_immd_id = 0;
+                                       } else if (!cb->splitbrain_tmr_run) {
+                                       // Normally, RDE will handle 
split-brain detection.
+                                       // In roaming SC split/join, sometimes 
RDE don't detect
+                                       // split-brain but IMMND does, start 
timer reboot node.
+                                               LOG_WA(
+                                                   "Another Active IMMD %x. 
Start split-brain timer",
+                                                   mdsInfo->node_id);
+                                               cb->splitbrain_tmr = 
ncs_tmr_start(
+                                                   cb->splitbrain_tmr, 1000,  
// 10s
+                                                   splitbrain_tmr_exp, NULL, 
NULL, 0);
+                                               cb->splitbrain_tmr_run = true;
+                                       }
+                               }
+                       }
+                       break;
+               case NCSMDS_RED_DOWN:
+                       if (cb->immd_node_id == mdsInfo->node_id)
+                               cb->immd_node_id = 0;
+                       if (cb->other_immd_id == mdsInfo->node_id)
+                               cb->other_immd_id = 0;
+                       TRACE_2("IMMD RED_DOWN EVENT %x role=%d ==> ACT:%x 
SBY:%x",
+                           mdsInfo->node_id, mdsInfo->role,
+                           cb->immd_node_id, cb->other_immd_id);
+                       if ((cb->immd_node_id == 0) && (cb->other_immd_id == 
0)) {
+                               LOG_WA("Both Active & Standby DOWN, going to 
headless");
+                               is_headless = true;
+                       }
+                       break;
+               default:
+                       break;
                }
        }
 
@@ -12394,29 +12421,28 @@ static uint32_t immnd_evt_proc_mds_evt(IMMND_CB *cb, 
IMMND_EVT *evt)
                        }
                }
 
-       } else if ((evt->info.mds_info.change == NCSMDS_UP) &&
-                  (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
+       } else if ((mdsInfo->change == NCSMDS_UP) &&
+                  (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMD)) {
                LOG_NO(
                    "IMMD(%x) service is UP ... ScAbsenseAllowed?:%u 
introduced?:%u",
-                   evt->info.mds_info.node_id,
+                   mdsInfo->node_id,
                    cb->mScAbsenceAllowed, cb->mIntroduced);
                if ((cb->mIntroduced == 2) &&
                    (immnd_introduceMe(cb) != NCSCC_RC_SUCCESS)) {
                        LOG_WA(
                            "IMMND re-introduceMe after IMMD restart failed, 
will retry");
                }
-       } else if ((evt->info.mds_info.change == NCSMDS_UP) &&
-                  (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OI ||
-                   evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OM)) {
+       } else if ((mdsInfo->change == NCSMDS_UP) &&
+                  (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OI ||
+                   mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OM)) {
                TRACE_2("IMMA UP EVENT");
-       } else if ((evt->info.mds_info.change == NCSMDS_CHG_ROLE) &&
-                  (evt->info.mds_info.role == V_DEST_RL_ACTIVE) &&
-                  (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
-
+       } else if ((mdsInfo->change == NCSMDS_CHG_ROLE) &&
+                  (mdsInfo->role == V_DEST_RL_ACTIVE) &&
+                  (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMD)) {
                TRACE_2("IMMD FAILOVER");
                /* The IMMD has failed over. */
                immnd_proc_imma_discard_stales(cb);
-       } else if (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMND) {
+       } else if (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMND) {
                LOG_NO("MDS SERVICE EVENT OF TYPE IMMND!!");
        }
        /*TRACE_LEAVE(); */
diff --git a/src/imm/immnd/immnd_main.c b/src/imm/immnd/immnd_main.c
index 0cd004053..410134c97 100644
--- a/src/imm/immnd/immnd_main.c
+++ b/src/imm/immnd/immnd_main.c
@@ -301,6 +301,8 @@ static uint32_t immnd_initialize(char *progname)
        immnd_cb->clmSelectionObject = -1;
        immnd_cb->immd_node_id = 0;
        immnd_cb->other_immd_id = 0;
+       immnd_cb->splitbrain_tmr = ncs_tmr_alloc(NULL, 0);
+       immnd_cb->splitbrain_tmr_run = false;
 
        populate_reserved_class_names(immnd_cb);
 
-- 
2.17.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to