- The IMMND coordinator takes longer to sync because it incorrectly
postpones the sync to wait for an incorrect number of down nodes.
- IMMND should restart after its re-intro is accepted if it is not
the new coordinator, so that it syncs again with the new coordinator.
- The active IMMD only updates ex-IMMD from the coordinator if the info
exists. It sets ex-IMMD to its own node id when the new coord announces sync.
- IMMND on the active IMMD node will start a split-brain detection timer
to reboot the node if it sees another active IMMD, rather than rebooting
immediately, to avoid interfering with the RDE split-brain detection mechanism.
- Quick reboot is sometimes not quick, so the active IMMD on the node may
impact the newly promoted active node. Stop AMFND and kill AMFD/IMMD
before rebooting to avoid any impact.
---
 scripts/opensaf_reboot     |  5 +++--
 src/imm/immd/immd_evt.c    | 16 +++++++++++++---
 src/imm/immnd/immnd.h      |  1 +
 src/imm/immnd/immnd_cb.h   |  2 ++
 src/imm/immnd/immnd_evt.c  | 37 +++++++++++++++++++++++++++++--------
 src/imm/immnd/immnd_main.c |  2 ++
 6 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
index e2a0ca944..8e5bd8c40 100644
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -143,8 +143,9 @@ unset tipc
 # If clm cluster reboot requested argument one and two are set but not used,
 # argument 3 is set to 1, "safe reboot" request.
 if [ "$#" = 0 ]; then
-       $icmd pkill -STOP osafamfd
-       $icmd pkill -STOP osafimmd
+       $icmd pkill -STOP osafamfnd
+       $icmd pkill -KILL osafamfd
+       $icmd pkill -KILL osafimmd
        quick_local_node_reboot
 elif [ "$safe_reboot" = 1 ]; then
        opensaf_safe_reboot
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 51cc8e4f7..e5f438c1a 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -897,7 +897,8 @@ static void immd_accept_node(IMMD_CB *cb, 
IMMD_IMMND_INFO_NODE *node_info,
                LOG_NO(
                    "IMMND coord at %x with ex-IMMD %x",
                    node_info->immnd_key, node_info->ex_immd_node_id);
-               cb->ex_immd_node_id = node_info->ex_immd_node_id;
+               if (node_info->ex_immd_node_id)
+                       cb->ex_immd_node_id = node_info->ex_immd_node_id;
        }
 
        mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP; /* Mbcp intro to SBY. */
@@ -1253,6 +1254,7 @@ static uint32_t immd_evt_proc_immnd_announce_sync(IMMD_CB 
*cb, IMMD_EVT *evt,
                           Loop through all nodes */
 
                        cb->mRulingEpoch++;
+                       cb->ex_immd_node_id = cb->node_id;
 
                        /*Only updates epoch for coord. */
                        /*node_info->epoch = cb->mRulingEpoch; */
@@ -1692,7 +1694,7 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, 
IMMD_EVT *evt,
        immd_immnd_info_node_get(&cb->immnd_tree, &sinfo->dest, &node_info);
        if (!node_info) {
                if (evt->info.ctrl_msg.refresh == 3) {
-                       LOG_WA("Drop re-intro from old IMMND dest %" PRIu64, 
sinfo->dest);
+                       TRACE("Drop re-intro from old IMMND dest %" PRIu64, 
sinfo->dest);
                        goto done;
                }
                LOG_WA("Node not found dest %" PRIu64
@@ -3308,7 +3310,15 @@ static uint32_t immd_evt_proc_mds_evt(IMMD_CB *cb, 
IMMD_EVT *evt)
                                    mds_info->dest);
                                goto done;
                        } else {
-                               TRACE_5("IMMND DOWN PROCESS detected by IMMD");
+                               if (node_info->immnd_execPid == 0) {
+                                       TRACE_5(
+                                           "Ignore IMMND %x DOWN not yet 
accepted intro",
+                                           node_info->immnd_key);
+                                       immd_immnd_info_node_delete(cb, 
node_info);
+                                       goto done;
+                               }
+                               TRACE_5("IMMND %x DOWN PROCESS detected by 
IMMD",
+                                   node_info->immnd_key);
                                immd_process_immnd_down(cb, node_info, true);
                        }
                }
diff --git a/src/imm/immnd/immnd.h b/src/imm/immnd/immnd.h
index 7b0818de7..23edf004b 100644
--- a/src/imm/immnd/immnd.h
+++ b/src/imm/immnd/immnd.h
@@ -33,6 +33,7 @@
 #endif
 
 #include "imm/common/immsv.h"
+#include "base/ncssysf_tmr.h"
 #include "immnd_cb.h"
 #include "immnd_init.h"
 
diff --git a/src/imm/immnd/immnd_cb.h b/src/imm/immnd/immnd_cb.h
index 3dc03d88b..bb3bb8493 100644
--- a/src/imm/immnd/immnd_cb.h
+++ b/src/imm/immnd/immnd_cb.h
@@ -207,6 +207,8 @@ typedef struct immnd_cb_tag {
       clm_init_sel_obj; /* Selection object wait for  clms intialization*/
   bool isClmNodeJoined; /* True => If clm joined the cluster*/
   NCS_PATRICIA_TREE immnd_clm_list; /* IMMND_IMM_CLIENT_NODE - node */
+  tmr_t splitbrain_tmr;
+  bool splitbrain_tmr_run;
 } IMMND_CB;
 
 /* CB prototypes */
diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c
index e405d3ce4..dca3f3f63 100644
--- a/src/imm/immnd/immnd_evt.c
+++ b/src/imm/immnd/immnd_evt.c
@@ -10541,6 +10541,10 @@ static uint32_t immnd_evt_proc_intro_rsp(IMMND_CB *cb, 
IMMND_EVT *evt,
                        LOG_IN("2PBE SYNC CASE CAUGHT oldCanBeCoord:%u",
                               oldCanBeCoord);
                }
+               if ((cb->mIntroduced == 2) && (!evt->info.ctrl.isCoord)) {
+                       LOG_WA("Restart to sync with Coord! Exit");
+                       exit(EXIT_SUCCESS);
+               }
                cb->mIntroduced = 1;
                cb->mCanBeCoord = evt->info.ctrl.canBeCoord;
                if ((cb->mCanBeCoord == IMMSV_2PBE_PRELOAD) && (cb->m2Pbe < 2) 
&&
@@ -12202,6 +12206,12 @@ void immnd_evt_ccb_augment_admo(IMMND_CB *cb, 
IMMND_EVT *evt,
        TRACE_LEAVE();
 }
 
+void splitbrain_tmr_exp(void *arg)
+{
+       (void)arg;
+       LOG_ER("Split-brain detected! Rebooting...");
+       opensaf_quick_reboot("Split-brain detected! Rebooting...");
+}
 /****************************************************************************
  * Name          : immnd_evt_proc_mds_evt
  *
@@ -12244,14 +12254,25 @@ static uint32_t immnd_evt_proc_mds_evt(IMMND_CB *cb, 
IMMND_EVT *evt)
                        TRACE_2("IMMD RED_UP EVENT %x role=%d ==> ACT:%x 
SBY:%x",
                            evt->info.mds_info.node_id, evt->info.mds_info.role,
                            cb->immd_node_id, cb->other_immd_id);
-               } else if ((evt->info.mds_info.role == V_DEST_RL_ACTIVE) &&
-                   (cb->immd_node_id != 0) &&
-                   (cb->node_id != cb->immd_node_id)) {
-                       LOG_WA("See two Active IMMD: %x %x, going to headless",
-                           cb->immd_node_id, evt->info.mds_info.node_id);
-                       is_headless = true;
-                       cb->immd_node_id = 0;
-                       cb->other_immd_id = 0;
+               } else if ((cb->immd_node_id != 0) &&
+                           (evt->info.mds_info.role == V_DEST_RL_ACTIVE)) {
+                       if (cb->node_id != cb->immd_node_id) {
+                               LOG_WA("See two Active IMMD: %x %x, going to 
headless",
+                                   cb->immd_node_id, 
evt->info.mds_info.node_id);
+                               is_headless = true;
+                               cb->immd_node_id = 0;
+                               cb->other_immd_id = 0;
+                       } else if (!cb->splitbrain_tmr_run) {
+                               // Normally, RDE will handle split-brain 
detected mechanism
+                               // In roaming SC split/join, sometimes RDE 
don't detect
+                               // split-brain but IMMND does, let start timer 
to reboot node
+                               LOG_WA("Another Active IMMD %x. Start 
split-brain timer",
+                                   evt->info.mds_info.node_id);
+                               cb->splitbrain_tmr = ncs_tmr_start(
+                                   cb->splitbrain_tmr, 1000,  // 10s
+                                   splitbrain_tmr_exp, NULL, NULL, 0);
+                               cb->splitbrain_tmr_run = true;
+                       }
                }
        } else if ((evt->info.mds_info.change == NCSMDS_RED_DOWN) &&
                   (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
diff --git a/src/imm/immnd/immnd_main.c b/src/imm/immnd/immnd_main.c
index 0cd004053..410134c97 100644
--- a/src/imm/immnd/immnd_main.c
+++ b/src/imm/immnd/immnd_main.c
@@ -301,6 +301,8 @@ static uint32_t immnd_initialize(char *progname)
        immnd_cb->clmSelectionObject = -1;
        immnd_cb->immd_node_id = 0;
        immnd_cb->other_immd_id = 0;
+       immnd_cb->splitbrain_tmr = ncs_tmr_alloc(NULL, 0);
+       immnd_cb->splitbrain_tmr_run = false;
 
        populate_reserved_class_names(immnd_cb);
 
-- 
2.17.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to