- The IMMND coordinator takes longer to sync because it incorrectly
postpones the sync while waiting for a wrong number of down nodes.
- IMMND should restart after its re-intro has been accepted if it is not
the new coordinator, so that it syncs again with the new coordinator.
- The active IMMD only updates ex-IMMD from the coordinator if that info
exists. It sets ex-IMMD to its own node id when the new coordinator
announces sync.
- IMMND on the active IMMD node starts a split-brain timer to reboot the
node if it sees another active IMMD, instead of rebooting immediately,
to avoid interfering with the RDE split-brain detection mechanism.
- Quick reboot is sometimes not quick, so the active IMMD on the node may
still impact the newly promoted active node. Stop AMFND and kill
AMFD/IMMD before rebooting to avoid any impact.
---
scripts/opensaf_reboot | 5 +++--
src/imm/immd/immd_evt.c | 16 +++++++++++++---
src/imm/immnd/immnd.h | 1 +
src/imm/immnd/immnd_cb.h | 2 ++
src/imm/immnd/immnd_evt.c | 37 +++++++++++++++++++++++++++++--------
src/imm/immnd/immnd_main.c | 2 ++
6 files changed, 50 insertions(+), 13 deletions(-)
diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
index e2a0ca944..8e5bd8c40 100644
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -143,8 +143,9 @@ unset tipc
# If clm cluster reboot requested argument one and two are set but not used,
# argument 3 is set to 1, "safe reboot" request.
if [ "$#" = 0 ]; then
- $icmd pkill -STOP osafamfd
- $icmd pkill -STOP osafimmd
+ $icmd pkill -STOP osafamfnd
+ $icmd pkill -KILL osafamfd
+ $icmd pkill -KILL osafimmd
quick_local_node_reboot
elif [ "$safe_reboot" = 1 ]; then
opensaf_safe_reboot
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 51cc8e4f7..e5f438c1a 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -897,7 +897,8 @@ static void immd_accept_node(IMMD_CB *cb,
IMMD_IMMND_INFO_NODE *node_info,
LOG_NO(
"IMMND coord at %x with ex-IMMD %x",
node_info->immnd_key, node_info->ex_immd_node_id);
- cb->ex_immd_node_id = node_info->ex_immd_node_id;
+ if (node_info->ex_immd_node_id)
+ cb->ex_immd_node_id = node_info->ex_immd_node_id;
}
mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP; /* Mbcp intro to SBY. */
@@ -1253,6 +1254,7 @@ static uint32_t immd_evt_proc_immnd_announce_sync(IMMD_CB
*cb, IMMD_EVT *evt,
Loop through all nodes */
cb->mRulingEpoch++;
+ cb->ex_immd_node_id = cb->node_id;
/*Only updates epoch for coord. */
/*node_info->epoch = cb->mRulingEpoch; */
@@ -1692,7 +1694,7 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb,
IMMD_EVT *evt,
immd_immnd_info_node_get(&cb->immnd_tree, &sinfo->dest, &node_info);
if (!node_info) {
if (evt->info.ctrl_msg.refresh == 3) {
- LOG_WA("Drop re-intro from old IMMND dest %" PRIu64,
sinfo->dest);
+ TRACE("Drop re-intro from old IMMND dest %" PRIu64,
sinfo->dest);
goto done;
}
LOG_WA("Node not found dest %" PRIu64
@@ -3308,7 +3310,15 @@ static uint32_t immd_evt_proc_mds_evt(IMMD_CB *cb,
IMMD_EVT *evt)
mds_info->dest);
goto done;
} else {
- TRACE_5("IMMND DOWN PROCESS detected by IMMD");
+ if (node_info->immnd_execPid == 0) {
+ TRACE_5(
+ "Ignore IMMND %x DOWN not yet
accepted intro",
+ node_info->immnd_key);
+ immd_immnd_info_node_delete(cb,
node_info);
+ goto done;
+ }
+ TRACE_5("IMMND %x DOWN PROCESS detected by
IMMD",
+ node_info->immnd_key);
immd_process_immnd_down(cb, node_info, true);
}
}
diff --git a/src/imm/immnd/immnd.h b/src/imm/immnd/immnd.h
index 7b0818de7..23edf004b 100644
--- a/src/imm/immnd/immnd.h
+++ b/src/imm/immnd/immnd.h
@@ -33,6 +33,7 @@
#endif
#include "imm/common/immsv.h"
+#include "base/ncssysf_tmr.h"
#include "immnd_cb.h"
#include "immnd_init.h"
diff --git a/src/imm/immnd/immnd_cb.h b/src/imm/immnd/immnd_cb.h
index 3dc03d88b..bb3bb8493 100644
--- a/src/imm/immnd/immnd_cb.h
+++ b/src/imm/immnd/immnd_cb.h
@@ -207,6 +207,8 @@ typedef struct immnd_cb_tag {
clm_init_sel_obj; /* Selection object wait for clms intialization*/
bool isClmNodeJoined; /* True => If clm joined the cluster*/
NCS_PATRICIA_TREE immnd_clm_list; /* IMMND_IMM_CLIENT_NODE - node */
+ tmr_t splitbrain_tmr;
+ bool splitbrain_tmr_run;
} IMMND_CB;
/* CB prototypes */
diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c
index e405d3ce4..dca3f3f63 100644
--- a/src/imm/immnd/immnd_evt.c
+++ b/src/imm/immnd/immnd_evt.c
@@ -10541,6 +10541,10 @@ static uint32_t immnd_evt_proc_intro_rsp(IMMND_CB *cb,
IMMND_EVT *evt,
LOG_IN("2PBE SYNC CASE CAUGHT oldCanBeCoord:%u",
oldCanBeCoord);
}
+ if ((cb->mIntroduced == 2) && (!evt->info.ctrl.isCoord)) {
+ LOG_WA("Restart to sync with Coord! Exit");
+ exit(EXIT_SUCCESS);
+ }
cb->mIntroduced = 1;
cb->mCanBeCoord = evt->info.ctrl.canBeCoord;
if ((cb->mCanBeCoord == IMMSV_2PBE_PRELOAD) && (cb->m2Pbe < 2)
&&
@@ -12202,6 +12206,12 @@ void immnd_evt_ccb_augment_admo(IMMND_CB *cb,
IMMND_EVT *evt,
TRACE_LEAVE();
}
+void splitbrain_tmr_exp(void *arg)
+{
+ (void)arg;
+ LOG_ER("Split-brain detected! Rebooting...");
+ opensaf_quick_reboot("Split-brain detected! Rebooting...");
+}
/****************************************************************************
* Name : immnd_evt_proc_mds_evt
*
@@ -12244,14 +12254,25 @@ static uint32_t immnd_evt_proc_mds_evt(IMMND_CB *cb,
IMMND_EVT *evt)
TRACE_2("IMMD RED_UP EVENT %x role=%d ==> ACT:%x
SBY:%x",
evt->info.mds_info.node_id, evt->info.mds_info.role,
cb->immd_node_id, cb->other_immd_id);
- } else if ((evt->info.mds_info.role == V_DEST_RL_ACTIVE) &&
- (cb->immd_node_id != 0) &&
- (cb->node_id != cb->immd_node_id)) {
- LOG_WA("See two Active IMMD: %x %x, going to headless",
- cb->immd_node_id, evt->info.mds_info.node_id);
- is_headless = true;
- cb->immd_node_id = 0;
- cb->other_immd_id = 0;
+ } else if ((cb->immd_node_id != 0) &&
+ (evt->info.mds_info.role == V_DEST_RL_ACTIVE)) {
+ if (cb->node_id != cb->immd_node_id) {
+ LOG_WA("See two Active IMMD: %x %x, going to
headless",
+ cb->immd_node_id,
evt->info.mds_info.node_id);
+ is_headless = true;
+ cb->immd_node_id = 0;
+ cb->other_immd_id = 0;
+ } else if (!cb->splitbrain_tmr_run) {
+ // Normally, RDE will handle split-brain
detected mechanism
+ // In roaming SC split/join, sometimes RDE
don't detect
+ // split-brain but IMMND does, let start timer
to reboot node
+ LOG_WA("Another Active IMMD %x. Start
split-brain timer",
+ evt->info.mds_info.node_id);
+ cb->splitbrain_tmr = ncs_tmr_start(
+ cb->splitbrain_tmr, 1000, // 10s
+ splitbrain_tmr_exp, NULL, NULL, 0);
+ cb->splitbrain_tmr_run = true;
+ }
}
} else if ((evt->info.mds_info.change == NCSMDS_RED_DOWN) &&
(evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
diff --git a/src/imm/immnd/immnd_main.c b/src/imm/immnd/immnd_main.c
index 0cd004053..410134c97 100644
--- a/src/imm/immnd/immnd_main.c
+++ b/src/imm/immnd/immnd_main.c
@@ -301,6 +301,8 @@ static uint32_t immnd_initialize(char *progname)
immnd_cb->clmSelectionObject = -1;
immnd_cb->immd_node_id = 0;
immnd_cb->other_immd_id = 0;
+ immnd_cb->splitbrain_tmr = ncs_tmr_alloc(NULL, 0);
+ immnd_cb->splitbrain_tmr_run = false;
populate_reserved_class_names(immnd_cb);
--
2.17.1
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel