- Quick reboot is sometimes not quick, because RDE keeps running and
causes split-brain detection on another SC. The director services
need to be killed to avoid impacting other SCs.

- The active IMMD pauses itself if it sees another active IMMD. The node
will then be rebooted by RDE or by the split-brain timer of the local IMMND.

- Improve log messages to avoid confusion about whether an intro/re-intro
was accepted or only an epoch update took place.
---
 scripts/opensaf_reboot  | 10 ++++++---
 src/imm/immd/immd_evt.c | 47 ++++++++++++++++++++++++++---------------
 2 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
index 8e5bd8c40..5fbb1dd54 100644
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -107,6 +107,13 @@ quick_local_node_reboot()
 {
        logger -t "opensaf_reboot" "Do quick local node reboot"
 
+       for service in osafamfnd osafimmnd; do
+               $icmd pkill -STOP $service
+       done
+       for service in osafrded osafamfd osafimmd osaflogd osafntfd osafclmd; do
+               $icmd pkill -KILL $service
+       done
+
        $icmd /bin/sh -c "/bin/echo -n 'b' 2> /dev/null > /proc/sysrq-trigger"
        ret_code=$?
 
@@ -143,9 +150,6 @@ unset tipc
 # If clm cluster reboot requested argument one and two are set but not used,
 # argument 3 is set to 1, "safe reboot" request.
 if [ "$#" = 0 ]; then
-       $icmd pkill -STOP osafamfnd
-       $icmd pkill -KILL osafamfd
-       $icmd pkill -KILL osafimmd
        quick_local_node_reboot
 elif [ "$safe_reboot" = 1 ]; then
        opensaf_safe_reboot
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 297761d13..eb579c489 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -785,13 +785,15 @@ static void immd_kill_node(IMMD_CB *cb, 
IMMD_IMMND_INFO_NODE *node_info)
 static uint16_t accepted_nodes = 0;
 
 static void immd_accept_node(IMMD_CB *cb, IMMD_IMMND_INFO_NODE *node_info,
-                            bool doReply, bool knownVeteran, bool 
check_ex_immd_node_id)
+                            bool doReply, bool knownVeteran, bool 
check_ex_immd)
 {
        IMMSV_EVT accept_evt;
        IMMD_MBCSV_MSG mbcp_msg;
        bool isOnController = node_info->isOnController;
        bool fsParamMbcp = false;
-       TRACE_ENTER();
+       TRACE_ENTER2(
+           "Accept IMMND %x doReply=%d knownVeteran=%d check_ex_immd=%d",
+           node_info->immnd_key, doReply, knownVeteran, check_ex_immd);
 
        memset(&accept_evt, 0, sizeof(IMMSV_EVT));
        memset(&mbcp_msg, 0, sizeof(IMMD_MBCSV_MSG));
@@ -799,9 +801,6 @@ static void immd_accept_node(IMMD_CB *cb, 
IMMD_IMMND_INFO_NODE *node_info,
                ++accepted_nodes;
        }
 
-       LOG_NO(
-           "Accept intro from %x with ex-IMMD %x",
-           node_info->immnd_key, node_info->ex_immd_node_id);
        accept_evt.type = IMMSV_EVT_TYPE_IMMND;
        accept_evt.info.immnd.type = IMMND_EVT_D2ND_INTRO_RSP;
        accept_evt.info.immnd.info.ctrl.nodeId = node_info->immnd_key;
@@ -844,7 +843,7 @@ static void immd_accept_node(IMMD_CB *cb, 
IMMD_IMMND_INFO_NODE *node_info,
                                cb->immnd_coord = node_info->immnd_key;
                                node_info->isCoord = true;
                        } else if (cb->mScAbsenceAllowed && doReply) {
-                               if ((check_ex_immd_node_id) &&
+                               if ((check_ex_immd) &&
                                    (cb->node_id == node_info->immnd_key)) {
                                        LOG_NO(
                                            "IMMND re-introduce to IMMD on same 
this node. "
@@ -897,13 +896,13 @@ static void immd_accept_node(IMMD_CB *cb, 
IMMD_IMMND_INFO_NODE *node_info,
                LOG_NO(
                    "IMMND coord at %x with ex-IMMD %x",
                    node_info->immnd_key, node_info->ex_immd_node_id);
-               if (check_ex_immd_node_id && node_info->ex_immd_node_id)
+               if (check_ex_immd && node_info->ex_immd_node_id)
                        cb->ex_immd_node_id = node_info->ex_immd_node_id;
        }
 
        mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP; /* Mbcp intro to SBY. */
        mbcp_msg.info.ctrl = accept_evt.info.immnd.info.ctrl;
-       if (check_ex_immd_node_id) {
+       if (check_ex_immd) {
                mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP_2;
                mbcp_msg.info.ctrl.ex_immd_node_id = node_info->ex_immd_node_id;
        }
@@ -949,10 +948,19 @@ static void immd_accept_node(IMMD_CB *cb, 
IMMD_IMMND_INFO_NODE *node_info,
                            .canBeCoord = IMMSV_VETERAN_COORD;
                        /* Allow all nodes including payloads to be coord */
 
-                       if (check_ex_immd_node_id &&
-                           !is_on_same_partition_with_coord(cb, node_info)) {
-                               LOG_WA("Going to reboot node 0x%x", 
node_info->immnd_key);
-                               accept_evt.info.immnd.info.ctrl.canBeCoord = 
IMMSV_UNKNOWN;
+                       if (check_ex_immd) {
+                               if (!is_on_same_partition_with_coord(cb, 
node_info)) {
+                                       LOG_WA(
+                                           "Going to reboot node 0x%x", 
node_info->immnd_key);
+                                       accept_evt.info.immnd.info.ctrl
+                                           .canBeCoord = IMMSV_UNKNOWN;
+                               } else {
+                                       LOG_NO(
+                                           "Accept re-intro from %x with 
ex-IMMD %x",
+                                           node_info->immnd_key, 
node_info->ex_immd_node_id);
+                               }
+                       } else {
+                               LOG_NO("Accept intro from %x", 
node_info->immnd_key);
                        }
 
                        accept_evt.info.immnd.info.ctrl.ndExecPid =
@@ -3253,11 +3261,16 @@ static uint32_t immd_evt_proc_mds_evt(IMMD_CB *cb, 
IMMD_EVT *evt)
                        }
                } else if ((mds_info->node_id != cb->immd_self_id) &&
                            (mds_info->node_id != cb->immd_remote_id) &&
-                           (mds_info->role == V_DEST_RL_ACTIVE) &&
-                           (cb->ha_state == SA_AMF_HA_STANDBY)) {
-                       LOG_ER("Standby peer see two peers: %x and %x",
-                           cb->immd_remote_id, mds_info->node_id);
-                       opensaf_reboot(0, NULL, "Standby peer see two peers");
+                           (mds_info->role == V_DEST_RL_ACTIVE)) {
+                       if (cb->ha_state == SA_AMF_HA_STANDBY) {
+                               LOG_ER("Standby peer see two peers: %x and %x",
+                                       cb->immd_remote_id, mds_info->node_id);
+                               opensaf_reboot(0, NULL, "Standby peer see two 
peers");
+                       } else if (cb->ha_state == SA_AMF_HA_ACTIVE) {
+                               // Node will be ordered reboot by RDE or IMMND
+                               LOG_WA("Another Active IMMD %x", 
mds_info->node_id);
+                               for (;;) pause();
+                       }
                }
                break;
 
-- 
2.25.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to