osaf/services/saf/amf/amfnd/err.cc |  80 +++++++++++++++++++-------------------
 1 files changed, 40 insertions(+), 40 deletions(-)


If any error escalates to component/SU failover while the node is headless
(amfd is down), amfnd reboots the node.

The issue is that other healthy SUs are affected by this reboot, which degrades
the availability that AMF is meant to provide.

This patch allows component/SU failover while headless, but supports it only
partially (the comp/SU is marked as failed), since failing over to another
comp/SU requires amfd to be present.

diff --git a/osaf/services/saf/amf/amfnd/err.cc 
b/osaf/services/saf/amf/amfnd/err.cc
--- a/osaf/services/saf/amf/amfnd/err.cc
+++ b/osaf/services/saf/amf/amfnd/err.cc
@@ -77,8 +77,6 @@ static uint32_t avnd_err_restart_esc_lev
 static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, 
AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
 static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, 
AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
 
-static void cleanup_all_comps_and_reboot(AVND_CB *cb);
-
 /* LSB Changes. Strings to represent source of component Error */
 
 static const char *g_comp_err[] = {
@@ -802,9 +800,23 @@ uint32_t avnd_err_rcvr_comp_failover(AVN
                rc = avnd_di_oper_send(cb, failed_comp->su, 
AVSV_ERR_RCVR_SU_FAILOVER);
 
                // if headless, we have to perform the 'failover' without amfd
+               // for now, just terminate all components in the SU
                if (cb->is_avd_down == true) {
-                       // SU failover results in a node failfast if headless 
(not nice)
-                       cleanup_all_comps_and_reboot(cb);
+                       AVND_COMP *comp;
+
+                       LOG_NO("Terminating components of '%s'(abruptly & 
unordered)",su->name.value);
+                       for (comp = 
m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
+                                       comp;
+                                       comp = 
m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node))) {
+                               if (comp->su->su_is_external)
+                                       continue;
+
+                               rc = avnd_comp_clc_fsm_run(cb, comp, 
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
+                               if (NCSCC_RC_SUCCESS != rc) {
+                                       LOG_ER("'%s' termination failed", 
comp->name.value);
+                                       goto done;
+                               }
+                       }
                }
        }
 
@@ -854,11 +866,7 @@ uint32_t avnd_err_rcvr_su_failover(AVND_
                }
                avnd_su_pres_state_set(cb, comp->su, 
SA_AMF_PRESENCE_TERMINATING);
        }
-
 done:
-       if (cb->is_avd_down == true) {
-               cleanup_all_comps_and_reboot(cb);
-       }
 
        TRACE_LEAVE2("%u", rc);
        return rc;
@@ -962,7 +970,30 @@ done:
        // TODO - try to see if we can avoid a reboot & terminate components 
more gracefully
        // if headless, reboot as we can't perform a switchover without amfd
        if (cb->is_avd_down == true) {
-               cleanup_all_comps_and_reboot(cb);
+               /* Unordered cleanup of all local application components */
+               for (comp = (AVND_COMP *)ncs_patricia_tree_getnext(&cb->compdb, 
(uint8_t *)nullptr);
+                         comp != nullptr;
+                         comp = (AVND_COMP *) 
ncs_patricia_tree_getnext(&cb->compdb, (uint8_t *)&comp->name)) {
+
+                       if (comp->su->is_ncs || comp->su->su_is_external)
+                               continue;
+
+                       rc = avnd_comp_clc_fsm_run(cb, comp, 
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
+                       if (rc != NCSCC_RC_SUCCESS) {
+                               LOG_ER("'%s' termination failed", 
comp->name.value);
+                               opensaf_reboot(avnd_cb->node_info.nodeId,
+                                                          (char 
*)avnd_cb->node_info.executionEnvironment.value,
+                                                          "Component 
termination failed at node switchover");
+                               LOG_ER("Exiting (due to comp term failed) to 
aid fast node reboot");
+                               exit(1);
+                       }
+               }
+
+               opensaf_reboot(avnd_cb->node_info.nodeId,
+                       (char *)avnd_cb->node_info.executionEnvironment.value,
+                       "Can't perform node switchover while controllers are 
down. Recovery is node failfast.");
+               LOG_ER("Exiting to aid fast node reboot");
+               exit(1);
        }
 
        TRACE_LEAVE2("%u", rc);
@@ -1598,34 +1629,3 @@ bool is_no_assignment_due_to_escalations
        TRACE_LEAVE2("false");
        return false;
 }
-
-void cleanup_all_comps_and_reboot(AVND_CB *cb)
-{
-       AVND_COMP *comp;
-       uint32_t rc = NCSCC_RC_SUCCESS;
-
-       /* Unordered cleanup of all local application components */
-       for (comp = (AVND_COMP *)ncs_patricia_tree_getnext(&cb->compdb, 
(uint8_t *)nullptr);
-                 comp != nullptr;
-                 comp = (AVND_COMP *) ncs_patricia_tree_getnext(&cb->compdb, 
(uint8_t *)&comp->name)) {
-
-               if (comp->su->is_ncs || comp->su->su_is_external)
-                       continue;
-
-               rc = avnd_comp_clc_fsm_run(cb, comp, 
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
-               if (rc != NCSCC_RC_SUCCESS) {
-                       LOG_ER("'%s' termination failed", comp->name.value);
-                       opensaf_reboot(avnd_cb->node_info.nodeId,
-                                                  (char 
*)avnd_cb->node_info.executionEnvironment.value,
-                                                  "Component termination 
failed at node switchover");
-                       LOG_ER("Exiting (due to comp term failed) to aid fast 
node reboot");
-                       exit(1);
-               }
-       }
-
-       opensaf_reboot(avnd_cb->node_info.nodeId,
-               (char *)avnd_cb->node_info.executionEnvironment.value,
-               "Can't perform recovery while controllers are down. Recovery is 
node failfast.");
-       LOG_ER("Exiting to aid fast node reboot");
-       exit(1);
-}

------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=272487151&iu=/4140
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to