osaf/services/saf/amf/amfnd/err.cc | 80 +++++++++++++++++++------------------- 1 files changed, 40 insertions(+), 40 deletions(-)
If any error escalates to a component/SU failover while the node is headless, amfnd reboots the node. The problem is that other healthy SUs are also affected by this reboot, which degrades the availability that AMF is meant to provide. This patch allows component/SU failover while headless, but supports it only partially (the comp/SU is marked as failed), since failing over to another comp/SU requires amfd's presence. diff --git a/osaf/services/saf/amf/amfnd/err.cc b/osaf/services/saf/amf/amfnd/err.cc --- a/osaf/services/saf/amf/amfnd/err.cc +++ b/osaf/services/saf/amf/amfnd/err.cc @@ -77,8 +77,6 @@ static uint32_t avnd_err_restart_esc_lev static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); -static void cleanup_all_comps_and_reboot(AVND_CB *cb); - /* LSB Changes. Strings to represent source of component Error */ static const char *g_comp_err[] = { @@ -802,9 +800,23 @@ uint32_t avnd_err_rcvr_comp_failover(AVN rc = avnd_di_oper_send(cb, failed_comp->su, AVSV_ERR_RCVR_SU_FAILOVER); // if headless, we have to perform the 'failover' without amfd + // for now, just terminate all components in the SU if (cb->is_avd_down == true) { - // SU failover results in a node failfast if headless (not nice) - cleanup_all_comps_and_reboot(cb); + AVND_COMP *comp; + + LOG_NO("Terminating components of '%s'(abruptly & unordered)",su->name.value); + for (comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list)); + comp; + comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node))) { + if (comp->su->su_is_external) + continue; + + rc = avnd_comp_clc_fsm_run(cb, comp, AVND_COMP_CLC_PRES_FSM_EV_CLEANUP); + if (NCSCC_RC_SUCCESS != rc) { + LOG_ER("'%s' termination failed", comp->name.value); + goto done; + } + } } } @@ -854,11 +866,7 @@ uint32_t avnd_err_rcvr_su_failover(AVND_ } avnd_su_pres_state_set(cb, comp->su, 
SA_AMF_PRESENCE_TERMINATING); } - done: - if (cb->is_avd_down == true) { - cleanup_all_comps_and_reboot(cb); - } TRACE_LEAVE2("%u", rc); return rc; @@ -962,7 +970,30 @@ done: // TODO - try to see if we can avoid a reboot & terminate components more gracefully // if headless, reboot as we can't perform a switchover without amfd if (cb->is_avd_down == true) { - cleanup_all_comps_and_reboot(cb); + /* Unordered cleanup of all local application components */ + for (comp = (AVND_COMP *)ncs_patricia_tree_getnext(&cb->compdb, (uint8_t *)nullptr); + comp != nullptr; + comp = (AVND_COMP *) ncs_patricia_tree_getnext(&cb->compdb, (uint8_t *)&comp->name)) { + + if (comp->su->is_ncs || comp->su->su_is_external) + continue; + + rc = avnd_comp_clc_fsm_run(cb, comp, AVND_COMP_CLC_PRES_FSM_EV_CLEANUP); + if (rc != NCSCC_RC_SUCCESS) { + LOG_ER("'%s' termination failed", comp->name.value); + opensaf_reboot(avnd_cb->node_info.nodeId, + (char *)avnd_cb->node_info.executionEnvironment.value, + "Component termination failed at node switchover"); + LOG_ER("Exiting (due to comp term failed) to aid fast node reboot"); + exit(1); + } + } + + opensaf_reboot(avnd_cb->node_info.nodeId, + (char *)avnd_cb->node_info.executionEnvironment.value, + "Can't perform node switchover while controllers are down. 
Recovery is node failfast."); + LOG_ER("Exiting to aid fast node reboot"); + exit(1); } TRACE_LEAVE2("%u", rc); @@ -1598,34 +1629,3 @@ bool is_no_assignment_due_to_escalations TRACE_LEAVE2("false"); return false; } - -void cleanup_all_comps_and_reboot(AVND_CB *cb) -{ - AVND_COMP *comp; - uint32_t rc = NCSCC_RC_SUCCESS; - - /* Unordered cleanup of all local application components */ - for (comp = (AVND_COMP *)ncs_patricia_tree_getnext(&cb->compdb, (uint8_t *)nullptr); - comp != nullptr; - comp = (AVND_COMP *) ncs_patricia_tree_getnext(&cb->compdb, (uint8_t *)&comp->name)) { - - if (comp->su->is_ncs || comp->su->su_is_external) - continue; - - rc = avnd_comp_clc_fsm_run(cb, comp, AVND_COMP_CLC_PRES_FSM_EV_CLEANUP); - if (rc != NCSCC_RC_SUCCESS) { - LOG_ER("'%s' termination failed", comp->name.value); - opensaf_reboot(avnd_cb->node_info.nodeId, - (char *)avnd_cb->node_info.executionEnvironment.value, - "Component termination failed at node switchover"); - LOG_ER("Exiting (due to comp term failed) to aid fast node reboot"); - exit(1); - } - } - - opensaf_reboot(avnd_cb->node_info.nodeId, - (char *)avnd_cb->node_info.executionEnvironment.value, - "Can't perform recovery while controllers are down. Recovery is node failfast."); - LOG_ER("Exiting to aid fast node reboot"); - exit(1); -} ------------------------------------------------------------------------------ Site24x7 APM Insight: Get Deep Visibility into Application Performance APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month Monitor end-to-end web transactions and take corrective actions now Troubleshoot faster and improve end-user experience. Signup Now! http://pubads.g.doubleclick.net/gampad/clk?id=272487151&iu=/4140 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel