osaf/services/saf/amf/amfnd/err.cc | 61 +++++++++++++++++++++++--- osaf/services/saf/amf/amfnd/include/avnd_cb.h | 2 + osaf/services/saf/amf/amfnd/include/avnd_su.h | 3 + osaf/services/saf/amf/amfnd/su.cc | 23 ++++++++++ 4 files changed, 82 insertions(+), 7 deletions(-)
* log when SU restart/failover limits have been reached * log each time a SU restart/failover occurs * add a log entry when error escalation is changed * log each time a component is restarted, or when restart limits have been reached diff --git a/osaf/services/saf/amf/amfnd/err.cc b/osaf/services/saf/amf/amfnd/err.cc --- a/osaf/services/saf/amf/amfnd/err.cc +++ b/osaf/services/saf/amf/amfnd/err.cc @@ -56,6 +56,7 @@ */ #include "avnd.h" +#include <string> /* static function declarations */ @@ -77,6 +78,10 @@ static uint32_t avnd_err_restart_esc_lev static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); +std::string avnd_translate_recovery_method(const uint32_t esc_rcvr); +void avnd_log_recovery_escalation(const AVND_SU& su, + const uint32_t previous_esc_rcvr, const uint32_t esc_rcvr); + /* LSB Changes. Strings to represent source of component Error */ static const char *g_comp_err[] = { @@ -301,6 +306,8 @@ uint32_t avnd_err_process(AVND_CB *cb, A uint32_t esc_rcvr = err_info->rec_rcvr.raw; uint32_t rc = NCSCC_RC_SUCCESS; TRACE_ENTER2("Comp:'%s' esc_rcvr:'%u'", comp->name.value, esc_rcvr); + + const uint32_t previous_esc_rcvr = esc_rcvr; // Handle errors differently when shutdown has started if (AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED == cb->term_state) { @@ -363,6 +370,9 @@ uint32_t avnd_err_process(AVND_CB *cb, A rc = avnd_err_escalate(cb, comp->su, comp, &esc_rcvr); if (NCSCC_RC_SUCCESS != rc) goto done; + + // add an entry to syslog if recovery method has changed + avnd_log_recovery_escalation(*(comp->su), previous_esc_rcvr, esc_rcvr); LOG_NO("'%s' faulted due to '%s' : Recovery is '%s'", comp->name.value, g_comp_err[err_info->src], g_comp_rcvr[esc_rcvr - 1]); @@ -1073,17 +1083,20 @@ uint32_t avnd_err_restart_esc_level_0(AV goto done; m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, AVND_CKPT_SU_ERR_ESC_TMR); - su->comp_restart_cnt++; + su_increment_comp_restart_count(*su); goto done; } if (su->comp_restart_cnt < su->comp_restart_max) { - su->comp_restart_cnt++; + su_increment_comp_restart_count(*su); goto done; } /* ok! go to next level */ if (su->comp_restart_cnt >= su->comp_restart_max) { + LOG_NO("components of '%s' restarted too many times (%d)", + su->name.value, su->comp_restart_cnt); + /*stop the comp-err-esc-timer */ m_AVND_TMR_COMP_ERR_ESC_STOP(cb, su); m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, AVND_CKPT_SU_ERR_ESC_TMR); @@ -1166,12 +1179,15 @@ uint32_t avnd_err_restart_esc_level_1(AV m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, AVND_CKPT_SU_ERR_ESC_TMR); } - su->su_restart_cnt++; + su_increment_su_restart_count(*su); goto done; } /* reached max count */ if (su->su_restart_cnt >= su->su_restart_max) { + LOG_NO("'%s' restarted too many times (%d)", + su->name.value, su->su_restart_cnt); + /* stop timer */ m_AVND_TMR_SU_ERR_ESC_STOP(cb, su); m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, AVND_CKPT_SU_ERR_ESC_TMR); @@ -1241,17 +1257,20 @@ uint32_t avnd_err_restart_esc_level_2(AV if (NCSCC_RC_SUCCESS != rc) goto done; - cb->su_failover_cnt++; + cb_increment_su_failover_count(*cb, *su); goto done; } if (cb->su_failover_cnt < cb->su_failover_max) { - cb->su_failover_cnt++; + cb_increment_su_failover_count(*cb, *su); goto done; } /* reached max count */ if (cb->su_failover_cnt >= cb->su_failover_max) { + LOG_NO("SU failover performed too many times (%d)", + cb->su_failover_cnt); + /* stop timer */ m_AVND_TMR_NODE_ERR_ESC_STOP(cb); cb->su_failover_cnt = 0; @@ -1310,18 +1329,21 @@ AVSV_ERR_RCVR avnd_err_esc_su_failover(A if (NCSCC_RC_SUCCESS != rc) goto done; - cb->su_failover_cnt++; + cb_increment_su_failover_count(*cb, *su); goto done; } if (cb->su_failover_cnt < cb->su_failover_max) { /* This means NODE_ERR_ESC may be already running because of other SUs escalations.*/ su->su_err_esc_level = AVND_ERR_ESC_LEVEL_2; - cb->su_failover_cnt++; + cb_increment_su_failover_count(*cb, *su); goto done; } if (cb->su_failover_cnt >= cb->su_failover_max) { + LOG_NO("SU failover performed too many times (%d)", + cb->su_failover_cnt); + /* stop timer */ m_AVND_TMR_NODE_ERR_ESC_STOP(cb); cb->su_failover_cnt = 0; @@ -1354,6 +1376,8 @@ uint32_t avnd_evt_tmr_node_err_esc_evh(A AVND_SU *su; TRACE_ENTER(); + + LOG_NO("node error escalation timer expired"); su = (AVND_SU *)ncs_patricia_tree_getnext(&cb->sudb, (uint8_t *)0); while (su != 0) { @@ -1433,3 +1457,26 @@ uint32_t avnd_err_rcvr_node_failfast(AVN TRACE_LEAVE2("%u", rc); return rc; } + +void avnd_log_recovery_escalation(const AVND_SU& su, + const uint32_t previous_esc_rcvr, + const uint32_t esc_rcvr) +{ + if (esc_rcvr != previous_esc_rcvr) { + LOG_NO("'%s' recovery action escalated from '%s' to '%s'", + su.name.value, + avnd_translate_recovery_method(previous_esc_rcvr).c_str(), + avnd_translate_recovery_method(esc_rcvr).c_str()); + } +} + +std::string avnd_translate_recovery_method(const uint32_t esc_rcvr) +{ + const uint32_t no_of_methods = sizeof(g_comp_rcvr)/sizeof(g_comp_rcvr[0]); + + if (esc_rcvr <= no_of_methods) { + return g_comp_rcvr[esc_rcvr - 1]; + } else { + return "unknown recovery method"; + } +} diff --git a/osaf/services/saf/amf/amfnd/include/avnd_cb.h b/osaf/services/saf/amf/amfnd/include/avnd_cb.h --- a/osaf/services/saf/amf/amfnd/include/avnd_cb.h +++ b/osaf/services/saf/amf/amfnd/include/avnd_cb.h @@ -153,6 +153,8 @@ typedef struct avnd_cb_tag { #define m_AVND_NODE_ERR_ESC_LEVEL_SET(x, val) ((x)->node_err_esc_level = (val)) +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su); + extern AVND_CB *avnd_cb; #endif diff --git a/osaf/services/saf/amf/amfnd/include/avnd_su.h b/osaf/services/saf/amf/amfnd/include/avnd_su.h --- a/osaf/services/saf/amf/amfnd/include/avnd_su.h +++ b/osaf/services/saf/amf/amfnd/include/avnd_su.h @@ -408,4 +408,7 @@ extern bool sufailover_during_nodeswitch extern bool all_csis_in_removed_state(const AVND_SU *su); extern void su_reset_restart_count_in_comps(const AVND_SU *su); +void su_increment_su_restart_count(AVND_SU& su); +void su_increment_comp_restart_count(AVND_SU& su); + #endif diff --git a/osaf/services/saf/amf/amfnd/su.cc b/osaf/services/saf/amf/amfnd/su.cc --- a/osaf/services/saf/amf/amfnd/su.cc +++ b/osaf/services/saf/amf/amfnd/su.cc @@ -462,6 +462,8 @@ uint32_t avnd_evt_tmr_su_err_esc_evh(AVN } TRACE("'%s'", su->name.value); + + LOG_NO("'%s' error escalation timer expired", su->name.value) if (NCSCC_RC_SUCCESS == m_AVND_CHECK_FOR_STDBY_FOR_EXT_COMP(cb, su->su_is_external)) goto done; @@ -689,3 +691,24 @@ void su_reset_restart_count_in_comps(con } } + +void su_increment_su_restart_count(AVND_SU& su) +{ + su.su_restart_cnt++; + LOG_NO("Restarting '%s' (%d)", + su.name.value, su.su_restart_cnt); +} + +void su_increment_comp_restart_count(AVND_SU& su) +{ + su.comp_restart_cnt++; + LOG_NO("Restarting a component of '%s' (%d)", + su.name.value, su.comp_restart_cnt); +} + +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su) +{ + cb.su_failover_cnt++; + LOG_NO("Performing failover of '%s' (%d)", + su.name.value, cb.su_failover_cnt); +} ------------------------------------------------------------------------------ The best possible search technologies are now affordable for all companies. Download your FREE open source Enterprise Search Engine today! Our experts will assist you in its installation for $59/mo, no commitment. Test it for FREE on our Cloud platform anytime! http://pubads.g.doubleclick.net/gampad/clk?id=145328191&iu=/4140/ostg.clktrk _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel