Hi, I think this is a good start. Some minor comments inline. Thanks, Hans > -----Original Message----- > From: Gary Lee [mailto:gary....@dektech.com.au] > Sent: den 26 maj 2014 05:46 > To: Hans Feldt; Hans Nordebäck; praveen.malv...@oracle.com; > nagendr...@oracle.com > Cc: opensaf-devel@lists.sourceforge.net > Subject: [PATCH 1 of 2] amfnd: Improve logging for error escalation decisions > [#870] > > osaf/services/saf/amf/amfnd/err.cc | 61 > +++++++++++++++++++++++--- > osaf/services/saf/amf/amfnd/include/avnd_cb.h | 2 + > osaf/services/saf/amf/amfnd/include/avnd_su.h | 3 + > osaf/services/saf/amf/amfnd/su.cc | 23 ++++++++++ > 4 files changed, 82 insertions(+), 7 deletions(-) > > > * log when SU restart/failover limits have been reached > * log each time a SU restart/failover occurs > * add a log entry when error escalation is changed > * log each time a component is restarted, or when restart > limits have been reached > > diff --git a/osaf/services/saf/amf/amfnd/err.cc > b/osaf/services/saf/amf/amfnd/err.cc > --- a/osaf/services/saf/amf/amfnd/err.cc > +++ b/osaf/services/saf/amf/amfnd/err.cc > @@ -56,6 +56,7 @@ > */ > > #include "avnd.h" > +#include <string> > > /* static function declarations */ > > @@ -77,6 +78,10 @@ static uint32_t avnd_err_restart_esc_lev > static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, > AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); > static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, > AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); > > +std::string avnd_translate_recovery_method(const uint32_t esc_rcvr); > +void avnd_log_recovery_escalation(const AVND_SU& su, [Hans] drop the avnd_ prefix thx, not needed and no longer a valid acronym
> + const uint32_t previous_esc_rcvr, const uint32_t esc_rcvr); > + > /* LSB Changes. Strings to represent source of component Error */ > > static const char *g_comp_err[] = { > @@ -301,6 +306,8 @@ uint32_t avnd_err_process(AVND_CB *cb, A > uint32_t esc_rcvr = err_info->rec_rcvr.raw; > uint32_t rc = NCSCC_RC_SUCCESS; > TRACE_ENTER2("Comp:'%s' esc_rcvr:'%u'", comp->name.value, esc_rcvr); > + > + const uint32_t previous_esc_rcvr = esc_rcvr; > > // Handle errors differently when shutdown has started > if (AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED == cb->term_state) { > @@ -363,6 +370,9 @@ uint32_t avnd_err_process(AVND_CB *cb, A > rc = avnd_err_escalate(cb, comp->su, comp, &esc_rcvr); > if (NCSCC_RC_SUCCESS != rc) > goto done; > + > + // add an entry to syslog if recovery method has changed > + avnd_log_recovery_escalation(*(comp->su), previous_esc_rcvr, esc_rcvr); > > LOG_NO("'%s' faulted due to '%s' : Recovery is '%s'", > comp->name.value, g_comp_err[err_info->src], > g_comp_rcvr[esc_rcvr - 1]); > @@ -1073,17 +1083,20 @@ uint32_t avnd_err_restart_esc_level_0(AV > goto done; > > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > - su->comp_restart_cnt++; > + su_increment_comp_restart_count(*su); > goto done; > } > > if (su->comp_restart_cnt < su->comp_restart_max) { > - su->comp_restart_cnt++; > + su_increment_comp_restart_count(*su); > goto done; > } > > /* ok! go to next level */ > if (su->comp_restart_cnt >= su->comp_restart_max) { > + LOG_NO("components of '%s' restarted too many times (%d)", > + su->name.value, su->comp_restart_cnt); > + > /*stop the comp-err-esc-timer */ > m_AVND_TMR_COMP_ERR_ESC_STOP(cb, su); > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > @@ -1166,12 +1179,15 @@ uint32_t avnd_err_restart_esc_level_1(AV > > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > } > - su->su_restart_cnt++; > + su_increment_su_restart_count(*su); > goto done; > } > > /* reached max count */ > if (su->su_restart_cnt >= su->su_restart_max) { > + LOG_NO("'%s' restarted too many times (%d)", > + su->name.value, su->su_restart_cnt); > + > /* stop timer */ > m_AVND_TMR_SU_ERR_ESC_STOP(cb, su); > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > @@ -1241,17 +1257,20 @@ uint32_t avnd_err_restart_esc_level_2(AV > if (NCSCC_RC_SUCCESS != rc) > goto done; > > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > if (cb->su_failover_cnt < cb->su_failover_max) { > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > /* reached max count */ > if (cb->su_failover_cnt >= cb->su_failover_max) { > + LOG_NO("SU failover performed too many times (%d)", > + cb->su_failover_cnt); > + > /* stop timer */ > m_AVND_TMR_NODE_ERR_ESC_STOP(cb); > cb->su_failover_cnt = 0; > @@ -1310,18 +1329,21 @@ AVSV_ERR_RCVR avnd_err_esc_su_failover(A > if (NCSCC_RC_SUCCESS != rc) > goto done; > > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > if (cb->su_failover_cnt < cb->su_failover_max) { > /* This means NODE_ERR_ESC may be already running because of > other SUs escalations.*/ > su->su_err_esc_level = AVND_ERR_ESC_LEVEL_2; > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > if (cb->su_failover_cnt >= cb->su_failover_max) { > + LOG_NO("SU failover performed too many times (%d)", > + cb->su_failover_cnt); > + > /* stop timer */ > m_AVND_TMR_NODE_ERR_ESC_STOP(cb); > cb->su_failover_cnt = 0; > @@ -1354,6 +1376,8 @@ uint32_t avnd_evt_tmr_node_err_esc_evh(A > AVND_SU *su; > > TRACE_ENTER(); > + > + LOG_NO("node error escalation timer expired"); > > su = (AVND_SU *)ncs_patricia_tree_getnext(&cb->sudb, (uint8_t *)0); > while (su != 0) { > @@ -1433,3 +1457,26 @@ uint32_t avnd_err_rcvr_node_failfast(AVN > TRACE_LEAVE2("%u", rc); > return rc; > } > + > +void avnd_log_recovery_escalation(const AVND_SU& su, > + const uint32_t previous_esc_rcvr, > + const uint32_t esc_rcvr) > +{ > + if (esc_rcvr != previous_esc_rcvr) { > + LOG_NO("'%s' recovery action escalated from '%s' to '%s'", > + su.name.value, [Hans] can we really log SU name here? Is it valid in all cases? > + > avnd_translate_recovery_method(previous_esc_rcvr).c_str(), > + avnd_translate_recovery_method(esc_rcvr).c_str()); > + } > +} > + > +std::string avnd_translate_recovery_method(const uint32_t esc_rcvr) [Hans] static and drop avnd_ prefix > +{ > + const uint32_t no_of_methods = > sizeof(g_comp_rcvr)/sizeof(g_comp_rcvr[0]); > + > + if (esc_rcvr <= no_of_methods) { > + return g_comp_rcvr[esc_rcvr - 1]; > + } else { > + return "unknown recovery method"; > + } > +} > diff --git a/osaf/services/saf/amf/amfnd/include/avnd_cb.h > b/osaf/services/saf/amf/amfnd/include/avnd_cb.h > --- a/osaf/services/saf/amf/amfnd/include/avnd_cb.h > +++ b/osaf/services/saf/amf/amfnd/include/avnd_cb.h > @@ -153,6 +153,8 @@ typedef struct avnd_cb_tag { > > #define m_AVND_NODE_ERR_ESC_LEVEL_SET(x, val) ((x)->node_err_esc_level = > (val)) > > +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su); > + > extern AVND_CB *avnd_cb; > > #endif > diff --git a/osaf/services/saf/amf/amfnd/include/avnd_su.h > b/osaf/services/saf/amf/amfnd/include/avnd_su.h > --- a/osaf/services/saf/amf/amfnd/include/avnd_su.h > +++ b/osaf/services/saf/amf/amfnd/include/avnd_su.h > @@ -408,4 +408,7 @@ extern bool sufailover_during_nodeswitch > extern bool all_csis_in_removed_state(const AVND_SU *su); > extern void su_reset_restart_count_in_comps(const AVND_SU *su); > > +void su_increment_su_restart_count(AVND_SU& su); > +void su_increment_comp_restart_count(AVND_SU& su); > + > #endif > diff --git a/osaf/services/saf/amf/amfnd/su.cc > b/osaf/services/saf/amf/amfnd/su.cc > --- a/osaf/services/saf/amf/amfnd/su.cc > +++ b/osaf/services/saf/amf/amfnd/su.cc > @@ -462,6 +462,8 @@ uint32_t avnd_evt_tmr_su_err_esc_evh(AVN > } > > TRACE("'%s'", su->name.value); > + > + LOG_NO("'%s' error escalation timer expired", su->name.value) > > if (NCSCC_RC_SUCCESS == m_AVND_CHECK_FOR_STDBY_FOR_EXT_COMP(cb, > su->su_is_external)) > goto done; > @@ -689,3 +691,24 @@ void su_reset_restart_count_in_comps(con > } > > } > + > +void su_increment_su_restart_count(AVND_SU& su) > +{ > + su.su_restart_cnt++; > + LOG_NO("Restarting '%s' (%d)", [Hans] or "Restarting SU '%s' for the %d time" > + su.name.value, su.su_restart_cnt); > +} > + > +void su_increment_comp_restart_count(AVND_SU& su) > +{ > + su.comp_restart_cnt++; > + LOG_NO("Restarting a component of '%s' (%d)", [Hans] or "Restarting Comp '%s' for the %d time" > + su.name.value, su.comp_restart_cnt); > +} > + > +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su) > +{ > + cb.su_failover_cnt++; > + LOG_NO("Performing failover of '%s' (%d)", [Hans] please add a qualifier to %d; e.g. "failover cnt:%d" > + su.name.value, cb.su_failover_cnt); > +} ------------------------------------------------------------------------------ The best possible search technologies are now affordable for all companies. Download your FREE open source Enterprise Search Engine today! Our experts will assist you in its installation for $59/mo, no commitment. Test it for FREE on our Cloud platform anytime! http://pubads.g.doubleclick.net/gampad/clk?id=145328191&iu=/4140/ostg.clktrk _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel