Few comments: 1. All %d should be converted to %u. 2. " too many times " doesn't look good. LOG_NO("components of '%s' restarted too many times (comp restart count: %d)" should be changed to LOG_NO("components of '%s' restarted exceeds configured '%u' (comp restart count: %u)", su->comp_restart_max, su->name.value, su->comp_restart_cnt);
Other places also need to be changed: LOG_NO("'%s' restarted too many times (SU restart count: %d)", LOG_NO("SU failover performed too many times (SU failover count: %d)", LOG_NO("SU failover performed too many times (SU failover count: %d)", 3. log_recovery_escalation() is not of much use. We already have the below log: LOG_NO(""'%s' faulted due to '%s' : Recovery is '%s'" It could be rephrased as or something like: LOG_NO("'%s' faulted due to '%s' : Configured/Recommended Recovery is '%s' and recovery escalated is %s", comp->name.value, g_comp_err[err_info->src], g_comp_rcvr[previous_esc_rcvr - 1], g_comp_rcvr[esc_rcvr - 1]); Thanks -Nagu > -----Original Message----- > From: Gary Lee [mailto:gary....@dektech.com.au] > Sent: 28 May 2014 13:03 > To: hans.fe...@ericsson.com; hans.nordeb...@ericsson.com; Praveen Malviya; > Nagendra Kumar > Cc: opensaf-devel@lists.sourceforge.net > Subject: [PATCH 1 of 2] amfnd: Improve logging for error escalation decisions > [#870] > > osaf/services/saf/amf/amfnd/err.cc | 77 ++++++++++++++++++++++++- > - > osaf/services/saf/amf/amfnd/include/avnd_cb.h | 2 + > osaf/services/saf/amf/amfnd/include/avnd_su.h | 3 + > osaf/services/saf/amf/amfnd/su.cc | 23 ++++++++ > 4 files changed, 98 insertions(+), 7 deletions(-) > > > * log when SU restart/failover limits have been reached > * log each time a SU restart/failover occurs > * add a log entry when error escalation is changed > * log each time a component is restarted, or when restart > limits have been reached > > diff --git a/osaf/services/saf/amf/amfnd/err.cc > b/osaf/services/saf/amf/amfnd/err.cc > --- a/osaf/services/saf/amf/amfnd/err.cc > +++ b/osaf/services/saf/amf/amfnd/err.cc > @@ -56,6 +56,7 @@ > */ > > #include "avnd.h" > +#include <string> > > /* static function declarations */ > > @@ -77,6 +78,10 @@ static uint32_t avnd_err_restart_esc_lev > static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, > AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); > static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, > AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *); > > +static std::string translate_recovery_method(const uint32_t esc_rcvr); > +static void log_recovery_escalation(const AVND_COMP& su, > + const uint32_t previous_esc_rcvr, const uint32_t esc_rcvr); > + > /* LSB Changes. Strings to represent source of component Error */ > > static const char *g_comp_err[] = { > @@ -301,6 +306,8 @@ uint32_t avnd_err_process(AVND_CB *cb, A > uint32_t esc_rcvr = err_info->rec_rcvr.raw; > uint32_t rc = NCSCC_RC_SUCCESS; > TRACE_ENTER2("Comp:'%s' esc_rcvr:'%u'", comp->name.value, > esc_rcvr); > + > + const uint32_t previous_esc_rcvr = esc_rcvr; > > // Handle errors differently when shutdown has started > if (AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED == cb- > >term_state) { > @@ -363,6 +370,9 @@ uint32_t avnd_err_process(AVND_CB *cb, A > rc = avnd_err_escalate(cb, comp->su, comp, &esc_rcvr); > if (NCSCC_RC_SUCCESS != rc) > goto done; > + > + // add an entry to syslog if recovery method has changed > + log_recovery_escalation(*comp, previous_esc_rcvr, esc_rcvr); > > LOG_NO("'%s' faulted due to '%s' : Recovery is '%s'", > comp->name.value, g_comp_err[err_info->src], > g_comp_rcvr[esc_rcvr - 1]); > @@ -1073,17 +1083,20 @@ uint32_t avnd_err_restart_esc_level_0(AV > goto done; > > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > - su->comp_restart_cnt++; > + su_increment_comp_restart_count(*su); > goto done; > } > > if (su->comp_restart_cnt < su->comp_restart_max) { > - su->comp_restart_cnt++; > + su_increment_comp_restart_count(*su); > goto done; > } > > /* ok! go to next level */ > if (su->comp_restart_cnt >= su->comp_restart_max) { > + LOG_NO("components of '%s' restarted too many times (comp > restart count: %d)", > + su->name.value, su->comp_restart_cnt); > + > /*stop the comp-err-esc-timer */ > m_AVND_TMR_COMP_ERR_ESC_STOP(cb, su); > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > @@ -1166,12 +1179,15 @@ uint32_t avnd_err_restart_esc_level_1(AV > > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > } > - su->su_restart_cnt++; > + su_increment_su_restart_count(*su); > goto done; > } > > /* reached max count */ > if (su->su_restart_cnt >= su->su_restart_max) { > + LOG_NO("'%s' restarted too many times (SU restart count: > %d)", > + su->name.value, su->su_restart_cnt); > + > /* stop timer */ > m_AVND_TMR_SU_ERR_ESC_STOP(cb, su); > m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, > AVND_CKPT_SU_ERR_ESC_TMR); > @@ -1241,17 +1257,20 @@ uint32_t avnd_err_restart_esc_level_2(AV > if (NCSCC_RC_SUCCESS != rc) > goto done; > > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > if (cb->su_failover_cnt < cb->su_failover_max) { > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > /* reached max count */ > if (cb->su_failover_cnt >= cb->su_failover_max) { > + LOG_NO("SU failover performed too many times (SU failover > count: > %d)", > + cb->su_failover_cnt); > + > /* stop timer */ > m_AVND_TMR_NODE_ERR_ESC_STOP(cb); > cb->su_failover_cnt = 0; > @@ -1310,18 +1329,21 @@ AVSV_ERR_RCVR avnd_err_esc_su_failover(A > if (NCSCC_RC_SUCCESS != rc) > goto done; > > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > if (cb->su_failover_cnt < cb->su_failover_max) { > /* This means NODE_ERR_ESC may be already running because > of other SUs escalations.*/ > su->su_err_esc_level = AVND_ERR_ESC_LEVEL_2; > - cb->su_failover_cnt++; > + cb_increment_su_failover_count(*cb, *su); > goto done; > } > > if (cb->su_failover_cnt >= cb->su_failover_max) { > + LOG_NO("SU failover performed too many times (SU failover > count: %d)", > + cb->su_failover_cnt); > + > /* stop timer */ > m_AVND_TMR_NODE_ERR_ESC_STOP(cb); > cb->su_failover_cnt = 0; > @@ -1354,6 +1376,8 @@ uint32_t avnd_evt_tmr_node_err_esc_evh(A > AVND_SU *su; > > TRACE_ENTER(); > + > + LOG_NO("node error escalation timer expired"); > > su = (AVND_SU *)ncs_patricia_tree_getnext(&cb->sudb, (uint8_t *)0); > while (su != 0) { > @@ -1433,3 +1457,42 @@ uint32_t avnd_err_rcvr_node_failfast(AVN > TRACE_LEAVE2("%u", rc); > return rc; > } > + > +void log_recovery_escalation(const AVND_COMP& comp, > + const uint32_t previous_esc_rcvr, > + const uint32_t esc_rcvr) > +{ > + std::string subject; > + > + switch (previous_esc_rcvr) { > + case SA_AMF_NO_RECOMMENDATION: > + case SA_AMF_COMPONENT_RESTART: > + case SA_AMF_COMPONENT_FAILOVER: > + subject = std::string((char*)comp.name.value, > + comp.name.length); > + break; > + default: > + subject = std::string((char*)comp.su->name.value, > + comp.su->name.length); > + break; > + } > + > + if (esc_rcvr != previous_esc_rcvr) { > + LOG_NO("'%s' recovery action escalated from '%s' to '%s'", > + subject.c_str(), > + > translate_recovery_method(previous_esc_rcvr).c_str(), > + translate_recovery_method(esc_rcvr).c_str()); > + } > +} > + > +std::string translate_recovery_method(const uint32_t esc_rcvr) > +{ > + const uint32_t no_of_methods = > sizeof(g_comp_rcvr)/sizeof(g_comp_rcvr[0]); > + > + // valid range is 1 to 'no_of_methods' > + if (esc_rcvr >= 1 && esc_rcvr <= no_of_methods) { > + return g_comp_rcvr[esc_rcvr - 1]; > + } else { > + return "unknown recovery method"; > + } > +} > diff --git a/osaf/services/saf/amf/amfnd/include/avnd_cb.h > b/osaf/services/saf/amf/amfnd/include/avnd_cb.h > --- a/osaf/services/saf/amf/amfnd/include/avnd_cb.h > +++ b/osaf/services/saf/amf/amfnd/include/avnd_cb.h > @@ -153,6 +153,8 @@ typedef struct avnd_cb_tag { > > #define m_AVND_NODE_ERR_ESC_LEVEL_SET(x, val) ((x)- > >node_err_esc_level = (val)) > > +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su); > + > extern AVND_CB *avnd_cb; > > #endif > diff --git a/osaf/services/saf/amf/amfnd/include/avnd_su.h > b/osaf/services/saf/amf/amfnd/include/avnd_su.h > --- a/osaf/services/saf/amf/amfnd/include/avnd_su.h > +++ b/osaf/services/saf/amf/amfnd/include/avnd_su.h > @@ -408,4 +408,7 @@ extern bool sufailover_during_nodeswitch > extern bool all_csis_in_removed_state(const AVND_SU *su); > extern void su_reset_restart_count_in_comps(const AVND_SU *su); > > +void su_increment_su_restart_count(AVND_SU& su); > +void su_increment_comp_restart_count(AVND_SU& su); > + > #endif > diff --git a/osaf/services/saf/amf/amfnd/su.cc > b/osaf/services/saf/amf/amfnd/su.cc > --- a/osaf/services/saf/amf/amfnd/su.cc > +++ b/osaf/services/saf/amf/amfnd/su.cc > @@ -462,6 +462,8 @@ uint32_t avnd_evt_tmr_su_err_esc_evh(AVN > } > > TRACE("'%s'", su->name.value); > + > + LOG_NO("'%s' error escalation timer expired", su->name.value); > > if (NCSCC_RC_SUCCESS == > m_AVND_CHECK_FOR_STDBY_FOR_EXT_COMP(cb, su->su_is_external)) > goto done; > @@ -689,3 +691,24 @@ void su_reset_restart_count_in_comps(con > } > > } > + > +void su_increment_su_restart_count(AVND_SU& su) > +{ > + su.su_restart_cnt++; > + LOG_NO("Restarting '%s' (SU restart count: %d)", > + su.name.value, su.su_restart_cnt); > +} > + > +void su_increment_comp_restart_count(AVND_SU& su) > +{ > + su.comp_restart_cnt++; > + LOG_NO("Restarting a component of '%s' (comp restart count: %d)", > + su.name.value, su.comp_restart_cnt); > +} > + > +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su) > +{ > + cb.su_failover_cnt++; > + LOG_NO("Performing failover of '%s' (SU failover count: %d)", > + su.name.value, cb.su_failover_cnt); > +} ------------------------------------------------------------------------------ Time is money. Stop wasting it! Get your web API in 5 minutes. www.restlet.com/download http://p.sf.net/sfu/restlet _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel