Few comments:

1. All %d should be converted to %u.
2. " too many times " doesn't look good.
LOG_NO("components of '%s' restarted too many times (comp restart count: %d)" 
should be changed to 
    LOG_NO("components of '%s' restarted exceeds configured '%u' (comp restart 
count: %u)",
        su->comp_restart_max, su->name.value, su->comp_restart_cnt);

Other places also need to be changed:
LOG_NO("'%s' restarted too many times (SU restart count: %d)",
LOG_NO("SU failover performed too many times (SU failover count: %d)",
LOG_NO("SU failover performed too many times (SU failover count: %d)",

3. log_recovery_escalation() is not of much use. We already have the below log:
LOG_NO(""'%s' faulted due to '%s' : Recovery is '%s'"

It could be rephrased as or something like:
        LOG_NO("'%s' faulted due to '%s' : Configured/Recommended Recovery is 
'%s' and recovery escalated is %s",
                comp->name.value, g_comp_err[err_info->src], 
g_comp_rcvr[previous_esc_rcvr - 1], g_comp_rcvr[esc_rcvr - 1]);

Thanks
-Nagu

> -----Original Message-----
> From: Gary Lee [mailto:gary....@dektech.com.au]
> Sent: 28 May 2014 13:03
> To: hans.fe...@ericsson.com; hans.nordeb...@ericsson.com; Praveen Malviya;
> Nagendra Kumar
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: [PATCH 1 of 2] amfnd: Improve logging for error escalation decisions
> [#870]
> 
>  osaf/services/saf/amf/amfnd/err.cc            |  77 ++++++++++++++++++++++++-
> -
>  osaf/services/saf/amf/amfnd/include/avnd_cb.h |   2 +
>  osaf/services/saf/amf/amfnd/include/avnd_su.h |   3 +
>  osaf/services/saf/amf/amfnd/su.cc             |  23 ++++++++
>  4 files changed, 98 insertions(+), 7 deletions(-)
> 
> 
> * log when SU restart/failover limits have been reached
> * log each time a SU restart/failover occurs
> * add a log entry when error escalation is changed
> * log each time a component is restarted, or when restart
>   limits have been reached
> 
> diff --git a/osaf/services/saf/amf/amfnd/err.cc
> b/osaf/services/saf/amf/amfnd/err.cc
> --- a/osaf/services/saf/amf/amfnd/err.cc
> +++ b/osaf/services/saf/amf/amfnd/err.cc
> @@ -56,6 +56,7 @@
>  */
> 
>  #include "avnd.h"
> +#include <string>
> 
>  /* static function declarations */
> 
> @@ -77,6 +78,10 @@ static uint32_t avnd_err_restart_esc_lev
>  static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *,
> AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
>  static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *,
> AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
> 
> +static std::string translate_recovery_method(const uint32_t esc_rcvr);
> +static void log_recovery_escalation(const AVND_COMP& su,
> +     const uint32_t previous_esc_rcvr, const uint32_t esc_rcvr);
> +
>  /* LSB Changes. Strings to represent source of component Error */
> 
>  static const char *g_comp_err[] = {
> @@ -301,6 +306,8 @@ uint32_t avnd_err_process(AVND_CB *cb, A
>       uint32_t esc_rcvr = err_info->rec_rcvr.raw;
>       uint32_t rc = NCSCC_RC_SUCCESS;
>       TRACE_ENTER2("Comp:'%s' esc_rcvr:'%u'", comp->name.value,
> esc_rcvr);
> +
> +     const uint32_t previous_esc_rcvr = esc_rcvr;
> 
>       // Handle errors differently when shutdown has started
>       if (AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED == cb-
> >term_state) {
> @@ -363,6 +370,9 @@ uint32_t avnd_err_process(AVND_CB *cb, A
>       rc = avnd_err_escalate(cb, comp->su, comp, &esc_rcvr);
>       if (NCSCC_RC_SUCCESS != rc)
>               goto done;
> +
> +     // add an entry to syslog if recovery method has changed
> +     log_recovery_escalation(*comp, previous_esc_rcvr, esc_rcvr);
> 
>       LOG_NO("'%s' faulted due to '%s' : Recovery is '%s'",
>               comp->name.value, g_comp_err[err_info->src],
> g_comp_rcvr[esc_rcvr - 1]);
> @@ -1073,17 +1083,20 @@ uint32_t avnd_err_restart_esc_level_0(AV
>                       goto done;
> 
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su,
> AVND_CKPT_SU_ERR_ESC_TMR);
> -             su->comp_restart_cnt++;
> +             su_increment_comp_restart_count(*su);
>               goto done;
>       }
> 
>       if (su->comp_restart_cnt < su->comp_restart_max) {
> -             su->comp_restart_cnt++;
> +             su_increment_comp_restart_count(*su);
>               goto done;
>       }
> 
>       /* ok! go to next level */
>       if (su->comp_restart_cnt >= su->comp_restart_max) {
> +             LOG_NO("components of '%s' restarted too many times (comp
> restart count: %d)",
> +                     su->name.value, su->comp_restart_cnt);
> +
>               /*stop the comp-err-esc-timer */
>               m_AVND_TMR_COMP_ERR_ESC_STOP(cb, su);
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su,
> AVND_CKPT_SU_ERR_ESC_TMR);
> @@ -1166,12 +1179,15 @@ uint32_t avnd_err_restart_esc_level_1(AV
> 
>                       m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su,
> AVND_CKPT_SU_ERR_ESC_TMR);
>               }
> -             su->su_restart_cnt++;
> +             su_increment_su_restart_count(*su);
>               goto done;
>       }
> 
>       /* reached max count */
>       if (su->su_restart_cnt >= su->su_restart_max) {
> +             LOG_NO("'%s' restarted too many times (SU restart count:
> %d)",
> +                     su->name.value, su->su_restart_cnt);
> +
>               /* stop timer */
>               m_AVND_TMR_SU_ERR_ESC_STOP(cb, su);
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su,
> AVND_CKPT_SU_ERR_ESC_TMR);
> @@ -1241,17 +1257,20 @@ uint32_t avnd_err_restart_esc_level_2(AV
>               if (NCSCC_RC_SUCCESS != rc)
>                       goto done;
> 
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       if (cb->su_failover_cnt < cb->su_failover_max) {
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       /* reached max count */
>       if (cb->su_failover_cnt >= cb->su_failover_max) {
> +               LOG_NO("SU failover performed too many times (SU failover 
> count:
> %d)",
> +                    cb->su_failover_cnt);
> +
>               /* stop timer */
>               m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
>               cb->su_failover_cnt = 0;
> @@ -1310,18 +1329,21 @@ AVSV_ERR_RCVR avnd_err_esc_su_failover(A
>               if (NCSCC_RC_SUCCESS != rc)
>                       goto done;
> 
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       if (cb->su_failover_cnt < cb->su_failover_max) {
>               /* This means NODE_ERR_ESC may be already running because
> of other SUs escalations.*/
>               su->su_err_esc_level = AVND_ERR_ESC_LEVEL_2;
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       if (cb->su_failover_cnt >= cb->su_failover_max) {
> +             LOG_NO("SU failover performed too many times (SU failover
> count: %d)",
> +                     cb->su_failover_cnt);
> +
>               /* stop timer */
>               m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
>               cb->su_failover_cnt = 0;
> @@ -1354,6 +1376,8 @@ uint32_t avnd_evt_tmr_node_err_esc_evh(A
>       AVND_SU *su;
> 
>       TRACE_ENTER();
> +
> +     LOG_NO("node error escalation timer expired");
> 
>       su = (AVND_SU *)ncs_patricia_tree_getnext(&cb->sudb, (uint8_t *)0);
>       while (su != 0) {
> @@ -1433,3 +1457,42 @@ uint32_t avnd_err_rcvr_node_failfast(AVN
>       TRACE_LEAVE2("%u", rc);
>       return rc;
>  }
> +
> +void log_recovery_escalation(const AVND_COMP& comp,
> +     const uint32_t previous_esc_rcvr,
> +     const uint32_t esc_rcvr)
> +{
> +     std::string subject;
> +
> +     switch (previous_esc_rcvr) {
> +     case SA_AMF_NO_RECOMMENDATION:
> +     case SA_AMF_COMPONENT_RESTART:
> +     case SA_AMF_COMPONENT_FAILOVER:
> +             subject = std::string((char*)comp.name.value,
> +                     comp.name.length);
> +             break;
> +     default:
> +             subject = std::string((char*)comp.su->name.value,
> +                     comp.su->name.length);
> +             break;
> +     }
> +
> +     if (esc_rcvr != previous_esc_rcvr) {
> +             LOG_NO("'%s' recovery action escalated from '%s' to '%s'",
> +                     subject.c_str(),
> +
>       translate_recovery_method(previous_esc_rcvr).c_str(),
> +                     translate_recovery_method(esc_rcvr).c_str());
> +     }
> +}
> +
> +std::string translate_recovery_method(const uint32_t esc_rcvr)
> +{
> +     const uint32_t no_of_methods =
> sizeof(g_comp_rcvr)/sizeof(g_comp_rcvr[0]);
> +
> +     // valid range is 1 to 'no_of_methods'
> +     if (esc_rcvr >= 1 && esc_rcvr <= no_of_methods) {
> +             return g_comp_rcvr[esc_rcvr - 1];
> +     } else {
> +             return "unknown recovery method";
> +     }
> +}
> diff --git a/osaf/services/saf/amf/amfnd/include/avnd_cb.h
> b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
> --- a/osaf/services/saf/amf/amfnd/include/avnd_cb.h
> +++ b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
> @@ -153,6 +153,8 @@ typedef struct avnd_cb_tag {
> 
>  #define m_AVND_NODE_ERR_ESC_LEVEL_SET(x, val)  ((x)-
> >node_err_esc_level = (val))
> 
> +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su);
> +
>  extern AVND_CB *avnd_cb;
> 
>  #endif
> diff --git a/osaf/services/saf/amf/amfnd/include/avnd_su.h
> b/osaf/services/saf/amf/amfnd/include/avnd_su.h
> --- a/osaf/services/saf/amf/amfnd/include/avnd_su.h
> +++ b/osaf/services/saf/amf/amfnd/include/avnd_su.h
> @@ -408,4 +408,7 @@ extern bool sufailover_during_nodeswitch
>  extern bool all_csis_in_removed_state(const AVND_SU *su);
>  extern void su_reset_restart_count_in_comps(const AVND_SU *su);
> 
> +void su_increment_su_restart_count(AVND_SU& su);
> +void su_increment_comp_restart_count(AVND_SU& su);
> +
>  #endif
> diff --git a/osaf/services/saf/amf/amfnd/su.cc
> b/osaf/services/saf/amf/amfnd/su.cc
> --- a/osaf/services/saf/amf/amfnd/su.cc
> +++ b/osaf/services/saf/amf/amfnd/su.cc
> @@ -462,6 +462,8 @@ uint32_t avnd_evt_tmr_su_err_esc_evh(AVN
>       }
> 
>       TRACE("'%s'", su->name.value);
> +
> +     LOG_NO("'%s' error escalation timer expired", su->name.value);
> 
>       if (NCSCC_RC_SUCCESS ==
> m_AVND_CHECK_FOR_STDBY_FOR_EXT_COMP(cb, su->su_is_external))
>               goto done;
> @@ -689,3 +691,24 @@ void su_reset_restart_count_in_comps(con
>       }
> 
>  }
> +
> +void su_increment_su_restart_count(AVND_SU& su)
> +{
> +     su.su_restart_cnt++;
> +     LOG_NO("Restarting '%s' (SU restart count: %d)",
> +             su.name.value, su.su_restart_cnt);
> +}
> +
> +void su_increment_comp_restart_count(AVND_SU& su)
> +{
> +     su.comp_restart_cnt++;
> +     LOG_NO("Restarting a component of '%s' (comp restart count: %d)",
> +             su.name.value, su.comp_restart_cnt);
> +}
> +
> +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su)
> +{
> +     cb.su_failover_cnt++;
> +     LOG_NO("Performing failover of '%s' (SU failover count: %d)",
> +             su.name.value, cb.su_failover_cnt);
> +}

------------------------------------------------------------------------------
Time is money. Stop wasting it! Get your web API in 5 minutes.
www.restlet.com/download
http://p.sf.net/sfu/restlet
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to