Hi,
I think this is a good start. Some minor comments inline.
Thanks,
Hans

> -----Original Message-----
> From: Gary Lee [mailto:gary....@dektech.com.au]
> Sent: den 26 maj 2014 05:46
> To: Hans Feldt; Hans Nordebäck; praveen.malv...@oracle.com; 
> nagendr...@oracle.com
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: [PATCH 1 of 2] amfnd: Improve logging for error escalation decisions 
> [#870]
> 
>  osaf/services/saf/amf/amfnd/err.cc            |  61 
> +++++++++++++++++++++++---
>  osaf/services/saf/amf/amfnd/include/avnd_cb.h |   2 +
>  osaf/services/saf/amf/amfnd/include/avnd_su.h |   3 +
>  osaf/services/saf/amf/amfnd/su.cc             |  23 ++++++++++
>  4 files changed, 82 insertions(+), 7 deletions(-)
> 
> 
> * log when SU restart/failover limits have been reached
> * log each time a SU restart/failover occurs
> * add a log entry when error escalation is changed
> * log each time a component is restarted, or when restart
>   limits have been reached
> 
> diff --git a/osaf/services/saf/amf/amfnd/err.cc 
> b/osaf/services/saf/amf/amfnd/err.cc
> --- a/osaf/services/saf/amf/amfnd/err.cc
> +++ b/osaf/services/saf/amf/amfnd/err.cc
> @@ -56,6 +56,7 @@
>  */
> 
>  #include "avnd.h"
> +#include <string>
> 
>  /* static function declarations */
> 
> @@ -77,6 +78,10 @@ static uint32_t avnd_err_restart_esc_lev
>  static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, 
> AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
>  static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, 
> AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
> 
> +std::string avnd_translate_recovery_method(const uint32_t esc_rcvr);
> +void avnd_log_recovery_escalation(const AVND_SU& su,
[Hans] drop the avnd_ prefix thx, not needed and no longer a valid acronym

> +     const uint32_t previous_esc_rcvr, const uint32_t esc_rcvr);
> +
>  /* LSB Changes. Strings to represent source of component Error */
> 
>  static const char *g_comp_err[] = {
> @@ -301,6 +306,8 @@ uint32_t avnd_err_process(AVND_CB *cb, A
>       uint32_t esc_rcvr = err_info->rec_rcvr.raw;
>       uint32_t rc = NCSCC_RC_SUCCESS;
>       TRACE_ENTER2("Comp:'%s' esc_rcvr:'%u'", comp->name.value, esc_rcvr);
> +
> +     const uint32_t previous_esc_rcvr = esc_rcvr;
> 
>       // Handle errors differently when shutdown has started
>       if (AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED == cb->term_state) {
> @@ -363,6 +370,9 @@ uint32_t avnd_err_process(AVND_CB *cb, A
>       rc = avnd_err_escalate(cb, comp->su, comp, &esc_rcvr);
>       if (NCSCC_RC_SUCCESS != rc)
>               goto done;
> +
> +     // add an entry to syslog if recovery method has changed
> +     avnd_log_recovery_escalation(*(comp->su), previous_esc_rcvr, esc_rcvr);
> 
>       LOG_NO("'%s' faulted due to '%s' : Recovery is '%s'",
>               comp->name.value, g_comp_err[err_info->src], 
> g_comp_rcvr[esc_rcvr - 1]);
> @@ -1073,17 +1083,20 @@ uint32_t avnd_err_restart_esc_level_0(AV
>                       goto done;
> 
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
> AVND_CKPT_SU_ERR_ESC_TMR);
> -             su->comp_restart_cnt++;
> +             su_increment_comp_restart_count(*su);
>               goto done;
>       }
> 
>       if (su->comp_restart_cnt < su->comp_restart_max) {
> -             su->comp_restart_cnt++;
> +             su_increment_comp_restart_count(*su);
>               goto done;
>       }
> 
>       /* ok! go to next level */
>       if (su->comp_restart_cnt >= su->comp_restart_max) {
> +             LOG_NO("components of '%s' restarted too many times (%d)",
> +                     su->name.value, su->comp_restart_cnt);
> +
>               /*stop the comp-err-esc-timer */
>               m_AVND_TMR_COMP_ERR_ESC_STOP(cb, su);
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
> AVND_CKPT_SU_ERR_ESC_TMR);
> @@ -1166,12 +1179,15 @@ uint32_t avnd_err_restart_esc_level_1(AV
> 
>                       m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
> AVND_CKPT_SU_ERR_ESC_TMR);
>               }
> -             su->su_restart_cnt++;
> +             su_increment_su_restart_count(*su);
>               goto done;
>       }
> 
>       /* reached max count */
>       if (su->su_restart_cnt >= su->su_restart_max) {
> +             LOG_NO("'%s' restarted too many times (%d)",
> +                     su->name.value, su->su_restart_cnt);
> +
>               /* stop timer */
>               m_AVND_TMR_SU_ERR_ESC_STOP(cb, su);
>               m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
> AVND_CKPT_SU_ERR_ESC_TMR);
> @@ -1241,17 +1257,20 @@ uint32_t avnd_err_restart_esc_level_2(AV
>               if (NCSCC_RC_SUCCESS != rc)
>                       goto done;
> 
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       if (cb->su_failover_cnt < cb->su_failover_max) {
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       /* reached max count */
>       if (cb->su_failover_cnt >= cb->su_failover_max) {
> +               LOG_NO("SU failover performed too many times (%d)",
> +                    cb->su_failover_cnt);
> +
>               /* stop timer */
>               m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
>               cb->su_failover_cnt = 0;
> @@ -1310,18 +1329,21 @@ AVSV_ERR_RCVR avnd_err_esc_su_failover(A
>               if (NCSCC_RC_SUCCESS != rc)
>                       goto done;
> 
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       if (cb->su_failover_cnt < cb->su_failover_max) {
>               /* This means NODE_ERR_ESC may be already running because of 
> other SUs escalations.*/
>               su->su_err_esc_level = AVND_ERR_ESC_LEVEL_2;
> -             cb->su_failover_cnt++;
> +             cb_increment_su_failover_count(*cb, *su);
>               goto done;
>       }
> 
>       if (cb->su_failover_cnt >= cb->su_failover_max) {
> +             LOG_NO("SU failover performed too many times (%d)",
> +                     cb->su_failover_cnt);
> +
>               /* stop timer */
>               m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
>               cb->su_failover_cnt = 0;
> @@ -1354,6 +1376,8 @@ uint32_t avnd_evt_tmr_node_err_esc_evh(A
>       AVND_SU *su;
> 
>       TRACE_ENTER();
> +
> +     LOG_NO("node error escalation timer expired");
> 
>       su = (AVND_SU *)ncs_patricia_tree_getnext(&cb->sudb, (uint8_t *)0);
>       while (su != 0) {
> @@ -1433,3 +1457,26 @@ uint32_t avnd_err_rcvr_node_failfast(AVN
>       TRACE_LEAVE2("%u", rc);
>       return rc;
>  }
> +
> +void avnd_log_recovery_escalation(const AVND_SU& su,
> +     const uint32_t previous_esc_rcvr,
> +     const uint32_t esc_rcvr)
> +{
> +     if (esc_rcvr != previous_esc_rcvr) {
> +             LOG_NO("'%s' recovery action escalated from '%s' to '%s'",
> +                     su.name.value,
[Hans] can we really log SU name here? Is it valid in all cases?

> +                     
> avnd_translate_recovery_method(previous_esc_rcvr).c_str(),
> +                     avnd_translate_recovery_method(esc_rcvr).c_str());
> +     }
> +}
> +
> +std::string avnd_translate_recovery_method(const uint32_t esc_rcvr)
[Hans] static and drop avnd_ prefix

> +{
> +     const uint32_t no_of_methods = 
> sizeof(g_comp_rcvr)/sizeof(g_comp_rcvr[0]);
> +
> +     if (esc_rcvr <= no_of_methods) {
> +             return g_comp_rcvr[esc_rcvr - 1];
> +     } else {
> +             return "unknown recovery method";
> +     }
> +}
> diff --git a/osaf/services/saf/amf/amfnd/include/avnd_cb.h 
> b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
> --- a/osaf/services/saf/amf/amfnd/include/avnd_cb.h
> +++ b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
> @@ -153,6 +153,8 @@ typedef struct avnd_cb_tag {
> 
>  #define m_AVND_NODE_ERR_ESC_LEVEL_SET(x, val)  ((x)->node_err_esc_level = 
> (val))
> 
> +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su);
> +
>  extern AVND_CB *avnd_cb;
> 
>  #endif
> diff --git a/osaf/services/saf/amf/amfnd/include/avnd_su.h 
> b/osaf/services/saf/amf/amfnd/include/avnd_su.h
> --- a/osaf/services/saf/amf/amfnd/include/avnd_su.h
> +++ b/osaf/services/saf/amf/amfnd/include/avnd_su.h
> @@ -408,4 +408,7 @@ extern bool sufailover_during_nodeswitch
>  extern bool all_csis_in_removed_state(const AVND_SU *su);
>  extern void su_reset_restart_count_in_comps(const AVND_SU *su);
> 
> +void su_increment_su_restart_count(AVND_SU& su);
> +void su_increment_comp_restart_count(AVND_SU& su);
> +
>  #endif
> diff --git a/osaf/services/saf/amf/amfnd/su.cc 
> b/osaf/services/saf/amf/amfnd/su.cc
> --- a/osaf/services/saf/amf/amfnd/su.cc
> +++ b/osaf/services/saf/amf/amfnd/su.cc
> @@ -462,6 +462,8 @@ uint32_t avnd_evt_tmr_su_err_esc_evh(AVN
>       }
> 
>       TRACE("'%s'", su->name.value);
> +
> +     LOG_NO("'%s' error escalation timer expired", su->name.value)
> 
>       if (NCSCC_RC_SUCCESS == m_AVND_CHECK_FOR_STDBY_FOR_EXT_COMP(cb, 
> su->su_is_external))
>               goto done;
> @@ -689,3 +691,24 @@ void su_reset_restart_count_in_comps(con
>       }
> 
>  }
> +
> +void su_increment_su_restart_count(AVND_SU& su)
> +{
> +     su.su_restart_cnt++;
> +     LOG_NO("Restarting '%s' (%d)",
[Hans] or "Restarting SU '%s' for the %d time"
> +             su.name.value, su.su_restart_cnt);
> +}
> +
> +void su_increment_comp_restart_count(AVND_SU& su)
> +{
> +     su.comp_restart_cnt++;
> +     LOG_NO("Restarting a component of '%s' (%d)",
[Hans] or "Restarting Comp '%s' for the %d time"
> +             su.name.value, su.comp_restart_cnt);
> +}
> +
> +void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su)
> +{
> +     cb.su_failover_cnt++;
> +     LOG_NO("Performing failover of '%s' (%d)",
[Hans] please add a qualifier to %d; e.g. "failover cnt:%d"
> +             su.name.value, cb.su_failover_cnt);
> +}

------------------------------------------------------------------------------
The best possible search technologies are now affordable for all companies.
Download your FREE open source Enterprise Search Engine today!
Our experts will assist you in its installation for $59/mo, no commitment.
Test it for FREE on our Cloud platform anytime!
http://pubads.g.doubleclick.net/gampad/clk?id=145328191&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to