osaf/services/saf/amf/amfnd/err.cc            |  48 +++++++++++++++++++++++---
 osaf/services/saf/amf/amfnd/include/avnd_cb.h |   2 +
 osaf/services/saf/amf/amfnd/include/avnd_su.h |   3 +
 osaf/services/saf/amf/amfnd/su.cc             |  23 ++++++++++++
 4 files changed, 69 insertions(+), 7 deletions(-)


* log when SU restart/failover limits have been reached
* log each time a SU restart/failover occurs
* add a log entry when error escalation is changed
* log each time a component is restarted, or when restart
  limits have been reached

diff --git a/osaf/services/saf/amf/amfnd/err.cc 
b/osaf/services/saf/amf/amfnd/err.cc
--- a/osaf/services/saf/amf/amfnd/err.cc
+++ b/osaf/services/saf/amf/amfnd/err.cc
@@ -77,6 +77,9 @@ static uint32_t avnd_err_restart_esc_lev
 static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, 
AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
 static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, 
AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
 
+static void log_recovery_escalation(const AVND_COMP& comp,
+       const uint32_t previous_esc_rcvr, const uint32_t esc_rcvr);
+
 /* LSB Changes. Strings to represent source of component Error */
 
 static const char *g_comp_err[] = {
@@ -301,6 +304,8 @@ uint32_t avnd_err_process(AVND_CB *cb, A
        uint32_t esc_rcvr = err_info->rec_rcvr.raw;
        uint32_t rc = NCSCC_RC_SUCCESS;
        TRACE_ENTER2("Comp:'%s' esc_rcvr:'%u'", comp->name.value, esc_rcvr);
+       
+       const uint32_t previous_esc_rcvr = esc_rcvr;
 
        // Handle errors differently when shutdown has started
        if (AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED == cb->term_state) {
@@ -363,6 +368,9 @@ uint32_t avnd_err_process(AVND_CB *cb, A
        rc = avnd_err_escalate(cb, comp->su, comp, &esc_rcvr);
        if (NCSCC_RC_SUCCESS != rc)
                goto done;
+       
+       // add an entry to syslog if recovery method has changed
+       log_recovery_escalation(*comp, previous_esc_rcvr, esc_rcvr);
 
        LOG_NO("'%s' faulted due to '%s' : Recovery is '%s'",
                comp->name.value, g_comp_err[err_info->src], 
g_comp_rcvr[esc_rcvr - 1]);
@@ -1073,17 +1081,20 @@ uint32_t avnd_err_restart_esc_level_0(AV
                        goto done;
 
                m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
-               su->comp_restart_cnt++;
+               su_increment_comp_restart_count(*su);
                goto done;
        }
 
        if (su->comp_restart_cnt < su->comp_restart_max) {
-               su->comp_restart_cnt++;
+               su_increment_comp_restart_count(*su);
                goto done;
        }
 
        /* ok! go to next level */
        if (su->comp_restart_cnt >= su->comp_restart_max) {
+               LOG_NO("'%s' component restarts have reached configured limit 
of %u",
+                       su->name.value, su->comp_restart_max);
+
                /*stop the comp-err-esc-timer */
                m_AVND_TMR_COMP_ERR_ESC_STOP(cb, su);
                m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
@@ -1166,12 +1177,15 @@ uint32_t avnd_err_restart_esc_level_1(AV
 
                        m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
                }
-               su->su_restart_cnt++;
+               su_increment_su_restart_count(*su);
                goto done;
        }
 
        /* reached max count */
        if (su->su_restart_cnt >= su->su_restart_max) {
+               LOG_NO("'%s' restarts have reached configured limit of %u",
+                       su->name.value, su->su_restart_max);
+
                /* stop timer */
                m_AVND_TMR_SU_ERR_ESC_STOP(cb, su);
                m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
@@ -1241,17 +1255,20 @@ uint32_t avnd_err_restart_esc_level_2(AV
                if (NCSCC_RC_SUCCESS != rc)
                        goto done;
 
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        if (cb->su_failover_cnt < cb->su_failover_max) {
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        /* reached max count */
        if (cb->su_failover_cnt >= cb->su_failover_max) {
+               LOG_NO("SU failovers have reached configured limit of %u",
+                       cb->su_failover_max);
+
                /* stop timer */
                m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
                cb->su_failover_cnt = 0;
@@ -1310,18 +1327,21 @@ AVSV_ERR_RCVR avnd_err_esc_su_failover(A
                if (NCSCC_RC_SUCCESS != rc)
                        goto done;
 
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        if (cb->su_failover_cnt < cb->su_failover_max) {
                /* This means NODE_ERR_ESC may be already running because of 
other SUs escalations.*/
                su->su_err_esc_level = AVND_ERR_ESC_LEVEL_2;
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        if (cb->su_failover_cnt >= cb->su_failover_max) {
+               LOG_NO("SU failovers have reached configured limit of %u",
+                       cb->su_failover_max);
+
                /* stop timer */
                m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
                cb->su_failover_cnt = 0;
@@ -1354,6 +1374,8 @@ uint32_t avnd_evt_tmr_node_err_esc_evh(A
        AVND_SU *su;
 
        TRACE_ENTER();
+       
+       LOG_NO("node error escalation timer expired");
 
        su = (AVND_SU *)ncs_patricia_tree_getnext(&cb->sudb, (uint8_t *)0);
        while (su != 0) {
@@ -1433,3 +1455,15 @@ uint32_t avnd_err_rcvr_node_failfast(AVN
        TRACE_LEAVE2("%u", rc);
        return rc;
 }
+
+void log_recovery_escalation(const AVND_COMP& comp,
+       const uint32_t previous_esc_rcvr,
+       const uint32_t esc_rcvr)
+{
+       if (esc_rcvr != previous_esc_rcvr) {
+               LOG_NO("'%s' recovery action escalated from '%s' to '%s'",
+                       comp.name.value,
+                       g_comp_rcvr[previous_esc_rcvr - 1],
+                       g_comp_rcvr[esc_rcvr - 1]);
+       }
+}
diff --git a/osaf/services/saf/amf/amfnd/include/avnd_cb.h 
b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
--- a/osaf/services/saf/amf/amfnd/include/avnd_cb.h
+++ b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
@@ -153,6 +153,8 @@ typedef struct avnd_cb_tag {
 
 #define m_AVND_NODE_ERR_ESC_LEVEL_SET(x, val)  ((x)->node_err_esc_level = 
(val))
 
+void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su);
+
 extern AVND_CB *avnd_cb;
 
 #endif
diff --git a/osaf/services/saf/amf/amfnd/include/avnd_su.h 
b/osaf/services/saf/amf/amfnd/include/avnd_su.h
--- a/osaf/services/saf/amf/amfnd/include/avnd_su.h
+++ b/osaf/services/saf/amf/amfnd/include/avnd_su.h
@@ -408,4 +408,7 @@ extern bool sufailover_during_nodeswitch
 extern bool all_csis_in_removed_state(const AVND_SU *su);
 extern void su_reset_restart_count_in_comps(const AVND_SU *su);
 
+void su_increment_su_restart_count(AVND_SU& su);
+void su_increment_comp_restart_count(AVND_SU& su);
+
 #endif
diff --git a/osaf/services/saf/amf/amfnd/su.cc 
b/osaf/services/saf/amf/amfnd/su.cc
--- a/osaf/services/saf/amf/amfnd/su.cc
+++ b/osaf/services/saf/amf/amfnd/su.cc
@@ -462,6 +462,8 @@ uint32_t avnd_evt_tmr_su_err_esc_evh(AVN
        }
 
        TRACE("'%s'", su->name.value);
+       
+       LOG_NO("'%s' error escalation timer expired", su->name.value);
 
        if (NCSCC_RC_SUCCESS == m_AVND_CHECK_FOR_STDBY_FOR_EXT_COMP(cb, 
su->su_is_external))
                goto done;
@@ -689,3 +691,24 @@ void su_reset_restart_count_in_comps(con
        }
 
 }
+
+void su_increment_su_restart_count(AVND_SU& su)
+{
+       su.su_restart_cnt++;
+       LOG_NO("Restarting '%s' (SU restart count: %d)",
+               su.name.value, su.su_restart_cnt);      
+}
+
+void su_increment_comp_restart_count(AVND_SU& su)
+{
+       su.comp_restart_cnt++;
+       LOG_NO("Restarting a component of '%s' (comp restart count: %d)",
+               su.name.value, su.comp_restart_cnt);    
+}
+
+void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su)
+{
+       cb.su_failover_cnt++;
+       LOG_NO("Performing failover of '%s' (SU failover count: %d)",
+               su.name.value, cb.su_failover_cnt);     
+}

------------------------------------------------------------------------------
Learn Graph Databases - Download FREE O'Reilly Book
"Graph Databases" is the definitive new guide to graph databases and their 
applications. Written by three acclaimed leaders in the field, 
this first edition is now available. Download your free book today!
http://p.sf.net/sfu/NeoTech
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to