osaf/services/saf/amf/amfnd/err.cc            |  61 +++++++++++++++++++++++---
 osaf/services/saf/amf/amfnd/include/avnd_cb.h |   2 +
 osaf/services/saf/amf/amfnd/include/avnd_su.h |   3 +
 osaf/services/saf/amf/amfnd/su.cc             |  23 ++++++++++
 4 files changed, 82 insertions(+), 7 deletions(-)


* log when SU restart/failover limits have been reached
* log each time a SU restart/failover occurs
* add a log entry when error escalation is changed
* log each time a component is restarted, or when restart
  limits have been reached

diff --git a/osaf/services/saf/amf/amfnd/err.cc 
b/osaf/services/saf/amf/amfnd/err.cc
--- a/osaf/services/saf/amf/amfnd/err.cc
+++ b/osaf/services/saf/amf/amfnd/err.cc
@@ -56,6 +56,7 @@
 */
 
 #include "avnd.h"
+#include <string>
 
 /* static function declarations */
 
@@ -77,6 +78,10 @@ static uint32_t avnd_err_restart_esc_lev
 static uint32_t avnd_err_restart_esc_level_1(AVND_CB *, AVND_SU *, 
AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
 static uint32_t avnd_err_restart_esc_level_2(AVND_CB *, AVND_SU *, 
AVND_ERR_ESC_LEVEL *, AVSV_ERR_RCVR *);
 
+std::string avnd_translate_recovery_method(const uint32_t esc_rcvr);
+void avnd_log_recovery_escalation(const AVND_SU& su,
+       const uint32_t previous_esc_rcvr, const uint32_t esc_rcvr);
+
 /* LSB Changes. Strings to represent source of component Error */
 
 static const char *g_comp_err[] = {
@@ -301,6 +306,8 @@ uint32_t avnd_err_process(AVND_CB *cb, A
        uint32_t esc_rcvr = err_info->rec_rcvr.raw;
        uint32_t rc = NCSCC_RC_SUCCESS;
        TRACE_ENTER2("Comp:'%s' esc_rcvr:'%u'", comp->name.value, esc_rcvr);
+       
+       const uint32_t previous_esc_rcvr = esc_rcvr;
 
        // Handle errors differently when shutdown has started
        if (AVND_TERM_STATE_OPENSAF_SHUTDOWN_STARTED == cb->term_state) {
@@ -363,6 +370,9 @@ uint32_t avnd_err_process(AVND_CB *cb, A
        rc = avnd_err_escalate(cb, comp->su, comp, &esc_rcvr);
        if (NCSCC_RC_SUCCESS != rc)
                goto done;
+       
+       // add an entry to syslog if recovery method has changed
+       avnd_log_recovery_escalation(*(comp->su), previous_esc_rcvr, esc_rcvr);
 
        LOG_NO("'%s' faulted due to '%s' : Recovery is '%s'",
                comp->name.value, g_comp_err[err_info->src], 
g_comp_rcvr[esc_rcvr - 1]);
@@ -1073,17 +1083,20 @@ uint32_t avnd_err_restart_esc_level_0(AV
                        goto done;
 
                m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
-               su->comp_restart_cnt++;
+               su_increment_comp_restart_count(*su);
                goto done;
        }
 
        if (su->comp_restart_cnt < su->comp_restart_max) {
-               su->comp_restart_cnt++;
+               su_increment_comp_restart_count(*su);
                goto done;
        }
 
        /* ok! go to next level */
        if (su->comp_restart_cnt >= su->comp_restart_max) {
+               LOG_NO("components of '%s' restarted too many times (%d)",
+                       su->name.value, su->comp_restart_cnt);
+
                /*stop the comp-err-esc-timer */
                m_AVND_TMR_COMP_ERR_ESC_STOP(cb, su);
                m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
@@ -1166,12 +1179,15 @@ uint32_t avnd_err_restart_esc_level_1(AV
 
                        m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
                }
-               su->su_restart_cnt++;
+               su_increment_su_restart_count(*su);
                goto done;
        }
 
        /* reached max count */
        if (su->su_restart_cnt >= su->su_restart_max) {
+               LOG_NO("'%s' restarted too many times (%d)",
+                       su->name.value, su->su_restart_cnt);
+
                /* stop timer */
                m_AVND_TMR_SU_ERR_ESC_STOP(cb, su);
                m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, su, 
AVND_CKPT_SU_ERR_ESC_TMR);
@@ -1241,17 +1257,20 @@ uint32_t avnd_err_restart_esc_level_2(AV
                if (NCSCC_RC_SUCCESS != rc)
                        goto done;
 
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        if (cb->su_failover_cnt < cb->su_failover_max) {
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        /* reached max count */
        if (cb->su_failover_cnt >= cb->su_failover_max) {
+               LOG_NO("SU failover performed too many times (%d)",
+                      cb->su_failover_cnt);
+
                /* stop timer */
                m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
                cb->su_failover_cnt = 0;
@@ -1310,18 +1329,21 @@ AVSV_ERR_RCVR avnd_err_esc_su_failover(A
                if (NCSCC_RC_SUCCESS != rc)
                        goto done;
 
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        if (cb->su_failover_cnt < cb->su_failover_max) {
                /* This means NODE_ERR_ESC may be already running because of 
other SUs escalations.*/
                su->su_err_esc_level = AVND_ERR_ESC_LEVEL_2;
-               cb->su_failover_cnt++;
+               cb_increment_su_failover_count(*cb, *su);
                goto done;
        }
 
        if (cb->su_failover_cnt >= cb->su_failover_max) {
+               LOG_NO("SU failover performed too many times (%d)",
+                       cb->su_failover_cnt);
+
                /* stop timer */
                m_AVND_TMR_NODE_ERR_ESC_STOP(cb);
                cb->su_failover_cnt = 0;
@@ -1354,6 +1376,8 @@ uint32_t avnd_evt_tmr_node_err_esc_evh(A
        AVND_SU *su;
 
        TRACE_ENTER();
+       
+       LOG_NO("node error escalation timer expired");
 
        su = (AVND_SU *)ncs_patricia_tree_getnext(&cb->sudb, (uint8_t *)0);
        while (su != 0) {
@@ -1433,3 +1457,26 @@ uint32_t avnd_err_rcvr_node_failfast(AVN
        TRACE_LEAVE2("%u", rc);
        return rc;
 }
+
+void avnd_log_recovery_escalation(const AVND_SU& su,
+       const uint32_t previous_esc_rcvr,
+       const uint32_t esc_rcvr)
+{
+       if (esc_rcvr != previous_esc_rcvr) {
+               LOG_NO("'%s' recovery action escalated from '%s' to '%s'",
+                       su.name.value,
+                       
avnd_translate_recovery_method(previous_esc_rcvr).c_str(),
+                       avnd_translate_recovery_method(esc_rcvr).c_str());
+       }
+}
+
+std::string avnd_translate_recovery_method(const uint32_t esc_rcvr)
+{
+       const uint32_t no_of_methods = 
sizeof(g_comp_rcvr)/sizeof(g_comp_rcvr[0]);
+       
+       if (esc_rcvr <= no_of_methods) {
+               return g_comp_rcvr[esc_rcvr - 1];
+       } else {
+               return "unknown recovery method";
+       }
+}
diff --git a/osaf/services/saf/amf/amfnd/include/avnd_cb.h 
b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
--- a/osaf/services/saf/amf/amfnd/include/avnd_cb.h
+++ b/osaf/services/saf/amf/amfnd/include/avnd_cb.h
@@ -153,6 +153,8 @@ typedef struct avnd_cb_tag {
 
 #define m_AVND_NODE_ERR_ESC_LEVEL_SET(x, val)  ((x)->node_err_esc_level = 
(val))
 
+void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su);
+
 extern AVND_CB *avnd_cb;
 
 #endif
diff --git a/osaf/services/saf/amf/amfnd/include/avnd_su.h 
b/osaf/services/saf/amf/amfnd/include/avnd_su.h
--- a/osaf/services/saf/amf/amfnd/include/avnd_su.h
+++ b/osaf/services/saf/amf/amfnd/include/avnd_su.h
@@ -408,4 +408,7 @@ extern bool sufailover_during_nodeswitch
 extern bool all_csis_in_removed_state(const AVND_SU *su);
 extern void su_reset_restart_count_in_comps(const AVND_SU *su);
 
+void su_increment_su_restart_count(AVND_SU& su);
+void su_increment_comp_restart_count(AVND_SU& su);
+
 #endif
diff --git a/osaf/services/saf/amf/amfnd/su.cc 
b/osaf/services/saf/amf/amfnd/su.cc
--- a/osaf/services/saf/amf/amfnd/su.cc
+++ b/osaf/services/saf/amf/amfnd/su.cc
@@ -462,6 +462,8 @@ uint32_t avnd_evt_tmr_su_err_esc_evh(AVN
        }
 
        TRACE("'%s'", su->name.value);
+       
+       LOG_NO("'%s' error escalation timer expired", su->name.value)
 
        if (NCSCC_RC_SUCCESS == m_AVND_CHECK_FOR_STDBY_FOR_EXT_COMP(cb, 
su->su_is_external))
                goto done;
@@ -689,3 +691,24 @@ void su_reset_restart_count_in_comps(con
        }
 
 }
+
+void su_increment_su_restart_count(AVND_SU& su)
+{
+       su.su_restart_cnt++;
+       LOG_NO("Restarting '%s' (%d)",
+               su.name.value, su.su_restart_cnt);      
+}
+
+void su_increment_comp_restart_count(AVND_SU& su)
+{
+       su.comp_restart_cnt++;
+       LOG_NO("Restarting a component of '%s' (%d)",
+               su.name.value, su.comp_restart_cnt);    
+}
+
+void cb_increment_su_failover_count(AVND_CB& cb, const AVND_SU& su)
+{
+       cb.su_failover_cnt++;
+       LOG_NO("Performing failover of '%s' (%d)",
+               su.name.value, cb.su_failover_cnt);     
+}

------------------------------------------------------------------------------
The best possible search technologies are now affordable for all companies.
Download your FREE open source Enterprise Search Engine today!
Our experts will assist you in its installation for $59/mo, no commitment.
Test it for FREE on our Cloud platform anytime!
http://pubads.g.doubleclick.net/gampad/clk?id=145328191&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to