osaf/services/saf/amf/amfnd/comp.cc             |  23 +++++++++++++++++++++--
 osaf/services/saf/amf/amfnd/err.cc              |  15 +++++++++++----
 osaf/services/saf/amf/amfnd/include/avnd_comp.h |   1 +
 osaf/services/saf/amf/amfnd/su.cc               |   1 +
 osaf/services/saf/amf/amfnd/util.cc             |   5 ++---
 5 files changed, 36 insertions(+), 9 deletions(-)


When a node or SU is locked, AMF assigns the QUIESCED HA state to ACTIVE
sa-aware components. This state is intended to let the component transfer its
state to a peer component assigned STANDBY. If the ACTIVE component fails in the
QUIESCED state, AMF will restart it and reassign it QUIESCED. This is an invalid
state transition according to picture 3 in the B.04 AMF spec, and it makes no
sense since the state is gone anyway (because the component failed).

The problem was likely reintroduced when fixing #3083; before that, special
handling for component errors in the QUIESCED state was in place. However, that
solution had other problems and was removed. So in the current code there is no
special handling for errors in the QUIESCED state, which results in
re-assignment of the QUIESCED HA state after component restart.

This patch changes the error escalation logic so that, when a component with a
QUIESCED/QUIESCING assignment fails, component failover is performed as the
recovery action instead of component restart.

diff --git a/osaf/services/saf/amf/amfnd/comp.cc 
b/osaf/services/saf/amf/amfnd/comp.cc
--- a/osaf/services/saf/amf/amfnd/comp.cc
+++ b/osaf/services/saf/amf/amfnd/comp.cc
@@ -32,9 +32,7 @@
 ******************************************************************************
 */
 
-#include <stdbool.h>
 #include "avnd.h"
-#include <stdbool.h>
 #include <immutil.h>
 
 /*** Static function declarations ***/
@@ -2766,3 +2764,24 @@ void avnd_comp_pres_state_set(AVND_COMP 
        m_AVND_SEND_CKPT_UPDT_ASYNC_UPDT(cb, comp, AVND_CKPT_COMP_PRES_STATE);
 }
 
+/**
+ * Returns true if any CSI in comp->csi_list has an SI whose curr_state is QUIESCED/QUIESCING
+ * @param comp component whose CSI list is examined
+ */
+bool comp_has_quiesced_assignment(const AVND_COMP *comp)
+{
+       const AVND_COMP_CSI_REC *csi;
+
+       for (csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(
+                       m_NCS_DBLIST_FIND_FIRST(&comp->csi_list));
+               csi != NULL;
+               csi = m_AVND_CSI_REC_FROM_COMP_DLL_NODE_GET(
+                       m_NCS_DBLIST_FIND_NEXT(&csi->comp_dll_node))) {
+
+               if ((csi->si->curr_state == SA_AMF_HA_QUIESCED) ||
+                               (csi->si->curr_state == SA_AMF_HA_QUIESCING))
+                       return true;
+       }
+
+       return false;
+}
diff --git a/osaf/services/saf/amf/amfnd/err.cc 
b/osaf/services/saf/amf/amfnd/err.cc
--- a/osaf/services/saf/amf/amfnd/err.cc
+++ b/osaf/services/saf/amf/amfnd/err.cc
@@ -411,10 +411,17 @@ uint32_t avnd_err_escalate(AVND_CB *cb, 
        if (*io_esc_rcvr == SA_AMF_NO_RECOMMENDATION)
                *io_esc_rcvr = comp->err_info.def_rec;
 
-       /* disallow comp-restart if it's disabled */
-       if ((SA_AMF_COMPONENT_RESTART == *io_esc_rcvr) && 
m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) {
-               LOG_NO("saAmfCompDisableRestart is true for 
'%s'",comp->name.value);
-               *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER;
+       if (*io_esc_rcvr == SA_AMF_COMPONENT_RESTART) {
+               if (m_AVND_COMP_IS_RESTART_DIS(comp) && (!su->is_ncs)) {
+                       LOG_NO("saAmfCompDisableRestart is true for 
'%s'",comp->name.value);
+                       LOG_NO("recovery action 'comp restart' escalated to 
'comp failover'");
+                       *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER;
+               } else if (comp_has_quiesced_assignment(comp) == true) {
+                       /* Cannot re-assign QUIESCED, escalate to failover */
+                       LOG_NO("component with QUIESCED/QUIESCING assignment 
failed");
+                       LOG_NO("recovery action 'comp restart' escalated to 
'comp failover'");
+                       *io_esc_rcvr = SA_AMF_COMPONENT_FAILOVER;
+               }
        }
 
        if ((SA_AMF_COMPONENT_FAILOVER== *io_esc_rcvr) && (su->sufailover) && 
(!su->is_ncs)) {
diff --git a/osaf/services/saf/amf/amfnd/include/avnd_comp.h 
b/osaf/services/saf/amf/amfnd/include/avnd_comp.h
--- a/osaf/services/saf/amf/amfnd/include/avnd_comp.h
+++ b/osaf/services/saf/amf/amfnd/include/avnd_comp.h
@@ -876,6 +876,7 @@ extern unsigned int avnd_comp_config_get
 extern int avnd_comp_config_reinit(AVND_COMP *comp);
 extern void avnd_comp_delete(AVND_COMP *comp);
 extern void avnd_comp_pres_state_set(AVND_COMP *comp, SaAmfPresenceStateT 
newstate);
+bool comp_has_quiesced_assignment(const AVND_COMP *comp);
 
 /**
  * Initiate restart of a component.
diff --git a/osaf/services/saf/amf/amfnd/su.cc 
b/osaf/services/saf/amf/amfnd/su.cc
--- a/osaf/services/saf/amf/amfnd/su.cc
+++ b/osaf/services/saf/amf/amfnd/su.cc
@@ -481,6 +481,7 @@ uint32_t avnd_evt_su_admin_op_req(AVND_C
                AVND_COMP *comp;
 
                /* SU has been repaired. Reset states and update AMF director 
accordingly. */
+               LOG_NO("Repair request for '%s'", su->name.value);
 
                for (comp = 
m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
                      comp;
diff --git a/osaf/services/saf/amf/amfnd/util.cc 
b/osaf/services/saf/amf/amfnd/util.cc
--- a/osaf/services/saf/amf/amfnd/util.cc
+++ b/osaf/services/saf/amf/amfnd/util.cc
@@ -236,9 +236,8 @@ void avnd_failed_state_file_create(void)
  */
 void avnd_failed_state_file_delete(void)
 {
-       if (unlink(failed_state_file_name) == -1)
-               LOG_ER("cannot unlink failed state file %s: %s",
-                               failed_state_file_name, strerror(errno));
+       // file might not exist in some cases, ignore errors
+       (void) unlink(failed_state_file_name);
 }
 
 /**

------------------------------------------------------------------------------
CenturyLink Cloud: The Leader in Enterprise Cloud Services.
Learn Why More Businesses Are Choosing CenturyLink Cloud For
Critical Workloads, Development Environments & Everything In Between.
Get a Quote or Start a Free Trial Today. 
http://pubads.g.doubleclick.net/gampad/clk?id=119420431&iu=/4140/ostg.clktrk
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to