The candidate patch below has been tested and works fine.
-----------------------------------------------------------------------------
diff --git a/osaf/services/saf/amf/amfd/sg_2n_fsm.cc 
b/osaf/services/saf/amf/amfd/sg_2n_fsm.cc
--- a/osaf/services/saf/amf/amfd/sg_2n_fsm.cc
+++ b/osaf/services/saf/amf/amfd/sg_2n_fsm.cc
@@ -2973,8 +2973,8 @@ void SG_2N::node_fail_su_oper(AVD_SU *su
                                           if available, or same SU will get 
standby assignment after repair.
                                         */
                                        
su->set_su_switch(AVSV_SI_TOGGLE_STABLE);
-                                       m_AVD_SET_SG_FSM(cb, (su->sg_of_su), 
AVD_SG_FSM_STABLE);
-                                       complete_siswap(a_susi->su, SA_AIS_OK);
+                                       avd_sg_su_oper_list_add(cb, a_susi->su, 
false);
+                                       m_AVD_SET_SG_FSM(cb, (su->sg_of_su), 
AVD_SG_FSM_SG_REALIGN);
                                } else {
                                        avd_sg_su_oper_list_add(cb, a_susi->su, 
false);
                                        m_AVD_SET_SG_FSM(cb, (su->sg_of_su), 
AVD_SG_FSM_SG_REALIGN);

But this contradicts the patch for #309, shown below.
----------------------------------------------------------------------------------
Message: 2
Date: Mon, 28 Jul 2014 04:24:13 +0530
From: [email protected]
Subject: [devel] [PATCH 1 of 1] amfd : fix node failover while
        assigning       standby state   during si-swap [#309]
To: [email protected], [email protected],
        [email protected]
Cc: [email protected]
Message-ID: <99360b761ba19f495934.1406501653@CON-PC>
Content-Type: text/plain; charset="us-ascii"

 osaf/services/saf/amf/amfd/sg_2n_fsm.cc |  22 +++++++++++++++++-----
 1 files changed, 17 insertions(+), 5 deletions(-)


Two si-swap operations are performed. The second si-swap fails if a fault
occurs during standby assignment and leads to node-failover escalation
during the first si-swap.

The second si-swap fails because the SG remains unstable. This particular case
is not handled in the SG FSM of the 2N model.

The patch fixes the problem by making the SG stable.

diff --git a/osaf/services/saf/amf/amfd/sg_2n_fsm.cc 
b/osaf/services/saf/amf/amfd/sg_2n_fsm.cc
--- a/osaf/services/saf/amf/amfd/sg_2n_fsm.cc
+++ b/osaf/services/saf/amf/amfd/sg_2n_fsm.cc
@@ -59,7 +59,7 @@ static void complete_siswap(AVD_SU *su, 
                /* si->invocation field is not check pointed. If controller 
failovers when si-swap
                   operation is in progress, si->invocation will be zero on the 
new active controller.
                   Log an error when si-swap operation completes.*/
-               LOG_ER("Operation done, but invocationId for the operation on 
SI not found '%s'", su->name.value);
+               TRACE("Operation done, but invocationId for the operation on SI 
not found '%s'", su->name.value);
        }
        TRACE_LEAVE();
 }
@@ -2929,16 +2929,28 @@ static void avd_sg_2n_node_fail_su_oper(
                        /* the admin state of the SU is shutdown change it to 
lock. */
                        if (su->saAmfSUAdminState == 
SA_AMF_ADMIN_SHUTTING_DOWN) {
                                su->set_admin_state(SA_AMF_ADMIN_LOCKED);
+                               avd_sg_su_oper_list_add(cb, a_susi->su, false);
+                               m_AVD_SET_SG_FSM(cb, (su->sg_of_su), 
AVD_SG_FSM_SG_REALIGN);
                        } else if (su_node_ptr->saAmfNodeAdminState == 
SA_AMF_ADMIN_SHUTTING_DOWN) {
                                m_AVD_IS_NODE_LOCK((su_node_ptr), flag);
                                if (flag == true) {
                                        node_admin_state_set(su_node_ptr, 
SA_AMF_ADMIN_LOCKED);
                                }
-                       } else {
-                               su->set_su_switch(AVSV_SI_TOGGLE_STABLE);
+                               avd_sg_su_oper_list_add(cb, a_susi->su, false);
+                               m_AVD_SET_SG_FSM(cb, (su->sg_of_su), 
AVD_SG_FSM_SG_REALIGN);
+                       } else  {
+
+                               /* During si-swap while standby assignment is 
going on, if Nodefailover 
+                                  or SU failover got escalated then toggle SU 
switch state and make SG 
+                                  stable. After SG becomes stable, spare SU 
will be instantiated, 
+                                  if available, or same SU will get standby 
assignment after repair.
+                                */
+                               if (su->su_switch == AVSV_SI_TOGGLE_SWITCH) {
+                                       
su->set_su_switch(AVSV_SI_TOGGLE_STABLE);
+                                       m_AVD_SET_SG_FSM(cb, (su->sg_of_su), 
AVD_SG_FSM_STABLE);
+                                       complete_siswap(a_susi->su, SA_AIS_OK);
+                               }
                        }
-                       avd_sg_su_oper_list_add(cb, a_susi->su, false);
-                       m_AVD_SET_SG_FSM(cb, (su->sg_of_su), 
AVD_SG_FSM_SG_REALIGN);
                } /* if (a_susi->su != su) */
                else {
                        if (s_susi != AVD_SU_SI_REL_NULL) {


---

**[tickets:#1312] AMF: NodeFailover during SiSwap leaves SG UnStable**

**Status:** assigned
**Milestone:** 4.4.2
**Created:** Fri Apr 10, 2015 10:57 AM UTC by Minh Hon Chau
**Last Updated:** Fri Apr 10, 2015 11:04 AM UTC
**Owner:** Minh Hon Chau

* Configuration:

Two 2N SUs (SU1, SU2) hosted on the SCs
One sponsor SI (AGENT) and several dependent SIs (MTZ, ACA, CQH, AFD, HDF, NSF,
SGS, CLH, DBO)
A single componentRestart is enough to escalate to nodeFailover

* Steps and analysis

All SIs are assigned ACTIVE to SU1, STANDBY to SU2

1) Swap SI safSi=AFD,safApp=TEST_APP
Apr 10 11:00:49 SC-1 osafamfd[491]: NO safSi=AFD,safApp=TEST_APP Swap initiated

2) Swapping a 2N SI leads to an SU switchover
Apr 10 11:00:49 SC-1 osafamfnd[500]: NO Assigning 'safSi=ACA,safApp=TEST_APP' 
QUIESCED to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'
Apr 10 11:00:49 SC-1 osafamfnd[500]: NO Assigned 'safSi=ACA,safApp=TEST_APP' 
QUIESCED to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'
...
Apr 10 11:00:49 SC-1 osafamfnd[500]: NO Assigning 'safSi=AGENT,safApp=TEST_APP' 
QUIESCED to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'
Apr 10 11:00:49 SC-1 osafamfnd[500]: NO Assigned 'safSi=AGENT,safApp=TEST_APP' 
QUIESCED to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'

3) Assign sponsor SI ACTIVE to SU2
Apr 10 11:00:49 SC-2 osafamfnd[488]: NO Assigning 'safSi=AGENT,safApp=TEST_APP' 
ACTIVE to 'safSu=SU2,safSg=TEST_SG_2N,safApp=TEST_APP'
(But AGENT in SC-2 has not responded to AMFND)

4) The CQH binary is corrupted after the QUIESCED response to AMF, which
escalates to nodeFailover
Apr 10 11:00:50 SC-1 osafamfnd[500]: NO 
'safComp=CQH,safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP' recovery action 
escalated from 'componentRestart' to 'nodeFailover'
Apr 10 11:00:50 SC-1 osafamfnd[500]: NO 
'safComp=CQH,safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP' faulted due to 
'avaDown' : Recovery is 'nodeFailover'

5) SC-1 reboots, and SC-2 becomes ACTIVE
Apr 10 11:00:50 SC-2 osafamfd[479]: NO FAILOVER StandBy --> Active

6) AMFD-SC2 starts node_failover procedure
Apr 10 11:00:50.731489 osafamfd [479:ndproc.cc:0923] >> avd_node_failover: 
'safAmfNode=SC-1,safAmfCluster=myAmfCluster'
...
Apr 10 11:00:50.737048 osafamfd [479:sg_nored_fsm.cc:0793] >> node_fail: 
safSu=SC-1,safSg=NoRed,safApp=OpenSAF, sg_fsm_state=0
Apr 10 11:00:50.745536 osafamfd [479:sg_2n_fsm.cc:3262] >> node_fail: 
'safSu=SC-1,safSg=2N,safApp=OpenSAF', 0
Apr 10 11:00:50.748579 osafamfd [479:sg_2n_fsm.cc:3262] >> node_fail: 
'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP', 2

7) While running node_fail_su_oper for TEST_SG_2N (due to the swap), the SG
state is set to STABLE
Apr 10 11:00:50.748584 osafamfd [479:sg_2n_fsm.cc:2865] >> node_fail_su_oper 
...
Apr 10 11:00:50.749197 osafamfd [479:sg.cc:1635] TR 
safSg=TEST_SG_2N,safApp=TEST_APP sg_fsm_state 2 => 0
...
Apr 10 11:00:50.749217 osafamfd [479:sg_2n_fsm.cc:3099] << node_fail_su_oper 

8) Now on SC-2, AGENT responds to AMFND for the ACTIVE csiSetCallback, and AMFD
receives this su_si event from AMFND.
But the SG is already STABLE, so no operation is handled for this su_si modify (act:5)
Apr 10 11:00:59.280465 osafamfnd [488:susm.cc:0954] NO Assigned 
'safSi=AGENT,safApp=TEST_APP' ACTIVE to 
'safSu=SU2,safSg=TEST_SG_2N,safApp=TEST_APP'
Apr 10 11:00:59.280681 osafamfd [479:sgproc.cc:0889] >> avd_su_si_assign_evh: 
id:120, node:2020f, act:5, 'safSu=SU2,safSg=TEST_SG_2N,safApp=TEST_APP', 
'safSi=AGENT,safApp=TEST_APP', ha:1, err:1, single:0
...
Apr 10 11:00:59.280737 osafamfd [479:sg_2n_fsm.cc:2361] >> susi_success: 
'safSu=SU2,safSg=TEST_SG_2N,safApp=TEST_APP' act=5, hastate=1, sg_fsm_state=0
Apr 10 11:00:59.280749 osafamfd [479:sg_2n_fsm.cc:2376] EM sg_2n_fsm.cc:2376: 
safSu=SU2,safSg=TEST_SG_2N,safApp=TEST_APP (42)
Apr 10 11:00:59.280752 osafamfd [479:sg_2n_fsm.cc:2562] << susi_success: rc:1
Apr 10 11:00:59.280755 osafamfd [479:sgproc.cc:1405] << avd_su_si_assign_evh 

9) SC-1 comes up, all SIs are assigned STANDBY
Apr 10 11:01:21 SC-1 opensafd: Starting OpenSAF Services (Using TCP)
...
Apr 10 11:01:24 SC-1 osafamfnd[490]: NO Assigning 'safSi=DBO,safApp=TEST_APP' 
STANDBY to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'
Apr 10 11:01:24 SC-1 osafamfnd[490]: NO Assigned 'safSi=DBO,safApp=TEST_APP' 
STANDBY to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'
...
Apr 10 11:01:24 SC-1 osafamfnd[490]: NO Assigning 'safSi=AGENT,safApp=TEST_APP' 
STANDBY to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'
Apr 10 11:01:24 SC-1 osafamfnd[490]: NO Assigned 'safSi=AGENT,safApp=TEST_APP' 
STANDBY to 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP'

10) AMFD-SC2 is informed of SU1's STANDBY assignment
After susi_success(), the SG state is still REALIGN
Apr 10 11:01:24.345208 osafamfd [479:sgproc.cc:0889] >> avd_su_si_assign_evh: 
id:115, node:2010f, act:2, 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP', 
'safSi=AGENT,safApp=TEST_APP', ha:2, err:1, single:0
...
Apr 10 11:01:24.345666 osafamfd [479:sg_2n_fsm.cc:2361] >> susi_success: 
'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP' act=2, hastate=2, sg_fsm_state=1
Apr 10 11:01:24.345669 osafamfd [479:sg_2n_fsm.cc:1446] >> 
susi_success_sg_realign: 'safSu=SU1,safSg=TEST_SG_2N,safApp=TEST_APP' act=2, 
state=2
Apr 10 11:01:24.345672 osafamfd [479:sg_2n_fsm.cc:1865] << 
susi_success_sg_realign: rc:1
Apr 10 11:01:24.345674 osafamfd [479:sg_2n_fsm.cc:2562] << susi_success: rc:1
Apr 10 11:01:24.345678 osafamfd [479:sgproc.cc:1405] << avd_su_si_assign_evh 

11) Finally, the next swap attempt fails
Apr 10 11:03:23.304988 osafamfd [479:si.cc:0821] >> si_admin_op_cb: 
safSi=AFD,safApp=TEST_APP op=7
Apr 10 11:03:23.304997 osafamfd [479:sg_2n_fsm.cc:0757] >> si_swap: 
'safSi=AFD,safApp=TEST_APP' sg_fsm_state=1
Apr 10 11:03:23.305011 osafamfd [479:sg_2n_fsm.cc:0775] ER 
safSi=AFD,safApp=TEST_APP SWAP failed - SG not stable (1)
Apr 10 11:03:23.305013 osafamfd [479:sg_2n_fsm.cc:0857] << si_swap: 
sg_fsm_state=1





---

Sent from sourceforge.net because [email protected] is 
subscribed to https://sourceforge.net/p/opensaf/tickets/

To unsubscribe from further messages, a project admin can change settings at 
https://sourceforge.net/p/opensaf/admin/tickets/options.  Or, if this is a 
mailing list, you can unsubscribe from the mailing list.
------------------------------------------------------------------------------
BPM Camp - Free Virtual Workshop May 6th at 10am PDT/1PM EDT
Develop your own process in accordance with the BPMN 2 standard
Learn Process modeling best practices with Bonita BPM through live exercises
http://www.bonitasoft.com/be-part-of-it/events/bpm-camp-virtual- event?utm_
source=Sourceforge_BPM_Camp_5_6_15&utm_medium=email&utm_campaign=VA_SF
_______________________________________________
Opensaf-tickets mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-tickets

Reply via email to