Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141] V2

2016-12-07 Thread Nagendra Kumar
Ack, but I couldn't test it.

Thanks
-Nagu

> -Original Message-
> From: Minh Hon Chau [mailto:minh.c...@dektech.com.au]
> Sent: 02 December 2016 17:04
> To: hans.nordeb...@ericsson.com; Nagendra Kumar; Praveen Malviya;
> gary@dektech.com.au; minh.c...@dektech.com.au
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM
> track callback [#2141] V2
> 
>  osaf/services/saf/amf/amfd/clm.cc   |  35 ++-
> -
>  osaf/services/saf/amf/amfd/include/cb.h |   1 +
>  osaf/services/saf/amf/amfd/role.cc  |  10 ++--
>  3 files changed, 32 insertions(+), 14 deletions(-)
> 
> 
> V2 Incorporate comments from Praveen, Nagu: Send saClmResponse_4
> if clm cb is received in non-action amfd, retry to stop clm when switch
> over is done. This V2 does not make amfd exit at second retry, only
> logging error as it should succeed.
> 
> In controller failover/switchover, sometimes active AMFD fails to stop
> CLM track callback. Therefore, when this AMFD become standby, AMFD can
> continue receiving CLM track callback and trigger the operations which
> should only be executed in active AMFD.
> 
> diff --git a/osaf/services/saf/amf/amfd/clm.cc
> b/osaf/services/saf/amf/amfd/clm.cc
> --- a/osaf/services/saf/amf/amfd/clm.cc
> +++ b/osaf/services/saf/amf/amfd/clm.cc
> @@ -220,7 +220,11 @@ static void clm_track_cb(const SaClmClus
>   LOG_ER("ClmTrackCallback received in error");
>   goto done;
>   }
> -
> + if (avd_cb->avail_state_avd != SA_AMF_HA_ACTIVE) {
> + LOG_WA("Receive clm track cb with AMFD state(%d)",
> avd_cb->avail_state_avd);
> + saClmResponse_4(avd_cb->clmHandle, invocation,
> SA_CLM_CALLBACK_RESPONSE_OK);
> + goto done;
> + }
>   /*
>   ** The CLM cluster can be larger than the AMF cluster thus it is not
> an
>   ** error if the corresponding AMF node cannot be found.
> @@ -395,6 +399,7 @@ SaAisErrorT avd_clm_init(AVD_CL_CB* cb)
> 
>   cb->clmHandle = 0;
>   cb->clm_sel_obj = 0;
> + cb->is_clm_track_started = false;
>   TRACE_ENTER();
>   /*
>* TODO: This CLM initialization thread can be re-factored
> @@ -454,6 +459,8 @@ SaAisErrorT avd_clm_track_start(void)
>   } else {
>   LOG_ER("Failed to start cluster tracking %u", error);
>   }
> + } else {
> + avd_cb->is_clm_track_started = true;
>   }
>   TRACE_LEAVE();
>   return error;
> @@ -461,17 +468,23 @@ SaAisErrorT avd_clm_track_start(void)
> 
>  SaAisErrorT avd_clm_track_stop(void)
>  {
> -SaAisErrorT error = SA_AIS_OK;
> + SaAisErrorT error = SA_AIS_OK;
> + TRACE_ENTER();
> + error = saClmClusterTrackStop(avd_cb->clmHandle);
> + if (error != SA_AIS_OK) {
> + if (error == SA_AIS_ERR_TRY_AGAIN || error ==
> SA_AIS_ERR_TIMEOUT ||
> + error == SA_AIS_ERR_UNAVAILABLE) {
> + LOG_WA("Failed to stop cluster tracking %u", error);
> + } else {
> + LOG_ER("Failed to stop cluster tracking %u", error);
> + }
> + } else {
> + TRACE("Sucessfully stops cluster tracking");
> + avd_cb->is_clm_track_started = false;
> + }
> 
> -TRACE_ENTER();
> - error = saClmClusterTrackStop(avd_cb->clmHandle);
> -if (SA_AIS_OK != error)
> -LOG_ER("Failed to stop cluster tracking %u", error);
> - else
> - TRACE("Sucessfully stops cluster tracking");
> -
> -TRACE_LEAVE();
> -return error;
> + TRACE_LEAVE();
> + return error;
>  }
> 
>  void clm_node_terminate(AVD_AVND *node)
> diff --git a/osaf/services/saf/amf/amfd/include/cb.h
> b/osaf/services/saf/amf/amfd/include/cb.h
> --- a/osaf/services/saf/amf/amfd/include/cb.h
> +++ b/osaf/services/saf/amf/amfd/include/cb.h
> @@ -215,6 +215,7 @@ typedef struct cl_cb_tag {
>   /* Clm stuff */
>   std::atomic clmHandle;
>   std::atomic clm_sel_obj;
> + bool is_clm_track_started;
> 
>   bool fully_initialized;
>   bool swap_switch; /* true - In middle of role switch. */
> diff --git a/osaf/services/saf/amf/amfd/role.cc
> b/osaf/services/saf/amf/amfd/role.cc
> --- a/osaf/services/saf/amf/amfd/role.cc
> +++ b/osaf/services/saf/amf/amfd/role.cc
> @@ -1055,9 +1055,7 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB
>   /*  Mark AVD as Quiesced. */
>   cb->avail_state_avd = SA_AMF_HA_QUIESCED;
> 
> - if (avd_clm_track_stop() != SA_AIS_OK) {
> - LOG_ER("ClmTrack stop failed");
> - }
> + avd_clm_track_stop();
> 
>   /* Go ahead and set mds role as already the NCS SU has been
> switched */
>   if (NCSCC_RC_SUCCESS != (rc = avd_mds_set_vdest_role(cb,
> SA_AMF_HA_QUIESCED))) {
> @@ -1136,6 +1134,12 @@ uint32_t amfd_switch_qsd_stdby(AVD_CL_CB
>   avd_pg_node_csi_del_all(cb, avnd);
>   }

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141] V2

2016-12-04 Thread minh chau
Hi Nagu, Praveen

This V2 patch is changed according to your latest comments, the only 
diff I don't make amfd exit, and do logging instead.
If it happens and reaches to this error logging, it should be enough to 
get us noticed, and investigate services internally. Reboot node would 
cause error reports from applications. Please help to review and let me 
know if the patch needs to change anything.

Thanks,
Minh

On 02/12/16 22:33, Minh Hon Chau wrote:
>   osaf/services/saf/amf/amfd/clm.cc   |  35 
> ++--
>   osaf/services/saf/amf/amfd/include/cb.h |   1 +
>   osaf/services/saf/amf/amfd/role.cc  |  10 ++--
>   3 files changed, 32 insertions(+), 14 deletions(-)
>
>
> V2 Incorporate comments from Praveen, Nagu: Send saClmResponse_4
> if clm cb is received in non-action amfd, retry to stop clm when switch
> over is done. This V2 does not make amfd exit at second retry, only
> logging error as it should succeed.
>
> In controller failover/switchover, sometimes active AMFD fails to stop
> CLM track callback. Therefore, when this AMFD become standby, AMFD can
> continue receiving CLM track callback and trigger the operations which
> should only be executed in active AMFD.
>
> diff --git a/osaf/services/saf/amf/amfd/clm.cc 
> b/osaf/services/saf/amf/amfd/clm.cc
> --- a/osaf/services/saf/amf/amfd/clm.cc
> +++ b/osaf/services/saf/amf/amfd/clm.cc
> @@ -220,7 +220,11 @@ static void clm_track_cb(const SaClmClus
>   LOG_ER("ClmTrackCallback received in error");
>   goto done;
>   }
> -
> + if (avd_cb->avail_state_avd != SA_AMF_HA_ACTIVE) {
> + LOG_WA("Receive clm track cb with AMFD state(%d)", 
> avd_cb->avail_state_avd);
> + saClmResponse_4(avd_cb->clmHandle, invocation, 
> SA_CLM_CALLBACK_RESPONSE_OK);
> + goto done;
> + }
>   /*
>   ** The CLM cluster can be larger than the AMF cluster thus it is not an
>   ** error if the corresponding AMF node cannot be found.
> @@ -395,6 +399,7 @@ SaAisErrorT avd_clm_init(AVD_CL_CB* cb)
>   
>   cb->clmHandle = 0;
>   cb->clm_sel_obj = 0;
> + cb->is_clm_track_started = false;
>   TRACE_ENTER();
>   /*
>* TODO: This CLM initialization thread can be re-factored
> @@ -454,6 +459,8 @@ SaAisErrorT avd_clm_track_start(void)
>   } else {
>   LOG_ER("Failed to start cluster tracking %u", error);
>   }
> + } else {
> + avd_cb->is_clm_track_started = true;
>   }
>   TRACE_LEAVE();
>   return error;
> @@ -461,17 +468,23 @@ SaAisErrorT avd_clm_track_start(void)
>   
>   SaAisErrorT avd_clm_track_stop(void)
>   {
> -SaAisErrorT error = SA_AIS_OK;
> + SaAisErrorT error = SA_AIS_OK;
> + TRACE_ENTER();
> + error = saClmClusterTrackStop(avd_cb->clmHandle);
> + if (error != SA_AIS_OK) {
> + if (error == SA_AIS_ERR_TRY_AGAIN || error == 
> SA_AIS_ERR_TIMEOUT ||
> + error == SA_AIS_ERR_UNAVAILABLE) {
> + LOG_WA("Failed to stop cluster tracking %u", error);
> + } else {
> + LOG_ER("Failed to stop cluster tracking %u", error);
> + }
> + } else {
> + TRACE("Sucessfully stops cluster tracking");
> + avd_cb->is_clm_track_started = false;
> + }
>   
> -TRACE_ENTER();
> - error = saClmClusterTrackStop(avd_cb->clmHandle);
> -if (SA_AIS_OK != error)
> -LOG_ER("Failed to stop cluster tracking %u", error);
> - else
> - TRACE("Sucessfully stops cluster tracking");
> -
> -TRACE_LEAVE();
> -return error;
> + TRACE_LEAVE();
> + return error;
>   }
>   
>   void clm_node_terminate(AVD_AVND *node)
> diff --git a/osaf/services/saf/amf/amfd/include/cb.h 
> b/osaf/services/saf/amf/amfd/include/cb.h
> --- a/osaf/services/saf/amf/amfd/include/cb.h
> +++ b/osaf/services/saf/amf/amfd/include/cb.h
> @@ -215,6 +215,7 @@ typedef struct cl_cb_tag {
>   /* Clm stuff */
>   std::atomic clmHandle;
>   std::atomic clm_sel_obj;
> + bool is_clm_track_started;
>   
>   bool fully_initialized;
>   bool swap_switch; /* true - In middle of role switch. */
> diff --git a/osaf/services/saf/amf/amfd/role.cc 
> b/osaf/services/saf/amf/amfd/role.cc
> --- a/osaf/services/saf/amf/amfd/role.cc
> +++ b/osaf/services/saf/amf/amfd/role.cc
> @@ -1055,9 +1055,7 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB
>   /*  Mark AVD as Quiesced. */
>   cb->avail_state_avd = SA_AMF_HA_QUIESCED;
>   
> - if (avd_clm_track_stop() != SA_AIS_OK) {
> - LOG_ER("ClmTrack stop failed");
> - }
> + avd_clm_track_stop();
>   
>   /* Go ahead and set mds role as already the NCS SU has been switched */
>   if (NCSCC_RC_SUCCESS != (rc = avd_mds_set_vdest_role(cb, 
> SA_AMF_HA_QUIESCED))) {
> @@ -1136,6 +1134,12 @@ uint32_t amfd_switch_qsd_stdby(AVD_CL_CB
>  

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-12-01 Thread Nagendra Kumar
Hi Gary,
> I’m not sure if reboot the standby node is acceptable. It might be standby 
> for AMF, but active for other SGs.

It is very rare to come upto the point that Standby Amfd has to reboot the node.
The CLM stop API should succeed in normal situations(When it has become 
Standby) if it fails during transition(Act to Quiesced).
I am also saying to retry if it doesn't stop for couple of times.
I don't find any issue with the approach.

Thanks
-Nagu
> -Original Message-
> From: Gary Lee [mailto:gary@dektech.com.au]
> Sent: 01 December 2016 14:26
> To: Nagendra Kumar
> Cc: minh chau; Praveen Malviya; hans.nordeb...@ericsson.com;
> long.hb.ngu...@dektech.com.au; opensaf-devel@lists.sourceforge.net
> Subject: Re: [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM
> track callback [#2141]
> 
> Hi guys
> 
> 
> > On 1 Dec. 2016, at 4:24 pm, Nagendra Kumar 
> wrote:
> >
> > Hi Minh,
> > But fixing small thing is complicating the code. You can just try to
> stop CLM track again in Quisced->Standby transition and even if it fails you
> may try few times as it is Standby and it is not doing much job accept that 
> it is
> receiving some checkpoints. If that fails, it is ok to reboot the Standby 
> node.
> I am sure this is very rare case and the added code may not be getting
> chance to get executed.
> 
> I’m not sure if reboot the standby node is acceptable. It might be standby for
> AMF, but active for other SGs.
> 
> > Also, it is important to investigate why CLM is giving try-again.
> >
> > Thanks
> > -Nagu
> 
> Thanks
> Gary
> 

--
___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-12-01 Thread Gary Lee
Hi guys


> On 1 Dec. 2016, at 4:24 pm, Nagendra Kumar  wrote:
> 
> Hi Minh,
>   But fixing small thing is complicating the code. You can just try to 
> stop CLM track again in Quisced->Standby transition and even if it fails you 
> may try few times as it is Standby and it is not doing much job accept that 
> it is receiving some checkpoints. If that fails, it is ok to reboot the 
> Standby node. I am sure this is very rare case and the added code may not be 
> getting chance to get executed.

I’m not sure if reboot the standby node is acceptable. It might be standby for 
AMF, but active for other SGs.

> Also, it is important to investigate why CLM is giving try-again.
> 
> Thanks
> -Nagu

Thanks
Gary


--
___
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel


Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-30 Thread Nagendra Kumar
Hi Minh,
But fixing small thing is complicating the code. You can just try to 
stop CLM track again in Quisced->Standby transition and even if it fails you 
may try few times as it is Standby and it is not doing much job accept that it 
is receiving some checkpoints. If that fails, it is ok to reboot the Standby 
node. I am sure this is very rare case and the added code may not be getting 
chance to get executed.
Also, it is important to investigate why CLM is giving try-again.

Thanks
-Nagu

> -Original Message-
> From: minh chau [mailto:minh.c...@dektech.com.au]
> Sent: 01 December 2016 10:02
> To: Nagendra Kumar; Praveen Malviya; hans.nordeb...@ericsson.com;
> gary@dektech.com.au; long.hb.ngu...@dektech.com.au
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: Re: [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM
> track callback [#2141]
> 
> Hi Nagu,
> 
> The patch has already had a variable to memorize the track start/stop status.
> Wouldn't it be nice to avoid a node reboot (as consequence of AMFD's
> exit) due to TRY_AGAIN/TIMEOUT which could be classified as temporary
> service denial? And retry to stop tracking until coming back to as active one.
> 
> Thanks,
> Minh
> 
> On 30/11/16 17:25, Nagendra Kumar wrote:
> > Hi Minh,
> > I think the patch could be made simpler by again try to stop CLM
> track(keep a variable to indicate it wasn't stopped before) when AMFD
> becomes Standby. If it doesn't get stopped again, then exit Amfd as it is kind
> of double fault.
> >
> > Thanks
> > -Nagu
> >
> >> -Original Message-
> >> From: minh chau [mailto:minh.c...@dektech.com.au]
> >> Sent: 30 November 2016 10:40
> >> To: praveen malviya;hans.nordeb...@ericsson.com; Nagendra Kumar;
> >> gary@dektech.com.au;long.hb.ngu...@dektech.com.au
> >> Cc:opensaf-devel@lists.sourceforge.net
> >> Subject: Re: [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives
> >> CLM track callback [#2141]
> >>
> >> Hi Praveen,
> >>
> >> I have seen a limitation of using the same job queue now, that it
> >> will increase the possibility of loss of IMM update job which is
> >> causing incorrect headless recovery. The timer is one alternative. We
> >> had spawned a thread to deal with TRY_AGAIN/TIMEOUT in case of start
> >> cluster tracking, so stop cluster tracking should be handled in a similar
> way.
> >> I will create another ticket and assign it to myself to introduce a
> >> thread handler for using CLM service in AMF. Some background/similar
> >> tickets: #1609, #1531. For the scope of this ticket, the floated
> >> patch + a correction from your comment that standby AMFD responds
> CLM
> >> callback, they should be enough to avoid the reported problem in
> >> #2141. Do you agree?
> >>
> >> Thanks,
> >> Minh
> >>
> >>
> >> On 21/11/16 22:37, praveen malviya wrote:
> >>> Hi Minh,
> >>>
> >>> If the intention is to use same job queue then, I think, one problem
> >>> will come:
> >>>   "Currently AMFD empties job queue when AMFD becomes quiesced in
> >>> avd_mds_qsd_role_evh(). Most probably before emptying, AMFD will try
> >>> to stop CLM tracking and now if the API again fails then this
> >>> failure cannot be handled as AMFD cannot be blocked in
> >>> avd_mds_qsd_role_evh() for long time. And then AMFD will empty the
> job queue."
> >>>
> >>> Can we think of some way to avoid above problem? One solution,
> >>> without using job queue, could be: When CLM track fails, run a timer
> >>> of 1 seconds. When this timer expires, check role of AMFD. If the
> >>> role is still standby and Track stop is pending then now AMFD will give a
> try.
> >>> If it again fails then rerun the timer based on the value of return
> >>> code. We can decide how many times this can be tried.
> >>>
> >>> One more thing. on standby controller if clm track has failed then
> >>> it should respond for any track callback that it receives without
> >>> taking any action. This needs to be done irrespective of the
> >>> approach of the solution.
> >>>
> >>> Thanks,
> >>> Praveen
> >>>
> >>> On 21-Nov-16 4:38 PM, minh chau wrote:
>  Hi Praveen,
> 
>  I am thinking if creating a CLM job (likely IMM update, NTF send
>  job) in order to stop tracking at non-active AMFD.
>  Should it be a better version than this patch that AMFD will still
>  have chance to stop tracking until being promoted back to active role?
> 
>  Thanks,
>  Minh
> 
>  On 18/11/16 11:38, minh chau wrote:
> > Hi Praveen,
> >
> > It has been seen once in swapping 2N Opensaf SI. It was just an
> > error indicating that AMFD failed to stop track callback, after
> > that the si-swap also succeeded. CLM service could be busy during
> > switch-over but AMF (as well as other services) is supposed to
> > handle that return code properly as AMFD has done in track start.
> > Further testing, it exposed a problem due to clm track callback
> received on standby AMFD.
> >
> > AMFD 

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-30 Thread minh chau
Hi Nagu,

The patch has already had a variable to memorize the track start/stop 
status.
Wouldn't it be nice to avoid a node reboot (as consequence of AMFD's 
exit) due to TRY_AGAIN/TIMEOUT which could be classified as temporary 
service denial? And retry to stop tracking until coming back to as 
active one.

Thanks,
Minh

On 30/11/16 17:25, Nagendra Kumar wrote:
> Hi Minh,
>   I think the patch could be made simpler by again try to stop CLM 
> track(keep a variable to indicate it wasn't stopped before) when AMFD becomes 
> Standby. If it doesn't get stopped again, then exit Amfd as it is kind of 
> double fault.
>
> Thanks
> -Nagu
>
>> -Original Message-
>> From: minh chau [mailto:minh.c...@dektech.com.au]
>> Sent: 30 November 2016 10:40
>> To: praveen malviya;hans.nordeb...@ericsson.com; Nagendra Kumar;
>> gary@dektech.com.au;long.hb.ngu...@dektech.com.au
>> Cc:opensaf-devel@lists.sourceforge.net
>> Subject: Re: [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM
>> track callback [#2141]
>>
>> Hi Praveen,
>>
>> I have seen a limitation of using the same job queue now, that it will 
>> increase
>> the possibility of loss of IMM update job which is causing incorrect headless
>> recovery. The timer is one alternative. We had spawned a thread to deal with
>> TRY_AGAIN/TIMEOUT in case of start cluster tracking, so stop cluster
>> tracking should be handled in a similar way.
>> I will create another ticket and assign it to myself to introduce a thread
>> handler for using CLM service in AMF. Some background/similar
>> tickets: #1609, #1531. For the scope of this ticket, the floated patch + a
>> correction from your comment that standby AMFD responds CLM callback,
>> they should be enough to avoid the reported problem in #2141. Do you
>> agree?
>>
>> Thanks,
>> Minh
>>
>>
>> On 21/11/16 22:37, praveen malviya wrote:
>>> Hi Minh,
>>>
>>> If the intention is to use same job queue then, I think, one problem
>>> will come:
>>>   "Currently AMFD empties job queue when AMFD becomes quiesced in
>>> avd_mds_qsd_role_evh(). Most probably before emptying, AMFD will try
>>> to stop CLM tracking and now if the API again fails then this failure
>>> cannot be handled as AMFD cannot be blocked in avd_mds_qsd_role_evh()
>>> for long time. And then AMFD will empty the job queue."
>>>
>>> Can we think of some way to avoid above problem? One solution, without
>>> using job queue, could be: When CLM track fails, run a timer of 1
>>> seconds. When this timer expires, check role of AMFD. If the role is
>>> still standby and Track stop is pending then now AMFD will give a try.
>>> If it again fails then rerun the timer based on the value of return
>>> code. We can decide how many times this can be tried.
>>>
>>> One more thing. on standby controller if clm track has failed then it
>>> should respond for any track callback that it receives without taking
>>> any action. This needs to be done irrespective of the approach of the
>>> solution.
>>>
>>> Thanks,
>>> Praveen
>>>
>>> On 21-Nov-16 4:38 PM, minh chau wrote:
 Hi Praveen,

 I am thinking if creating a CLM job (likely IMM update, NTF send job) in
 order to stop tracking at non-active AMFD.
 Should it be a better version than this patch that AMFD will still have
 chance to stop tracking until being promoted back to active role?

 Thanks,
 Minh

 On 18/11/16 11:38, minh chau wrote:
> Hi Praveen,
>
> It has been seen once in swapping 2N Opensaf SI. It was just an error
> indicating that AMFD failed to stop track callback, after that the
> si-swap also succeeded. CLM service could be busy during switch-over
> but AMF (as well as other services) is supposed to handle that return
> code properly as AMFD has done in track start. Further testing, it
> exposed a problem due to clm track callback received on standby AMFD.
>
> AMFD can not be too busy just try to stop clm track. First option,
> AMFD can start another thread which just tries to stop track callback,
> but that's likely to generate more problems where this standby AMFD is
> promoted back to active in the meantime that thread of stopping track
> is on going. At that time, main thread will be starting track and
> another thread is trying to stop track.
>
> Second option as in the patch, AMFD retries to stop track where
> problem happens in callback. At the time standby AMFD switching back
> to active, we stop and start track again, but it seems no point of
> doing this. I hope it's less impact on existing behavior, do you see
> any problems?
>
> Regarding standby controller with locked/disabled CLM, I think it is
> mentioned (in #1828 or some where else?) that this matter requires
> more thoughts as whether we want Ha role assigned to AMFD in non
> member node. It seems to beyond this ticket.
>
> Thanks,
> Minh
>
> On 17/11/16 22:03, 

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-29 Thread minh chau
Hi Praveen,

I have seen a limitation of using the same job queue now, that it will 
increase the possibility of loss of IMM update job which is causing 
incorrect headless recovery. The timer is one alternative. We had 
spawned a thread to deal with TRY_AGAIN/TIMEOUT in case of start cluster 
tracking, so stop cluster tracking should be handled in a similar way.
I will create another ticket and assign it to myself to introduce a 
thread handler for using CLM service in AMF. Some background/similar 
tickets: #1609, #1531. For the scope of this ticket, the floated patch + 
a correction from your comment that standby AMFD responds CLM callback, 
they should be enough to avoid the reported problem in #2141. Do you agree?

Thanks,
Minh


On 21/11/16 22:37, praveen malviya wrote:
> Hi Minh,
>
> If the intention is to use same job queue then, I think, one problem 
> will come:
>  "Currently AMFD empties job queue when AMFD becomes quiesced in 
> avd_mds_qsd_role_evh(). Most probably before emptying, AMFD will try 
> to stop CLM tracking and now if the API again fails then this failure 
> cannot be handled as AMFD cannot be blocked in avd_mds_qsd_role_evh() 
> for long time. And then AMFD will empty the job queue."
>
> Can we think of some way to avoid above problem? One solution, without 
> using job queue, could be: When CLM track fails, run a timer of 1 
> seconds. When this timer expires, check role of AMFD. If the role is 
> still standby and Track stop is pending then now AMFD will give a try. 
> If it again fails then rerun the timer based on the value of return 
> code. We can decide how many times this can be tried.
>
> One more thing. on standby controller if clm track has failed then it 
> should respond for any track callback that it receives without taking 
> any action. This needs to be done irrespective of the approach of the 
> solution.
>
> Thanks,
> Praveen
>
> On 21-Nov-16 4:38 PM, minh chau wrote:
>> Hi Praveen,
>>
>> I am thinking if creating a CLM job (likely IMM update, NTF send job) in
>> order to stop tracking at non-active AMFD.
>> Should it be a better version than this patch that AMFD will still have
>> chance to stop tracking until being promoted back to active role?
>>
>> Thanks,
>> Minh
>>
>> On 18/11/16 11:38, minh chau wrote:
>>> Hi Praveen,
>>>
>>> It has been seen once in swapping 2N Opensaf SI. It was just an error
>>> indicating that AMFD failed to stop track callback, after that the
>>> si-swap also succeeded. CLM service could be busy during switch-over
>>> but AMF (as well as other services) is supposed to handle that return
>>> code properly as AMFD has done in track start. Further testing, it
>>> exposed a problem due to clm track callback received on standby AMFD.
>>>
>>> AMFD can not be too busy just try to stop clm track. First option,
>>> AMFD can start another thread which just tries to stop track callback,
>>> but that's likely to generate more problems where this standby AMFD is
>>> promoted back to active in the meantime that thread of stopping track
>>> is on going. At that time, main thread will be starting track and
>>> another thread is trying to stop track.
>>>
>>> Second option as in the patch, AMFD retries to stop track where
>>> problem happens in callback. At the time standby AMFD switching back
>>> to active, we stop and start track again, but it seems no point of
>>> doing this. I hope it's less impact on existing behavior, do you see
>>> any problems?
>>>
>>> Regarding standby controller with locked/disabled CLM, I think it is
>>> mentioned (in #1828 or some where else?) that this matter requires
>>> more thoughts as whether we want Ha role assigned to AMFD in non
>>> member node. It seems to beyond this ticket.
>>>
>>> Thanks,
>>> Minh
>>>
>>> On 17/11/16 22:03, praveen malviya wrote:
 Hi Minh,

 We have not seen this issue earlier. Why CLMA is returning timeout
 where it is busy?
 If we are allowing Standby controller to continue tracking the Node,
 then I think (though not fully sure), it will increase one more
 client for validation step and this client does not exist locally.
 Also we need to evaluate the situation where Standby controller is
 CLM locked or disabled.

 Thanks,
 Praveen




 On 17-Nov-16 3:44 AM, minh chau wrote:
> Hi all,
>
> Has anyone had chance to review this patch?
>
> Thanks,
> Minh
>
> On 14/11/16 15:27, Minh Hon Chau wrote:
>> osaf/services/saf/amf/amfd/clm.cc |  37
>> +++-
>>   osaf/services/saf/amf/amfd/include/cb.h |   1 +
>>   osaf/services/saf/amf/amfd/role.cc  |  16 +++---
>>   3 files changed, 35 insertions(+), 19 deletions(-)
>>
>>
>> In controller failover/switchover, sometimes active AMFD fails to 
>> stop
>> CLM track callback. Therefore, when this AMFD become standby, 
>> AMFD can
>> continue receiving CLM track callback and 

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-21 Thread praveen malviya
Hi Minh,

If the intention is to use same job queue then, I think, one problem 
will come:
  "Currently AMFD empties job queue when AMFD becomes quiesced in 
avd_mds_qsd_role_evh(). Most probably before emptying, AMFD will try to 
stop CLM tracking and now if the API again fails then this failure 
cannot be handled as AMFD cannot be blocked in avd_mds_qsd_role_evh() 
for long time. And then AMFD will empty the job queue."

Can we think of some way to avoid above problem? One solution, without 
using job queue, could be: When CLM track fails, run a timer of 1 
seconds. When this timer expires, check role of AMFD. If the role is 
still standby and Track stop is pending then now AMFD will give a try. 
If it again fails then rerun the timer based on the value of return 
code. We can decide how many times this can be tried.

One more thing. on standby controller if clm track has failed then it 
should respond for any track callback that it receives without taking 
any action. This needs to be done irrespective of the approach of the 
solution.

Thanks,
Praveen

On 21-Nov-16 4:38 PM, minh chau wrote:
> Hi Praveen,
>
> I am thinking if creating a CLM job (likely IMM update, NTF send job) in
> order to stop tracking at non-active AMFD.
> Should it be a better version than this patch that AMFD will still have
> chance to stop tracking until being promoted back to active role?
>
> Thanks,
> Minh
>
> On 18/11/16 11:38, minh chau wrote:
>> Hi Praveen,
>>
>> It has been seen once in swapping 2N Opensaf SI. It was just an error
>> indicating that AMFD failed to stop track callback, after that the
>> si-swap also succeeded. CLM service could be busy during switch-over
>> but AMF (as well as other services) is supposed to handle that return
>> code properly as AMFD has done in track start. Further testing, it
>> exposed a problem due to clm track callback received on standby AMFD.
>>
>> AMFD can not be too busy just try to stop clm track. First option,
>> AMFD can start another thread which just tries to stop track callback,
>> but that's likely to generate more problems where this standby AMFD is
>> promoted back to active in the meantime that thread of stopping track
>> is on going. At that time, main thread will be starting track and
>> another thread is trying to stop track.
>>
>> Second option as in the patch, AMFD retries to stop track where
>> problem happens in callback. At the time standby AMFD switching back
>> to active, we stop and start track again, but it seems no point of
>> doing this. I hope it's less impact on existing behavior, do you see
>> any problems?
>>
>> Regarding standby controller with locked/disabled CLM, I think it is
>> mentioned (in #1828 or some where else?) that this matter requires
>> more thoughts as whether we want Ha role assigned to AMFD in non
>> member node. It seems to beyond this ticket.
>>
>> Thanks,
>> Minh
>>
>> On 17/11/16 22:03, praveen malviya wrote:
>>> Hi Minh,
>>>
>>> We have not seen this issue earlier. Why CLMA is returning timeout
>>> where it is busy?
>>> If we are allowing Standby controller to continue tracking the Node,
>>> then I think (though not fully sure), it will increase one more
>>> client for validation step and this client does not exist locally.
>>> Also we need to evaluate the situation where Standby controller is
>>> CLM locked or disabled.
>>>
>>> Thanks,
>>> Praveen
>>>
>>>
>>>
>>>
>>> On 17-Nov-16 3:44 AM, minh chau wrote:
 Hi all,

 Has anyone had chance to review this patch?

 Thanks,
 Minh

 On 14/11/16 15:27, Minh Hon Chau wrote:
> osaf/services/saf/amf/amfd/clm.cc |  37
> +++-
>   osaf/services/saf/amf/amfd/include/cb.h |   1 +
>   osaf/services/saf/amf/amfd/role.cc  |  16 +++---
>   3 files changed, 35 insertions(+), 19 deletions(-)
>
>
> In controller failover/switchover, sometimes active AMFD fails to stop
> CLM track callback. Therefore, when this AMFD become standby, AMFD can
> continue receiving CLM track callback and trigger the operations which
> should only be executed in active AMFD.
>
> diff --git a/osaf/services/saf/amf/amfd/clm.cc
> b/osaf/services/saf/amf/amfd/clm.cc
> --- a/osaf/services/saf/amf/amfd/clm.cc
> +++ b/osaf/services/saf/amf/amfd/clm.cc
> @@ -219,7 +219,13 @@ static void clm_track_cb(const SaClmClus
>   LOG_ER("ClmTrackCallback received in error");
>   goto done;
>   }
> -
> +if (avd_cb->avail_state_avd != SA_AMF_HA_ACTIVE) {
> +if (avd_cb->is_clm_track_started == true) {
> +LOG_NO("Retry to stop clm track with AMFD state(%d)",
> avd_cb->avail_state_avd);
> +avd_clm_track_stop();
> +}
> +goto done;
> +}
>   /*
>   ** The CLM cluster can be larger than the AMF cluster thus it is
> not an
>   ** error if the corresponding AMF node 

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-21 Thread minh chau
Hi Praveen,

I am thinking if creating a CLM job (likely IMM update, NTF send job) in 
order to stop tracking at non-active AMFD.
Should it be a better version than this patch that AMFD will still have 
chance to stop tracking until being promoted back to active role?

Thanks,
Minh

On 18/11/16 11:38, minh chau wrote:
> Hi Praveen,
>
> It has been seen once in swapping 2N Opensaf SI. It was just an error 
> indicating that AMFD failed to stop track callback, after that the 
> si-swap also succeeded. CLM service could be busy during switch-over 
> but AMF (as well as other services) is supposed to handle that return 
> code properly as AMFD has done in track start. Further testing, it 
> exposed a problem due to clm track callback received on standby AMFD.
>
> AMFD can not be too busy just try to stop clm track. First option, 
> AMFD can start another thread which just tries to stop track callback, 
> but that's likely to generate more problems where this standby AMFD is 
> promoted back to active in the meantime that thread of stopping track 
> is on going. At that time, main thread will be starting track and 
> another thread is trying to stop track.
>
> Second option as in the patch, AMFD retries to stop track where 
> problem happens in callback. At the time standby AMFD switching back 
> to active, we stop and start track again, but it seems no point of 
> doing this. I hope it's less impact on existing behavior, do you see 
> any problems?
>
> Regarding standby controller with locked/disabled CLM, I think it is 
> mentioned (in #1828 or some where else?) that this matter requires 
> more thoughts as whether we want Ha role assigned to AMFD in non 
> member node. It seems to beyond this ticket.
>
> Thanks,
> Minh
>
> On 17/11/16 22:03, praveen malviya wrote:
>> Hi Minh,
>>
>> We have not seen this issue earlier. Why CLMA is returning timeout 
>> where it is busy?
>> If we are allowing Standby controller to continue tracking the Node, 
>> then I think (though not fully sure), it will increase one more 
>> client for validation step and this client does not exist locally. 
>> Also we need to evaluate the situation where Standby controller is 
>> CLM locked or disabled.
>>
>> Thanks,
>> Praveen
>>
>>
>>
>>
>> On 17-Nov-16 3:44 AM, minh chau wrote:
>>> Hi all,
>>>
>>> Has anyone had chance to review this patch?
>>>
>>> Thanks,
>>> Minh
>>>
>>> On 14/11/16 15:27, Minh Hon Chau wrote:
 osaf/services/saf/amf/amfd/clm.cc |  37
 +++-
   osaf/services/saf/amf/amfd/include/cb.h |   1 +
   osaf/services/saf/amf/amfd/role.cc  |  16 +++---
   3 files changed, 35 insertions(+), 19 deletions(-)


 In controller failover/switchover, sometimes active AMFD fails to stop
 CLM track callback. Therefore, when this AMFD become standby, AMFD can
 continue receiving CLM track callback and trigger the operations which
 should only be executed in active AMFD.

 diff --git a/osaf/services/saf/amf/amfd/clm.cc
 b/osaf/services/saf/amf/amfd/clm.cc
 --- a/osaf/services/saf/amf/amfd/clm.cc
 +++ b/osaf/services/saf/amf/amfd/clm.cc
 @@ -219,7 +219,13 @@ static void clm_track_cb(const SaClmClus
   LOG_ER("ClmTrackCallback received in error");
   goto done;
   }
 -
 +if (avd_cb->avail_state_avd != SA_AMF_HA_ACTIVE) {
 +if (avd_cb->is_clm_track_started == true) {
 +LOG_NO("Retry to stop clm track with AMFD state(%d)",
 avd_cb->avail_state_avd);
 +avd_clm_track_stop();
 +}
 +goto done;
 +}
   /*
   ** The CLM cluster can be larger than the AMF cluster thus it is
 not an
   ** error if the corresponding AMF node cannot be found.
 @@ -394,6 +400,7 @@ SaAisErrorT avd_clm_init(AVD_CL_CB* cb)
 cb->clmHandle = 0;
   cb->clm_sel_obj = 0;
 +cb->is_clm_track_started = false;
   TRACE_ENTER();
   /*
* TODO: This CLM initialization thread can be re-factored
 @@ -453,6 +460,8 @@ SaAisErrorT avd_clm_track_start(void)
   } else {
   LOG_ER("Failed to start cluster tracking %u", error);
   }
 +} else {
 +avd_cb->is_clm_track_started = true;
   }
   TRACE_LEAVE();
   return error;
 @@ -460,17 +469,23 @@ SaAisErrorT avd_clm_track_start(void)
 SaAisErrorT avd_clm_track_stop(void)
   {
 -SaAisErrorT error = SA_AIS_OK;
 +SaAisErrorT error = SA_AIS_OK;
 +TRACE_ENTER();
 +error = saClmClusterTrackStop(avd_cb->clmHandle);
 +if (error != SA_AIS_OK) {
 +if (error == SA_AIS_ERR_TRY_AGAIN || error ==
 SA_AIS_ERR_TIMEOUT ||
 +error == SA_AIS_ERR_UNAVAILABLE) {
 +LOG_WA("Failed to stop cluster tracking %u", error);
 +} else {
 +

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-17 Thread minh chau
Hi Praveen,

It has been seen once in swapping 2N Opensaf SI. It was just an error 
indicating that AMFD failed to stop track callback, after that the 
si-swap also succeeded. CLM service could be busy during switch-over but 
AMF (as well as other services) is supposed to handle that return code 
properly as AMFD has done in track start. Further testing, it exposed a 
problem due to clm track callback received on standby AMFD.

AMFD can not be too busy just try to stop clm track. First option, AMFD 
can start another thread which just tries to stop track callback, but 
that's likely to generate more problems where this standby AMFD is 
promoted back to active in the meantime that thread of stopping track is 
on going. At that time, main thread will be starting track and another 
thread is trying to stop track.

Second option as in the patch, AMFD retries to stop track where problem 
happens in callback. At the time standby AMFD switching back to active, 
we stop and start track again, but it seems no point of doing this. I 
hope it's less impact on existing behavior, do you see any problems?

Regarding standby controller with locked/disabled CLM, I think it is 
mentioned (in #1828 or some where else?) that this matter requires more 
thoughts as whether we want Ha role assigned to AMFD in non member node. 
It seems to beyond this ticket.

Thanks,
Minh

On 17/11/16 22:03, praveen malviya wrote:
> Hi Minh,
>
> We have not seen this issue earlier. Why CLMA is returning timeout 
> where it is busy?
> If we are allowing Standby controller to continue tracking the Node, 
> then I think (though not fully sure), it will increase one more client 
> for validation step and this client does not exist locally. Also we 
> need to evaluate the situation where Standby controller is CLM locked 
> or disabled.
>
> Thanks,
> Praveen
>
>
>
>
> On 17-Nov-16 3:44 AM, minh chau wrote:
>> Hi all,
>>
>> Has anyone had chance to review this patch?
>>
>> Thanks,
>> Minh
>>
>> On 14/11/16 15:27, Minh Hon Chau wrote:
>>> osaf/services/saf/amf/amfd/clm.cc   |  37
>>> +++-
>>>   osaf/services/saf/amf/amfd/include/cb.h |   1 +
>>>   osaf/services/saf/amf/amfd/role.cc  |  16 +++---
>>>   3 files changed, 35 insertions(+), 19 deletions(-)
>>>
>>>
>>> In controller failover/switchover, sometimes active AMFD fails to stop
>>> CLM track callback. Therefore, when this AMFD become standby, AMFD can
>>> continue receiving CLM track callback and trigger the operations which
>>> should only be executed in active AMFD.
>>>
>>> diff --git a/osaf/services/saf/amf/amfd/clm.cc
>>> b/osaf/services/saf/amf/amfd/clm.cc
>>> --- a/osaf/services/saf/amf/amfd/clm.cc
>>> +++ b/osaf/services/saf/amf/amfd/clm.cc
>>> @@ -219,7 +219,13 @@ static void clm_track_cb(const SaClmClus
>>>   LOG_ER("ClmTrackCallback received in error");
>>>   goto done;
>>>   }
>>> -
>>> +if (avd_cb->avail_state_avd != SA_AMF_HA_ACTIVE) {
>>> +if (avd_cb->is_clm_track_started == true) {
>>> +LOG_NO("Retry to stop clm track with AMFD state(%d)",
>>> avd_cb->avail_state_avd);
>>> +avd_clm_track_stop();
>>> +}
>>> +goto done;
>>> +}
>>>   /*
>>>   ** The CLM cluster can be larger than the AMF cluster thus it is
>>> not an
>>>   ** error if the corresponding AMF node cannot be found.
>>> @@ -394,6 +400,7 @@ SaAisErrorT avd_clm_init(AVD_CL_CB* cb)
>>> cb->clmHandle = 0;
>>>   cb->clm_sel_obj = 0;
>>> +cb->is_clm_track_started = false;
>>>   TRACE_ENTER();
>>>   /*
>>>* TODO: This CLM initialization thread can be re-factored
>>> @@ -453,6 +460,8 @@ SaAisErrorT avd_clm_track_start(void)
>>>   } else {
>>>   LOG_ER("Failed to start cluster tracking %u", error);
>>>   }
>>> +} else {
>>> +avd_cb->is_clm_track_started = true;
>>>   }
>>>   TRACE_LEAVE();
>>>   return error;
>>> @@ -460,17 +469,23 @@ SaAisErrorT avd_clm_track_start(void)
>>> SaAisErrorT avd_clm_track_stop(void)
>>>   {
>>> -SaAisErrorT error = SA_AIS_OK;
>>> +SaAisErrorT error = SA_AIS_OK;
>>> +TRACE_ENTER();
>>> +error = saClmClusterTrackStop(avd_cb->clmHandle);
>>> +if (error != SA_AIS_OK) {
>>> +if (error == SA_AIS_ERR_TRY_AGAIN || error ==
>>> SA_AIS_ERR_TIMEOUT ||
>>> +error == SA_AIS_ERR_UNAVAILABLE) {
>>> +LOG_WA("Failed to stop cluster tracking %u", error);
>>> +} else {
>>> +LOG_ER("Failed to stop cluster tracking %u", error);
>>> +}
>>> +} else {
>>> +TRACE("Sucessfully stops cluster tracking");
>>> +avd_cb->is_clm_track_started = false;
>>> +}
>>>   -TRACE_ENTER();
>>> -error = saClmClusterTrackStop(avd_cb->clmHandle);
>>> -if (SA_AIS_OK != error)
>>> -LOG_ER("Failed to stop cluster tracking %u", error);
>>> -else
>>> -TRACE("Sucessfully stops 

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-17 Thread praveen malviya
Hi Minh,

We have not seen this issue earlier. Why CLMA is returning timeout where 
it is busy?
If we are allowing Standby controller to continue tracking the Node, 
then I think (though not fully sure), it will increase one more client 
for validation step and this client does not exist locally. Also we need 
to evaluate the situation where Standby controller is CLM locked or 
disabled.

Thanks,
Praveen




On 17-Nov-16 3:44 AM, minh chau wrote:
> Hi all,
>
> Has anyone had chance to review this patch?
>
> Thanks,
> Minh
>
> On 14/11/16 15:27, Minh Hon Chau wrote:
>>   osaf/services/saf/amf/amfd/clm.cc   |  37
>> +++-
>>   osaf/services/saf/amf/amfd/include/cb.h |   1 +
>>   osaf/services/saf/amf/amfd/role.cc  |  16 +++---
>>   3 files changed, 35 insertions(+), 19 deletions(-)
>>
>>
>> In controller failover/switchover, sometimes active AMFD fails to stop
>> CLM track callback. Therefore, when this AMFD become standby, AMFD can
>> continue receiving CLM track callback and trigger the operations which
>> should only be executed in active AMFD.
>>
>> diff --git a/osaf/services/saf/amf/amfd/clm.cc
>> b/osaf/services/saf/amf/amfd/clm.cc
>> --- a/osaf/services/saf/amf/amfd/clm.cc
>> +++ b/osaf/services/saf/amf/amfd/clm.cc
>> @@ -219,7 +219,13 @@ static void clm_track_cb(const SaClmClus
>>   LOG_ER("ClmTrackCallback received in error");
>>   goto done;
>>   }
>> -
>> +if (avd_cb->avail_state_avd != SA_AMF_HA_ACTIVE) {
>> +if (avd_cb->is_clm_track_started == true) {
>> +LOG_NO("Retry to stop clm track with AMFD state(%d)",
>> avd_cb->avail_state_avd);
>> +avd_clm_track_stop();
>> +}
>> +goto done;
>> +}
>>   /*
>>   ** The CLM cluster can be larger than the AMF cluster thus it is
>> not an
>>   ** error if the corresponding AMF node cannot be found.
>> @@ -394,6 +400,7 @@ SaAisErrorT avd_clm_init(AVD_CL_CB* cb)
>> cb->clmHandle = 0;
>>   cb->clm_sel_obj = 0;
>> +cb->is_clm_track_started = false;
>>   TRACE_ENTER();
>>   /*
>>* TODO: This CLM initialization thread can be re-factored
>> @@ -453,6 +460,8 @@ SaAisErrorT avd_clm_track_start(void)
>>   } else {
>>   LOG_ER("Failed to start cluster tracking %u", error);
>>   }
>> +} else {
>> +avd_cb->is_clm_track_started = true;
>>   }
>>   TRACE_LEAVE();
>>   return error;
>> @@ -460,17 +469,23 @@ SaAisErrorT avd_clm_track_start(void)
>> SaAisErrorT avd_clm_track_stop(void)
>>   {
>> -SaAisErrorT error = SA_AIS_OK;
>> +SaAisErrorT error = SA_AIS_OK;
>> +TRACE_ENTER();
>> +error = saClmClusterTrackStop(avd_cb->clmHandle);
>> +if (error != SA_AIS_OK) {
>> +if (error == SA_AIS_ERR_TRY_AGAIN || error ==
>> SA_AIS_ERR_TIMEOUT ||
>> +error == SA_AIS_ERR_UNAVAILABLE) {
>> +LOG_WA("Failed to stop cluster tracking %u", error);
>> +} else {
>> +LOG_ER("Failed to stop cluster tracking %u", error);
>> +}
>> +} else {
>> +TRACE("Sucessfully stops cluster tracking");
>> +avd_cb->is_clm_track_started = false;
>> +}
>>   -TRACE_ENTER();
>> -error = saClmClusterTrackStop(avd_cb->clmHandle);
>> -if (SA_AIS_OK != error)
>> -LOG_ER("Failed to stop cluster tracking %u", error);
>> -else
>> -TRACE("Sucessfully stops cluster tracking");
>> -
>> -TRACE_LEAVE();
>> -return error;
>> +TRACE_LEAVE();
>> +return error;
>>   }
>> void clm_node_terminate(AVD_AVND *node)
>> diff --git a/osaf/services/saf/amf/amfd/include/cb.h
>> b/osaf/services/saf/amf/amfd/include/cb.h
>> --- a/osaf/services/saf/amf/amfd/include/cb.h
>> +++ b/osaf/services/saf/amf/amfd/include/cb.h
>> @@ -210,6 +210,7 @@ typedef struct cl_cb_tag {
>>   /* Clm stuff */
>>   std::atomic clmHandle;
>>   std::atomic clm_sel_obj;
>> +bool is_clm_track_started;
>> bool fully_initialized;
>>   bool swap_switch; /* true - In middle of role switch. */
>> diff --git a/osaf/services/saf/amf/amfd/role.cc
>> b/osaf/services/saf/amf/amfd/role.cc
>> --- a/osaf/services/saf/amf/amfd/role.cc
>> +++ b/osaf/services/saf/amf/amfd/role.cc
>> @@ -1050,9 +1050,7 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB
>>   /*  Mark AVD as Quiesced. */
>>   cb->avail_state_avd = SA_AMF_HA_QUIESCED;
>>
>> -if (avd_clm_track_stop() != SA_AIS_OK) {
>> -LOG_ER("ClmTrack stop failed");
>> -}
>> +avd_clm_track_stop();
>> /* Go ahead and set mds role as already the NCS SU has been
>> switched */
>>   if (NCSCC_RC_SUCCESS != (rc = avd_mds_set_vdest_role(cb,
>> SA_AMF_HA_QUIESCED))) {
>> @@ -1260,11 +1258,13 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_C
>>   if (NCSCC_RC_SUCCESS != avd_rde_set_role(SA_AMF_HA_ACTIVE)) {
>>   LOG_ER("rde role change failed from stdy -> Active");
>>   }
>> -
>> - 

Re: [devel] [PATCH 1 of 1] AMFD: Do not let non-active AMFD receives CLM track callback [#2141]

2016-11-16 Thread minh chau
Hi all,

Has anyone had chance to review this patch?

Thanks,
Minh

On 14/11/16 15:27, Minh Hon Chau wrote:
>   osaf/services/saf/amf/amfd/clm.cc   |  37 
> +++-
>   osaf/services/saf/amf/amfd/include/cb.h |   1 +
>   osaf/services/saf/amf/amfd/role.cc  |  16 +++---
>   3 files changed, 35 insertions(+), 19 deletions(-)
>
>
> In controller failover/switchover, sometimes active AMFD fails to stop
> CLM track callback. Therefore, when this AMFD become standby, AMFD can
> continue receiving CLM track callback and trigger the operations which
> should only be executed in active AMFD.
>
> diff --git a/osaf/services/saf/amf/amfd/clm.cc 
> b/osaf/services/saf/amf/amfd/clm.cc
> --- a/osaf/services/saf/amf/amfd/clm.cc
> +++ b/osaf/services/saf/amf/amfd/clm.cc
> @@ -219,7 +219,13 @@ static void clm_track_cb(const SaClmClus
>   LOG_ER("ClmTrackCallback received in error");
>   goto done;
>   }
> -
> + if (avd_cb->avail_state_avd != SA_AMF_HA_ACTIVE) {
> + if (avd_cb->is_clm_track_started == true) {
> + LOG_NO("Retry to stop clm track with AMFD state(%d)", 
> avd_cb->avail_state_avd);
> + avd_clm_track_stop();
> + }
> + goto done;
> + }
>   /*
>   ** The CLM cluster can be larger than the AMF cluster thus it is not an
>   ** error if the corresponding AMF node cannot be found.
> @@ -394,6 +400,7 @@ SaAisErrorT avd_clm_init(AVD_CL_CB* cb)
>   
>   cb->clmHandle = 0;
>   cb->clm_sel_obj = 0;
> + cb->is_clm_track_started = false;
>   TRACE_ENTER();
>   /*
>* TODO: This CLM initialization thread can be re-factored
> @@ -453,6 +460,8 @@ SaAisErrorT avd_clm_track_start(void)
>   } else {
>   LOG_ER("Failed to start cluster tracking %u", error);
>   }
> + } else {
> + avd_cb->is_clm_track_started = true;
>   }
>   TRACE_LEAVE();
>   return error;
> @@ -460,17 +469,23 @@ SaAisErrorT avd_clm_track_start(void)
>   
>   SaAisErrorT avd_clm_track_stop(void)
>   {
> -SaAisErrorT error = SA_AIS_OK;
> + SaAisErrorT error = SA_AIS_OK;
> + TRACE_ENTER();
> + error = saClmClusterTrackStop(avd_cb->clmHandle);
> + if (error != SA_AIS_OK) {
> + if (error == SA_AIS_ERR_TRY_AGAIN || error == 
> SA_AIS_ERR_TIMEOUT ||
> + error == SA_AIS_ERR_UNAVAILABLE) {
> + LOG_WA("Failed to stop cluster tracking %u", error);
> + } else {
> + LOG_ER("Failed to stop cluster tracking %u", error);
> + }
> + } else {
> + TRACE("Sucessfully stops cluster tracking");
> + avd_cb->is_clm_track_started = false;
> + }
>   
> -TRACE_ENTER();
> - error = saClmClusterTrackStop(avd_cb->clmHandle);
> -if (SA_AIS_OK != error)
> -LOG_ER("Failed to stop cluster tracking %u", error);
> - else
> - TRACE("Sucessfully stops cluster tracking");
> -
> -TRACE_LEAVE();
> -return error;
> + TRACE_LEAVE();
> + return error;
>   }
>   
>   void clm_node_terminate(AVD_AVND *node)
> diff --git a/osaf/services/saf/amf/amfd/include/cb.h 
> b/osaf/services/saf/amf/amfd/include/cb.h
> --- a/osaf/services/saf/amf/amfd/include/cb.h
> +++ b/osaf/services/saf/amf/amfd/include/cb.h
> @@ -210,6 +210,7 @@ typedef struct cl_cb_tag {
>   /* Clm stuff */
>   std::atomic clmHandle;
>   std::atomic clm_sel_obj;
> + bool is_clm_track_started;
>   
>   bool fully_initialized;
>   bool swap_switch; /* true - In middle of role switch. */
> diff --git a/osaf/services/saf/amf/amfd/role.cc 
> b/osaf/services/saf/amf/amfd/role.cc
> --- a/osaf/services/saf/amf/amfd/role.cc
> +++ b/osaf/services/saf/amf/amfd/role.cc
> @@ -1050,9 +1050,7 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB
>   /*  Mark AVD as Quiesced. */
>   cb->avail_state_avd = SA_AMF_HA_QUIESCED;
>   
> - if (avd_clm_track_stop() != SA_AIS_OK) {
> - LOG_ER("ClmTrack stop failed");
> - }
> + avd_clm_track_stop();
>   
>   /* Go ahead and set mds role as already the NCS SU has been switched */
>   if (NCSCC_RC_SUCCESS != (rc = avd_mds_set_vdest_role(cb, 
> SA_AMF_HA_QUIESCED))) {
> @@ -1260,11 +1258,13 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_C
>   if (NCSCC_RC_SUCCESS != avd_rde_set_role(SA_AMF_HA_ACTIVE)) {
>   LOG_ER("rde role change failed from stdy -> Active");
>   }
> -
> - if(avd_clm_track_start() != SA_AIS_OK) {
> - LOG_ER("Switch Standby --> Active, clm track start failed");
> - avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
> - return NCSCC_RC_FAILURE;
> + // reuse clm track start
> + if (avd_cb->is_clm_track_started == false) {
> + if(avd_clm_track_start() != SA_AIS_OK) {
> +