Hi all,

Have you had time to review this patch?
It changes the component failover sequence, so I think we need more time 
to look at it.

Thanks,
Minh

On 23/01/17 12:28, Minh Hon Chau wrote:
>   src/amf/amfnd/avnd_su.h |   1 +
>   src/amf/amfnd/clc.cc    |   3 ---
>   src/amf/amfnd/di.cc     |  12 +++++++++++-
>   src/amf/amfnd/susm.cc   |  32 +++++++++++++++++++++++++++++---
>   4 files changed, 41 insertions(+), 7 deletions(-)
>
>
> In case of component failover, the faulty component will be terminated. When
> the reinstantiation is done, amfnd will send su_oper_message (enabled) to
> amfd concurrently with the component failover. In the reported problem, if
> su_oper_message (enabled) reaches amfd before the quiesced assignment
> response (part of the component failover sequence) does, then this quiesced
> assignment response is ignored, and thus the component failover will not
> finish.
>
> The problem is in function susi_success_sg_realign with act=5, state=3: amfd
> always assumes that the su having the faulty component is OUT_OF_SERVICE.
> This assumption holds most of the time, when su_oper_message (enabled) comes
> a little later than the quiesced assignment response. In fact, the
> su_oper_message (enabled) is not designed as part of the component failover
> sequence, thus it can come at any time during the failover. If amfd gets a
> bit busier with RTA updates, so that the faulty component has enough time to
> reinstantiate and amfnd sends su_oper_message (enabled) before the quiesced
> assignment response, the reported problem will be seen.
>
> This patch hardens the component failover sequence by ensuring that the
> su_oper_message (enabled) is sent only after the su has completed removing
> its assignments. This approach follows the similar handling in su failover,
> where the su_oper_message (enabled) is sent in the repair phase.
>
> diff --git a/src/amf/amfnd/avnd_su.h b/src/amf/amfnd/avnd_su.h
> --- a/src/amf/amfnd/avnd_su.h
> +++ b/src/amf/amfnd/avnd_su.h
> @@ -393,6 +393,7 @@ extern struct avnd_su_si_rec *avnd_silis
>   extern struct avnd_su_si_rec *avnd_silist_getprev(const struct 
> avnd_su_si_rec *);
>   extern struct avnd_su_si_rec *avnd_silist_getlast(void);
>   extern bool sufailover_in_progress(const AVND_SU *su);
> +extern bool componentfailover_in_progress(const AVND_SU *su);
>   extern bool sufailover_during_nodeswitchover(const AVND_SU *su);
>   extern bool all_csis_in_removed_state(const AVND_SU *su);
>   extern void su_reset_restart_count_in_comps(const struct avnd_cb_tag *cb, 
> const AVND_SU *su);
> diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
> --- a/src/amf/amfnd/clc.cc
> +++ b/src/amf/amfnd/clc.cc
> @@ -2381,9 +2381,6 @@ uint32_t avnd_comp_clc_terming_cleansucc
>                       (m_AVND_SU_IS_FAILOVER(su))) {
>               /* yes, request director to orchestrate component failover */
>               rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
> -
> -             //Reset component-failover here. SU failover is reset as part 
> of REPAIRED admin op.
> -             m_AVND_SU_FAILOVER_RESET(su);
>       }
>   
>       /*
> diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc
> --- a/src/amf/amfnd/di.cc
> +++ b/src/amf/amfnd/di.cc
> @@ -894,7 +894,17 @@ uint32_t avnd_di_susi_resp_send(AVND_CB
>               }
>               m_AVND_SU_ALL_SI_RESET(su);
>           }
> -
> +        if (componentfailover_in_progress(su)) {
> +             if (all_csis_in_removed_state(su) == true) {
> +                     bool is_en;
> +                     m_AVND_SU_IS_ENABLED(su, is_en);
> +                     if (is_en) {
> +                             if (avnd_di_oper_send(cb, su, 0) == 
> NCSCC_RC_SUCCESS) {
> +                                     m_AVND_SU_FAILOVER_RESET(su);
> +                             }
> +                     }
> +             }
> +        }
>       /* free the contents of avnd message */
>       avnd_msg_content_free(cb, &msg);
>   
> diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc
> --- a/src/amf/amfnd/susm.cc
> +++ b/src/amf/amfnd/susm.cc
> @@ -1633,10 +1633,22 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C
>                       m_AVND_SU_IS_ENABLED(su, is_en);
>                       if (true == is_en) {
>                               TRACE("SU oper state is enabled");
> +                             // do not send su_oper state if component 
> failover is in progress
>                               m_AVND_SU_OPER_STATE_SET(su, 
> SA_AMF_OPERATIONAL_ENABLED);
> -                             rc = avnd_di_oper_send(cb, su, 0);
> -                             if (NCSCC_RC_SUCCESS != rc)
> -                                     goto done;
> +                             if (componentfailover_in_progress(su) == true) {
> +                                     si = reinterpret_cast<AVND_SU_SI_REC*>
> +                                                     
> (m_NCS_DBLIST_FIND_FIRST(&su->si_list));
> +                                     if (si == nullptr || 
> all_csis_in_removed_state(su)) {
> +                                             rc = avnd_di_oper_send(cb, su, 
> 0);
> +                                             if (rc != NCSCC_RC_SUCCESS)
> +                                                     goto done;
> +                                             m_AVND_SU_FAILOVER_RESET(su);
> +                                     }
> +                             } else {
> +                                     rc = avnd_di_oper_send(cb, su, 0);
> +                                     if (NCSCC_RC_SUCCESS != rc)
> +                                             goto done;
> +                             }
>                       }
>                       else
>                               TRACE("SU oper state is disabled");
> @@ -3551,6 +3563,20 @@ bool sufailover_in_progress(const AVND_S
>   }
>   
>   /**
> + * This function checks if the componentfailover is going on.
> + * @param su: ptr to the SU .
> + *
> + * @return true/false.
> + */
> +bool componentfailover_in_progress(const AVND_SU *su) {
> +     if ((su->sufailover == false) && (!m_AVND_SU_IS_RESTART(su)) &&
> +                     (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) && 
> (!su->is_ncs) &&
> +                     m_AVND_SU_IS_FAILOVER(su))
> +             return true;
> +     return false;
> +}
> +
> +/**
>    * This function checks if the sufailover and node switchover are going on.
>    * @param su: ptr to the SU .
>    *
>


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to