Hi Minh,

I have started reviewing this patch.

Thanks,
Praveen

On 15-Feb-17 9:22 AM, minh chau wrote:
> Hi all,
>
> Have you had time to review this patch?
> It changes the component failover sequence, so I think we need more time
> to look at it.
>
> Thanks,
> Minh
>
> On 23/01/17 12:28, Minh Hon Chau wrote:
>>   src/amf/amfnd/avnd_su.h |   1 +
>>   src/amf/amfnd/clc.cc    |   3 ---
>>   src/amf/amfnd/di.cc     |  12 +++++++++++-
>>   src/amf/amfnd/susm.cc   |  32 +++++++++++++++++++++++++++++---
>>   4 files changed, 41 insertions(+), 7 deletions(-)
>>
>>
>> In case of component failover, the faulty component will be terminated. When
>> the reinstantiation
>> is done, amfnd will send su_oper_message (enabled) to amfd which is
>> running along with
>> component failover. In the reported problem, if su_oper_message
>> (enabled) comes to amfd
>> before the quiesced assignment response (as part of component failover
>> sequence) comes to
>> amfd, then this quiesced assignment response is ignored, thus
>> component failover will not
>> finish.
>>
>> The problem is in function susi_success_sg_realign with act=5,
>> state=3, amfd always assumes
>> su having faulty component is OUT_OF_SERVICE. This assumption is true
>> most of the time
>> when su_oper_message (enabled) comes a little later than quiesced
>> assignment response. In fact
>> the su_oper_message (enabled) is not designed as part of component
>> failover sequence, thus it
>> can come any time during the failover. If amfd is getting a bit busier
>> with RTA update then
>> the faulty component has enough time to reinstantiate so that amfnd sends
>> su_oper_message (enabled)
>> before quiesced assignment response, the reported problem will be seen.
>>
>> This patch hardens the component failover sequence by ensuring that the
>> su_oper_message (enabled) is
>> sent after the su completes removing its assignment. This approach comes
>> from the similarity in
>> su failover, where the su_oper_message (enabled) is sent in repair phase.
>>
>> diff --git a/src/amf/amfnd/avnd_su.h b/src/amf/amfnd/avnd_su.h
>> --- a/src/amf/amfnd/avnd_su.h
>> +++ b/src/amf/amfnd/avnd_su.h
>> @@ -393,6 +393,7 @@ extern struct avnd_su_si_rec *avnd_silis
>>   extern struct avnd_su_si_rec *avnd_silist_getprev(const struct
>> avnd_su_si_rec *);
>>   extern struct avnd_su_si_rec *avnd_silist_getlast(void);
>>   extern bool sufailover_in_progress(const AVND_SU *su);
>> +extern bool componentfailover_in_progress(const AVND_SU *su);
>>   extern bool sufailover_during_nodeswitchover(const AVND_SU *su);
>>   extern bool all_csis_in_removed_state(const AVND_SU *su);
>>   extern void su_reset_restart_count_in_comps(const struct avnd_cb_tag
>> *cb, const AVND_SU *su);
>> diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
>> --- a/src/amf/amfnd/clc.cc
>> +++ b/src/amf/amfnd/clc.cc
>> @@ -2381,9 +2381,6 @@ uint32_t avnd_comp_clc_terming_cleansucc
>>               (m_AVND_SU_IS_FAILOVER(su))) {
>>           /* yes, request director to orchestrate component failover */
>>           rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
>> -
>> -        //Reset component-failover here. SU failover is reset as part
>> of REPAIRED admin op.
>> -        m_AVND_SU_FAILOVER_RESET(su);
>>       }
>>         /*
>> diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc
>> --- a/src/amf/amfnd/di.cc
>> +++ b/src/amf/amfnd/di.cc
>> @@ -894,7 +894,17 @@ uint32_t avnd_di_susi_resp_send(AVND_CB
>>               }
>>               m_AVND_SU_ALL_SI_RESET(su);
>>           }
>> -
>> +        if (componentfailover_in_progress(su)) {
>> +            if (all_csis_in_removed_state(su) == true) {
>> +                bool is_en;
>> +                m_AVND_SU_IS_ENABLED(su, is_en);
>> +                if (is_en) {
>> +                    if (avnd_di_oper_send(cb, su, 0) ==
>> NCSCC_RC_SUCCESS) {
>> +                        m_AVND_SU_FAILOVER_RESET(su);
>> +                    }
>> +                }
>> +            }
>> +        }
>>       /* free the contents of avnd message */
>>       avnd_msg_content_free(cb, &msg);
>>   diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc
>> --- a/src/amf/amfnd/susm.cc
>> +++ b/src/amf/amfnd/susm.cc
>> @@ -1633,10 +1633,22 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C
>>               m_AVND_SU_IS_ENABLED(su, is_en);
>>               if (true == is_en) {
>>                   TRACE("SU oper state is enabled");
>> +                // do not send su_oper state if component failover is
>> in progress
>>                   m_AVND_SU_OPER_STATE_SET(su,
>> SA_AMF_OPERATIONAL_ENABLED);
>> -                rc = avnd_di_oper_send(cb, su, 0);
>> -                if (NCSCC_RC_SUCCESS != rc)
>> -                    goto done;
>> +                if (componentfailover_in_progress(su) == true) {
>> +                    si = reinterpret_cast<AVND_SU_SI_REC*>
>> +                            (m_NCS_DBLIST_FIND_FIRST(&su->si_list));
>> +                    if (si == nullptr ||
>> all_csis_in_removed_state(su)) {
>> +                        rc = avnd_di_oper_send(cb, su, 0);
>> +                        if (rc != NCSCC_RC_SUCCESS)
>> +                            goto done;
>> +                        m_AVND_SU_FAILOVER_RESET(su);
>> +                    }
>> +                } else {
>> +                    rc = avnd_di_oper_send(cb, su, 0);
>> +                    if (NCSCC_RC_SUCCESS != rc)
>> +                        goto done;
>> +                }
>>               }
>>               else
>>                   TRACE("SU oper state is disabled");
>> @@ -3551,6 +3563,20 @@ bool sufailover_in_progress(const AVND_S
>>   }
>>     /**
>> + * This function checks if the componentfailover is going on.
>> + * @param su: ptr to the SU .
>> + *
>> + * @return true/false.
>> + */
>> +bool componentfailover_in_progress(const AVND_SU *su) {
>> +    if ((su->sufailover == false) && (!m_AVND_SU_IS_RESTART(su)) &&
>> +            (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) &&
>> (!su->is_ncs) &&
>> +            m_AVND_SU_IS_FAILOVER(su))
>> +        return true;
>> +    return false;
>> +}
>> +
>> +/**
>>    * This function checks if the sufailover and node switchover are
>> going on.
>>    * @param su: ptr to the SU .
>>    *
>>
>

------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to