Hi Minh,
I have started reviewing this patch.
Thanks,
Praveen
On 15-Feb-17 9:22 AM, minh chau wrote:
> Hi all,
>
> Have you had time to review this patch?
> It changes the component failover sequence, so I think we need more time
> to look at it.
>
> Thanks,
> Minh
>
> On 23/01/17 12:28, Minh Hon Chau wrote:
>> src/amf/amfnd/avnd_su.h | 1 +
>> src/amf/amfnd/clc.cc | 3 ---
>> src/amf/amfnd/di.cc | 12 +++++++++++-
>> src/amf/amfnd/susm.cc | 32 +++++++++++++++++++++++++++++---
>> 4 files changed, 41 insertions(+), 7 deletions(-)
>>
>>
>> In case of component failover, the faulty component will be terminated. When
>> the reinstantiation
>> is done, amfnd will send su_oper_message (enabled) to amfd which is
>> running along with
>> component failover. In the reported problem, if su_oper_message
>> (enabled) comes to amfd
>> before the quiesced assignment response (as part of component failover
>> sequence) comes to
>> amfd, then this quiesced assignment response is ignored, thus
>> component failover will not
>> finish.
>>
>> The problem is in function susi_success_sg_realign with act=5,
>> state=3, amfd always assumes
>> su having faulty component is OUT_OF_SERVICE. This assumption is true
>> most of the time
>> when su_oper_message (enabled) comes a little later than quiesced
>> assignment response. In fact
>> the su_oper_message (enabled) is not designed as part of component
>> failover sequence, thus it
>> can come any time during the failover. If amfd is getting a bit busier
>> with RTA update then
>> the faulty component has enough time to reinstantiate so that amfnd sends
>> su_oper_message (enabled)
>> before quiesced assignment response, the reported problem will be seen.
>>
>> This patch hardens the component failover sequence by ensuring the
>> su_oper_message (enabled) to
>> be sent after the su finishes removing the assignment. This approach comes
>> from the similarity in
>> su failover, where the su_oper_message (enabled) is sent in repair phase.
>>
>> diff --git a/src/amf/amfnd/avnd_su.h b/src/amf/amfnd/avnd_su.h
>> --- a/src/amf/amfnd/avnd_su.h
>> +++ b/src/amf/amfnd/avnd_su.h
>> @@ -393,6 +393,7 @@ extern struct avnd_su_si_rec *avnd_silis
>> extern struct avnd_su_si_rec *avnd_silist_getprev(const struct
>> avnd_su_si_rec *);
>> extern struct avnd_su_si_rec *avnd_silist_getlast(void);
>> extern bool sufailover_in_progress(const AVND_SU *su);
>> +extern bool componentfailover_in_progress(const AVND_SU *su);
>> extern bool sufailover_during_nodeswitchover(const AVND_SU *su);
>> extern bool all_csis_in_removed_state(const AVND_SU *su);
>> extern void su_reset_restart_count_in_comps(const struct avnd_cb_tag
>> *cb, const AVND_SU *su);
>> diff --git a/src/amf/amfnd/clc.cc b/src/amf/amfnd/clc.cc
>> --- a/src/amf/amfnd/clc.cc
>> +++ b/src/amf/amfnd/clc.cc
>> @@ -2381,9 +2381,6 @@ uint32_t avnd_comp_clc_terming_cleansucc
>> (m_AVND_SU_IS_FAILOVER(su))) {
>> /* yes, request director to orchestrate component failover */
>> rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
>> -
>> - //Reset component-failover here. SU failover is reset as part
>> of REPAIRED admin op.
>> - m_AVND_SU_FAILOVER_RESET(su);
>> }
>> /*
>> diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc
>> --- a/src/amf/amfnd/di.cc
>> +++ b/src/amf/amfnd/di.cc
>> @@ -894,7 +894,17 @@ uint32_t avnd_di_susi_resp_send(AVND_CB
>> }
>> m_AVND_SU_ALL_SI_RESET(su);
>> }
>> -
>> + if (componentfailover_in_progress(su)) {
>> + if (all_csis_in_removed_state(su) == true) {
>> + bool is_en;
>> + m_AVND_SU_IS_ENABLED(su, is_en);
>> + if (is_en) {
>> + if (avnd_di_oper_send(cb, su, 0) ==
>> NCSCC_RC_SUCCESS) {
>> + m_AVND_SU_FAILOVER_RESET(su);
>> + }
>> + }
>> + }
>> + }
>> /* free the contents of avnd message */
>> avnd_msg_content_free(cb, &msg);
>> diff --git a/src/amf/amfnd/susm.cc b/src/amf/amfnd/susm.cc
>> --- a/src/amf/amfnd/susm.cc
>> +++ b/src/amf/amfnd/susm.cc
>> @@ -1633,10 +1633,22 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C
>> m_AVND_SU_IS_ENABLED(su, is_en);
>> if (true == is_en) {
>> TRACE("SU oper state is enabled");
>> + // do not send su_oper state if component failover is
>> in progress
>> m_AVND_SU_OPER_STATE_SET(su,
>> SA_AMF_OPERATIONAL_ENABLED);
>> - rc = avnd_di_oper_send(cb, su, 0);
>> - if (NCSCC_RC_SUCCESS != rc)
>> - goto done;
>> + if (componentfailover_in_progress(su) == true) {
>> + si = reinterpret_cast<AVND_SU_SI_REC*>
>> + (m_NCS_DBLIST_FIND_FIRST(&su->si_list));
>> + if (si == nullptr ||
>> all_csis_in_removed_state(su)) {
>> + rc = avnd_di_oper_send(cb, su, 0);
>> + if (rc != NCSCC_RC_SUCCESS)
>> + goto done;
>> + m_AVND_SU_FAILOVER_RESET(su);
>> + }
>> + } else {
>> + rc = avnd_di_oper_send(cb, su, 0);
>> + if (NCSCC_RC_SUCCESS != rc)
>> + goto done;
>> + }
>> }
>> else
>> TRACE("SU oper state is disabled");
>> @@ -3551,6 +3563,20 @@ bool sufailover_in_progress(const AVND_S
>> }
>> /**
>> + * This function checks if the componentfailover is going on.
>> + * @param su: ptr to the SU .
>> + *
>> + * @return true/false.
>> + */
>> +bool componentfailover_in_progress(const AVND_SU *su) {
>> + if ((su->sufailover == false) && (!m_AVND_SU_IS_RESTART(su)) &&
>> + (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) &&
>> (!su->is_ncs) &&
>> + m_AVND_SU_IS_FAILOVER(su))
>> + return true;
>> + return false;
>> +}
>> +
>> +/**
>> * This function checks if the sufailover and node switchover are
>> going on.
>> * @param su: ptr to the SU .
>> *
>>
>
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel