On 20-May-16 12:38 PM, minh chau wrote:
> Hi Praveen,
>
> I have tested the patch, it fixes the reported issue.
> The problem seems to be a data race condition between thread handling
> clc fsm and timer expiry thread which reset all variables of a su while
> these variables have being used elsewhere in clc fsm. This race
> condition should cause other problems but it's not in scope of this ticket.
>
I think timer expiry resets only escalation related variables only.
If there are other variables being reset then please raise a new ticket.

> I think the idea of the patch is not using @su_err_esc_level in
> conditional statement in clc fsm, since it's not reliable because it can
> be reset anytime by timer expiry thread. I still see one place in
> avnd_comp_clc_inst_clean_hdler() that @su_err_esc_level being used in
> *if* statement, can you check whether it's safe?
I missed this place.I will change that before pushing the patch.

Thanks,
Praveen

>
> Thanks,
> Minh
> On 19/05/16 20:35, praveen.malv...@oracle.com wrote:
>>   osaf/services/saf/amf/amfnd/clc.cc  |   2 +-
>>   osaf/services/saf/amf/amfnd/err.cc  |  11 ++++++++++-
>>   osaf/services/saf/amf/amfnd/susm.cc |  10 +++++-----
>>   3 files changed, 16 insertions(+), 7 deletions(-)
>>
>>
>> In the reported problem, fault of comp leads to comp-failover
>> escalation.But
>> failover did not happen.
>>
>> As a part of compt-failover, amfnd launched cleanup if component.
>> Before clean up gets
>> completed, su-failover time expires and AMND resets escalation
>> parameters. In this way
>> AMFND loses that the context of cleanup is comp-failver recovery. When
>> comp gets cleaned up
>> successfully, AMFND (not ware of context) does inform AMFD for
>> failover of assignments.
>>
>> Patch fixes problem by remembering the comp or su failover context
>> using a separate flag
>> and not relying on escalation params.
>>
>> diff --git a/osaf/services/saf/amf/amfnd/clc.cc
>> b/osaf/services/saf/amf/amfnd/clc.cc
>> --- a/osaf/services/saf/amf/amfnd/clc.cc
>> +++ b/osaf/services/saf/amf/amfnd/clc.cc
>> @@ -2291,7 +2291,7 @@ uint32_t avnd_comp_clc_terming_cleansucc
>>       if (m_AVND_COMP_IS_FAILED(comp) && m_AVND_SU_IS_FAILED(su) &&
>>               m_AVND_SU_IS_PREINSTANTIABLE(su) && (su->sufailover ==
>> false) &&
>>               (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) &&
>> -            (su->su_err_esc_level == AVND_ERR_ESC_LEVEL_2)) {
>> +            (m_AVND_SU_IS_FAILOVER(su))) {
>>           /* yes, request director to orchestrate component failover */
>>           rc = avnd_di_oper_send(cb, su, SA_AMF_COMPONENT_FAILOVER);
>>       }
>> diff --git a/osaf/services/saf/amf/amfnd/err.cc
>> b/osaf/services/saf/amf/amfnd/err.cc
>> --- a/osaf/services/saf/amf/amfnd/err.cc
>> +++ b/osaf/services/saf/amf/amfnd/err.cc
>> @@ -759,6 +759,8 @@ uint32_t avnd_err_rcvr_comp_failover(AVN
>>       if (!m_AVND_SU_IS_FAILED(su)) {
>>           m_AVND_SU_FAILED_SET(su);
>>       }
>> +    //Remember component-failover/su-failover context.
>> +    m_AVND_SU_FAILOVER_SET(failed_comp->su);
>>         /* update su oper state */
>>       m_AVND_SU_OPER_STATE_SET(su, SA_AMF_OPERATIONAL_DISABLED);
>> @@ -839,6 +841,9 @@ uint32_t avnd_err_rcvr_su_failover(AVND_
>>           reset_suRestart_flag(su);
>>           su->admin_op_Id = static_cast<SaAmfAdminOperationIdT>(0);
>>       }
>> +    //Remember component-failover/su-failover context.
>> +    m_AVND_SU_FAILOVER_SET(failed_comp->su);
>> +
>>       LOG_NO("Terminating components of '%s'(abruptly &
>> unordered)",su->name.value);
>>       /* Unordered cleanup of components of failed SU */
>>       for (comp =
>> m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
>>
>> @@ -932,6 +937,8 @@ uint32_t avnd_err_rcvr_node_switchover(A
>>       {
>>           reset_suRestart_flag(failed_su);
>>           failed_su->admin_op_Id =
>> static_cast<SaAmfAdminOperationIdT>(0);
>> +        //Remember su-failover context.
>> +        m_AVND_SU_FAILOVER_SET(failed_comp->su);
>>             LOG_NO("Terminating components of '%s'(abruptly &
>> unordered)",failed_su->name.value);
>>           /* Unordered cleanup of components of failed SU */
>> @@ -1075,6 +1082,8 @@ uint32_t avnd_err_su_repair(AVND_CB *cb,
>>       if (all_comps_terminated_in_su(su) == true)
>>           is_uninst = true;
>>   +    //Reset component-failover here. SU failover is reset as part
>> of REPAIRED admin op.
>> +    m_AVND_SU_FAILOVER_RESET(su);
>>       /* scan & instantiate failed pi comps */
>>       for (comp =
>> m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
>>
>>            comp; comp =
>> m_AVND_COMP_FROM_SU_DLL_NODE_GET(m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node)))
>> {
>> @@ -1584,7 +1593,7 @@ uint32_t avnd_err_rcvr_node_failfast(AVN
>>   bool is_no_assignment_due_to_escalations(AVND_SU *su)
>>   {
>>       TRACE_ENTER();
>> -    if (((sufailover_in_progress(su) == true) &&
>> (su->su_err_esc_level == AVND_ERR_ESC_LEVEL_2)) ||
>> +    if ((sufailover_in_progress(su) == true) ||
>>               (sufailover_during_nodeswitchover(su) == true) ||
>>               (avnd_cb->term_state ==
>> AVND_TERM_STATE_NODE_FAILOVER_TERMINATING) ||
>>               (avnd_cb->term_state ==
>> AVND_TERM_STATE_NODE_FAILOVER_TERMINATED))  {
>> diff --git a/osaf/services/saf/amf/amfnd/susm.cc
>> b/osaf/services/saf/amf/amfnd/susm.cc
>> --- a/osaf/services/saf/amf/amfnd/susm.cc
>> +++ b/osaf/services/saf/amf/amfnd/susm.cc
>> @@ -1676,8 +1676,7 @@ uint32_t avnd_su_pres_st_chng_prc(AVND_C
>>                       ((su_all_comps_restartable(*su) == true) ||
>>                        ((su_all_comps_restartable(*su) == false)
>>                         && (is_any_non_restartable_comp_assigned(*su)
>> == false)) ||
>> -                     ((su->su_err_esc_level == AVND_ERR_ESC_LEVEL_2) &&
>> -                      (su->si_list.n_nodes == 0)))) {
>> +                     ((m_AVND_SU_IS_FAILOVER(su)) &&
>> (su->si_list.n_nodes == 0)))) {
>>                       /*
>>                          It means all comps are terminated in
>> surestart recovery or
>>                          admin op. For non restartable SU with no non
>> restartable comp
>> @@ -3479,8 +3478,9 @@ uint32_t avnd_su_pres_instfailed_compuni
>>   bool sufailover_in_progress(const AVND_SU *su)
>>   {
>>       if (m_AVND_SU_IS_FAILED(su) && (su->sufailover) &&
>> (!m_AVND_SU_IS_RESTART(su)) &&
>> -             (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) &&
>> (!su->is_ncs))
>> -                return true;
>> +            (avnd_cb->oper_state != SA_AMF_OPERATIONAL_DISABLED) &&
>> (!su->is_ncs) &&
>> +            m_AVND_SU_IS_FAILOVER(su))
>> +        return true;
>>       return false;
>>   }
>>   @@ -3494,7 +3494,7 @@ bool sufailover_during_nodeswitchover(co
>>   {
>>       if ((m_AVND_SU_IS_FAILED(su) && (su->sufailover) &&
>> (!m_AVND_SU_IS_RESTART(su)) &&
>>                   (avnd_cb->term_state ==
>> AVND_TERM_STATE_NODE_SWITCHOVER_STARTED) &&
>> -                (!su->is_ncs)))
>> +                (!su->is_ncs) && (m_AVND_SU_IS_FAILOVER(su))))
>>           return true;
>>         return false;
>>
>

------------------------------------------------------------------------------
Mobile security can be enabling, not merely restricting. Employees who
bring their own devices (BYOD) to work are irked by the imposition of MDM
restrictions. Mobile Device Manager Plus allows you to control only the
apps on BYO-devices by containerizing them, leaving personal data untouched!
https://ad.doubleclick.net/ddm/clk/304595813;131938128;j
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to