Hi Minh,

One minor comment, patch restores node->admin_ng pointers and for 
clearing these pointers it relies on process_su_si_response_for_ng(). 
But  process_su_si_response_for_ng() will clear them for all nodes in 
first assignment response after headless. Because of there will problem 
in cases in which su/node faults with node-failover recovery after 
headless state. Functions avd_node_down_appl_susi_failover() and 
sg_su_failover_func() calls process_su_si_response_for_ng() based on 
admin_ng pointer which has been cleared in first assignment response.
So SG will remain in LOCKED state.

This can be reproduced by following test case:
1)Bring one controller and 1 payload up.
2)Host amf demo on PL-3 with sufailover recovery.
3)lock the ng but do not respond for callbacks on both SUs.
4)Restart SC-1.
5)First respond for standby SU. In  process_su_si_response_for_ng(), 
AMFD will clear all the admin_ng pointers.
6)Now kill comp in SU1. It will fail with SUfailover recovery.
7)From sg_su_failover_func(), AMFD will never call 
process_su_si_response_for_ng() because of if condition not satisfied:
         /* If nodegroup level operation is finished on all the nodes, 
reply to imm.*/
         if (su->su_on_node->admin_ng != nullptr)
                 process_su_si_response_for_ng(su, res);

I think this case we had discussed.
Anyways, I think this can be fixed in other phases where admin 
operations + escalations are considered. I think as a fix only ORing 
with ng_using_sgadmin is required.

Ack for the patch with some minor comments below.


Thanks,
Praveen



On 12-Sep-16 6:23 PM, Minh Hon Chau wrote:
>  osaf/services/saf/amf/amfd/include/node.h |   3 +
>  osaf/services/saf/amf/amfd/include/sg.h   |   5 +-
>  osaf/services/saf/amf/amfd/ndfsm.cc       |   3 +-
>  osaf/services/saf/amf/amfd/nodegroup.cc   |  83 
> +++++++++++++++++++++++++++++++
>  osaf/services/saf/amf/amfd/sg.cc          |  52 +++++++++++++++++-
>  osaf/services/saf/amf/amfd/sgproc.cc      |   2 +-
>  osaf/services/saf/amf/amfd/siass.cc       |   4 +-
>  7 files changed, 143 insertions(+), 9 deletions(-)
>
>
> The SG becomes unstable because some variables used in nodegroup operation 
> are not
> restored after headless if this admin operation on nodegroup was interrupted 
> just
> before cluster goes into headless stage.
>
> In order to restore nodegroup operation, AMF needs to know exactly whether 
> nodegroup
> operation was running during headless up on @susi assignment. If susi is in 
> QUIESCED,
> QUIESCING or being removed while its related entities su, si, sg are not in 
> LOCKED
> and SHUTTING_DOWN, that means either node or nodegroup MUST be in LOCKED or 
> SHUTTING
> DOWN. In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know a 
> nodegroup
> operation was running. However, if saAmfNGAdminState is in LOCKED, this case 
> is an
> ambiguity of locking a node. The reason of differentiation of locking a node 
> or node
> group is because 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER 
> for node
> group operation while AVD_SG_FSM_SU_OPER is only used for node operation. 
> When 2N SG
> uses AVD_SG_FSM_SG_ADMIN for nodegroup, the saAmfSGAdminState is borrowed 
> (but not
> updated to IMM) to run the admin operation sequence. Therefore, after 
> headless if
> AVD_SG_FSM_SG_ADMIN was being used for nodegroup then saAmfSGAdminState also 
> needs to
> be set.
>
> diff --git a/osaf/services/saf/amf/amfd/include/node.h 
> b/osaf/services/saf/amf/amfd/include/node.h
> --- a/osaf/services/saf/amf/amfd/include/node.h
> +++ b/osaf/services/saf/amf/amfd/include/node.h
> @@ -45,6 +45,7 @@
>  #include <set>
>  #include <vector>
>  #include <string>
> +#include <susi.h>
>
>  class AVD_SU;
>  struct avd_cluster_tag;
> @@ -232,4 +233,6 @@ extern void ng_complete_admin_op(AVD_AMF
>  extern void avd_ng_admin_state_set(AVD_AMF_NG* ng, SaAmfAdminStateT state);
>  extern bool are_all_ngs_in_unlocked_state(const AVD_AVND *node);
>  extern bool any_ng_in_locked_in_state(const AVD_AVND *node);
> +void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag* 
> susi);
> +
>  #endif
> diff --git a/osaf/services/saf/amf/amfd/include/sg.h 
> b/osaf/services/saf/amf/amfd/include/sg.h
> --- a/osaf/services/saf/amf/amfd/include/sg.h
> +++ b/osaf/services/saf/amf/amfd/include/sg.h
> @@ -46,6 +46,7 @@
>  class AVD_SU;
>  class AVD_SI;
>  class AVD_APP;
> +class AVD_AMF_NG;
>
>  /* The valid SG FSM states. */
>  typedef enum {
> @@ -576,8 +577,8 @@ private:
>  #define m_AVD_SET_SG_ADMIN_SI(cb,si) (si)->sg_of_si->set_admin_si((si))
>  #define m_AVD_CLEAR_SG_ADMIN_SI(cb,sg) (sg)->clear_admin_si()
>  #define m_AVD_CHK_OPLIST(i_su,flag) (flag) = 
> (i_su)->sg_of_su->in_su_oper_list(i_su)
> -
> -void avd_sg_read_headless_cached_rta(AVD_CL_CB *cb);
> +void avd_sg_read_headless_fsm_state_cached_rta(AVD_CL_CB *cb);
> +void avd_sg_read_headless_su_oper_list_cached_rta(AVD_CL_CB *cb);
>
>  extern void avd_sg_delete(AVD_SG *sg);
>  extern void avd_sg_db_add(AVD_SG *sg);
> diff --git a/osaf/services/saf/amf/amfd/ndfsm.cc 
> b/osaf/services/saf/amf/amfd/ndfsm.cc
> --- a/osaf/services/saf/amf/amfd/ndfsm.cc
> +++ b/osaf/services/saf/amf/amfd/ndfsm.cc
> @@ -129,8 +129,9 @@ void avd_process_state_info_queue(AVD_CL
>       // Reading sg must be after reading susi
>       if (found_state_info == true) {
>               avd_compcsi_cleanup_imm_object(cb);
> +             avd_sg_read_headless_fsm_state_cached_rta(cb);
>               avd_susi_read_headless_cached_rta(cb);
> -             avd_sg_read_headless_cached_rta(cb);
> +             avd_sg_read_headless_su_oper_list_cached_rta(cb);
>               avd_su_read_headless_cached_rta(cb);
>       }
>  done:
> diff --git a/osaf/services/saf/amf/amfd/nodegroup.cc 
> b/osaf/services/saf/amf/amfd/nodegroup.cc
> --- a/osaf/services/saf/amf/amfd/nodegroup.cc
> +++ b/osaf/services/saf/amf/amfd/nodegroup.cc
> @@ -1356,3 +1356,86 @@ void avd_ng_constructor(void)
>                       ng_ccb_apply_cb);
>  }
>
> +/*
> + * @brief: Restore nodegroup related variables which are used in nodegroup
> + *         operations before headless. The main variables are @admin_ng,
> + *         @ng_using_saAmfSGAdminState, and the saAmfSGAdminState which
> + *         is borrowed by nodegroup operation on 2N SG
> + * @param: control block, and susi is being read after headless
> + * @return: void
> + */
> +
> +void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag* 
> susi) {
> +
> +     TRACE_ENTER();
> +     osafassert(cb->scs_absence_max_duration > 0);
> +     // In order to restore nodegroup operation, AMF needs to know exactly
> +     // whether nodegroup operation was running during headless up on @susi
> +     // If susi is in QUIESCED/QUIESCING or being removed while its related
> +     // entities su, si, sg are not in LOCKED and SHUTTING_DOWN, that means
> +     // either node or nodegroup MUST be in LOCKED or SHUTTING_DOWN.
> +     // In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know
> +     // a nodegroup operation was running. However, if saAmfNGAdminState is
> +     // in LOCKED, this case is an ambiguity of locking a node.
> +     // The reason of differentiation of locking a node or nodegroup is 
> because
> +     // 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER for 
> nodegroup
> +     // operation while AVD_SG_FSM_SU_OPER is only used for node operation.
> +     // When 2N SG uses AVD_SG_FSM_SG_ADMIN for nodegroup, the 
> saAmfSGAdminState
> +     // is borrowed (but not updated to IMM) to run the admin operation 
> sequence.
> +     // Therefore, after headless if AVD_SG_FSM_SG_ADMIN was being used for 
> nodegroup
> +     // then saAmfSGAdminState also needs to be set.
> +
> +     // Make sure su, si, sg are not in LOCKED, SHUTTING_DOWN
> +     if (susi->su->saAmfSUAdminState != SA_AMF_ADMIN_LOCKED &&
> +                     susi->su->saAmfSUAdminState != 
> SA_AMF_ADMIN_SHUTTING_DOWN &&
> +                     susi->si->saAmfSIAdminState != SA_AMF_ADMIN_LOCKED &&
> +                     susi->si->saAmfSIAdminState != 
> SA_AMF_ADMIN_SHUTTING_DOWN &&
> +                     susi->su->sg_of_su->saAmfSGAdminState != 
> SA_AMF_ADMIN_LOCKED &&
> +                     susi->su->sg_of_su->saAmfSGAdminState != 
> SA_AMF_ADMIN_SHUTTING_DOWN) {
> +
> +             // susi are in state transition
> +             if (susi->state == SA_AMF_HA_QUIESCED || susi->state == 
> SA_AMF_HA_QUIESCING ||
> +                             susi->fsm == AVD_SU_SI_STATE_UNASGN) {
> +
> +             for (std::map<std::string, AVD_AMF_NG*>::const_iterator it = 
> nodegroup_db->begin();
> +                     it != nodegroup_db->end(); it++) {
> +                     AVD_AMF_NG *ng = it->second;
> +                     for (std::set<std::string>::const_iterator iter = 
> ng->saAmfNGNodeList.begin();
> +                                     iter != ng->saAmfNGNodeList.end(); 
> ++iter) {
> +                             AVD_AVND *node = avd_node_get(*iter);
> +
> +                             if (susi->su->su_on_node == node) {
> +                                     // Nodegroup was shutting down, also 
> needs to check whether nodegroup
> +                                     // operation was borrowing SG ADMIN FSM 
> code (for 2N only)
> +                                     if (ng->saAmfNGAdminState == 
> SA_AMF_ADMIN_SHUTTING_DOWN) {
> +                                             node->admin_ng = ng;
> +                                             if 
> (susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
> +                                                     
> osafassert(susi->su->sg_of_su->sg_redundancy_model == 
> SA_AMF_2N_REDUNDANCY_MODEL);
> +                                                             
> susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
> +                                                             
> susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
> +                                                             
> m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su, 
> AVSV_CKPT_SG_ADMIN_STATE);
> +                                             }
> +                                     }
> +                                     // Node group was LOCKED, AMFD treat 
> nodegroup and node operation in the same view,
> +                                     // except nodegroup was borrowing SG 
> ADMIN FSM. This is because the other case that
> +                                     // nodegroup operation was only SG 
> SU_OPER FSM, which is the same as of node operation
> +                                     if (ng->saAmfNGAdminState == 
> SA_AMF_ADMIN_LOCKED) {
> +                                             
> if(susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
> +                                                     
> osafassert(susi->su->sg_of_su->sg_redundancy_model == 
> SA_AMF_2N_REDUNDANCY_MODEL);
> +                                                             node->admin_ng 
> = ng;
> +                                                             
> susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
> +                                                             
> susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
> +                                                             
> m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su, 
> AVSV_CKPT_SG_ADMIN_STATE);
> +                                             }
> +                                     }
> +                                             TRACE("Restore nodegroup 
> operation, SG(%s), ng_using_saAmfSGAdminState: %u, saAmfSGAdminState:%u",
> +                                                             
> susi->su->sg_of_su->name.c_str(), 
> susi->su->sg_of_su->ng_using_saAmfSGAdminState,
> +                                                             
> susi->su->sg_of_su->saAmfSGAdminState);
[Praveen] If we get one NG which was undergoing admin operation, then I 
think we should break from both the loops.
Admin operations are sequential.
It can be done as a part of refactoring later.

> +
> +                             }
> +                     }
> +             }
> +             }
> +     }
> +     TRACE_LEAVE();
> +}
> diff --git a/osaf/services/saf/amf/amfd/sg.cc 
> b/osaf/services/saf/amf/amfd/sg.cc
> --- a/osaf/services/saf/amf/amfd/sg.cc
> +++ b/osaf/services/saf/amf/amfd/sg.cc
> @@ -374,7 +374,6 @@ static AVD_SG *sg_create(const std::stri
>       if (immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfSGAdminState"), 
> attributes, 0, &sg->saAmfSGAdminState) != SA_AIS_OK) {
>               sg->saAmfSGAdminState = SA_AMF_ADMIN_UNLOCKED;
>       }
> -
>       /*  TODO use value in type instead? */
>       sg->sg_redundancy_model = sgt->saAmfSgtRedundancyModel;
>
> @@ -2083,7 +2082,7 @@ uint32_t AVD_SG::curr_non_instantiated_s
>                       (su->saAmfSUPresenceState == 
> SA_AMF_PRESENCE_UNINSTANTIATED));}));      
>  }
>
> -void avd_sg_read_headless_cached_rta(AVD_CL_CB *cb)
> +void avd_sg_read_headless_fsm_state_cached_rta(AVD_CL_CB *cb)
>  {
>
>       SaAisErrorT rc;
> @@ -2092,13 +2091,11 @@ void avd_sg_read_headless_cached_rta(AVD
>
>       SaNameT sg_dn;
>       AVD_SG *sg;
> -     unsigned int num_of_values = 0;
>       const SaImmAttrValuesT_2 **attributes;
>       AVD_SG_FSM_STATE imm_sg_fsm_state;
>       const char *className = "SaAmfSG";
>       const SaImmAttrNameT searchAttributes[] = {
>               const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
> -             const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
>               NULL
>       };
>
> @@ -2127,6 +2124,53 @@ void avd_sg_read_headless_cached_rta(AVD
>                                       attributes, 0, &imm_sg_fsm_state);
>                       osafassert(rc == SA_AIS_OK);
>                       sg->set_fsm_state(imm_sg_fsm_state, false);
> +             }
> +     }
> +
> +     (void)immutil_saImmOmSearchFinalize(searchHandle);
> +
> +done:
> +     TRACE_LEAVE();
> +
> +}
> +
> +void avd_sg_read_headless_su_oper_list_cached_rta(AVD_CL_CB *cb)
> +{
> +
> +     SaAisErrorT rc;
> +     SaImmSearchHandleT searchHandle;
> +     SaImmSearchParametersT_2 searchParam;
> +
> +     SaNameT sg_dn;
> +     AVD_SG *sg;
> +     unsigned int num_of_values = 0;
> +     const SaImmAttrValuesT_2 **attributes;
> +     const char *className = "SaAmfSG";
> +     const SaImmAttrNameT searchAttributes[] = {
> +             const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
> +             NULL
> +     };
> +
> +     TRACE_ENTER();
> +
> +     osafassert(cb->scs_absence_max_duration > 0);
> +
> +     searchParam.searchOneAttr.attrName = 
> const_cast<SaImmAttrNameT>("SaImmAttrClassName");
> +     searchParam.searchOneAttr.attrValueType = SA_IMM_ATTR_SASTRINGT;
> +     searchParam.searchOneAttr.attrValue = &className;
> +
> +     if ((rc = immutil_saImmOmSearchInitialize_2(cb->immOmHandle, NULL, 
> SA_IMM_SUBTREE,
> +           SA_IMM_SEARCH_ONE_ATTR | SA_IMM_SEARCH_GET_SOME_ATTR, 
> &searchParam,
> +               searchAttributes, &searchHandle)) != SA_AIS_OK) {
> +
> +             LOG_ER("%s: saImmOmSearchInitialize_2 failed: %u", 
> __FUNCTION__, rc);
> +             goto done;
> +     }
> +
> +     while ((rc = immutil_saImmOmSearchNext_2(searchHandle, &sg_dn,
> +                                     (SaImmAttrValuesT_2 ***)&attributes)) 
> == SA_AIS_OK) {
> +             sg = sg_db->find(Amf::to_string(&sg_dn));
> +             if (sg && sg->sg_ncs_spec == false) {
>                       // Read sg operation list
>                       if 
> (immutil_getAttrValuesNumber(const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
>  attributes, &num_of_values) == SA_AIS_OK) {
>                               unsigned int i;
> diff --git a/osaf/services/saf/amf/amfd/sgproc.cc 
> b/osaf/services/saf/amf/amfd/sgproc.cc
> --- a/osaf/services/saf/amf/amfd/sgproc.cc
> +++ b/osaf/services/saf/amf/amfd/sgproc.cc
> @@ -1098,7 +1098,7 @@ void avd_su_si_assign_evh(AVD_CL_CB *cb,
>               }
>
>               if (su->list_of_susi == AVD_SU_SI_REL_NULL) {
> -                     LOG_ER("%s: no susis", __FUNCTION__);
> +                     LOG_WA("%s: no susis", __FUNCTION__);
>                       goto done;
>               }
>
> diff --git a/osaf/services/saf/amf/amfd/siass.cc 
> b/osaf/services/saf/amf/amfd/siass.cc
> --- a/osaf/services/saf/amf/amfd/siass.cc
> +++ b/osaf/services/saf/amf/amfd/siass.cc
> @@ -261,7 +261,9 @@ void avd_susi_read_headless_cached_rta(A
>                                               m_AVD_SET_SG_ADMIN_SI(cb, si);
>                               }
>                       }
> -
> +                     // only restore if not done
> +                     if (susi->su->su_on_node->admin_ng == nullptr)
> +                             avd_ng_restore_headless_states(cb, susi);
[Praveen][ Since admin operations are sequential, I think 
avd_ng_restore_headless_states() function
  can return ptr to NG or NULL. Now for setting admin_ng for any other 
SUSI when we know admin operated NG we can use it to avoid calls to 
avd_ng_restore_headless_states(). Like:
         if ((susi->su->su_on_node->admin_ng == nullptr) && (admin_ng == 
nullptr)) {
                 admin_ng =  avd_ng_restore_headless_states(cb, susi);
         }  else if ((node_in_nodegroup(susi->su->su_on_node->admin_ng , 
admin_ng) == true)  &&
                 (admin_ng->saAmfNGAdminState == LOCKED or SHUTTING_DOWN)) {
                 susi->su->su_on_node->admin_ng = admin_ng;
         }
It can be done as a part of refactoring later.

>               } else {
>                       // This susi does not exist after headless, but it's 
> still in IMM
>                       // delete it for now
>

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to