Hi Minh,

I could not understand why AMF should become implementer/applier earlier.
Anyways, please find one query below with [Praveen].

Thanks,
Praveen

On 05-Sep-16 7:13 AM, Minh Hon Chau wrote:
>  osaf/services/saf/amf/amfd/include/node.h |   3 +
>  osaf/services/saf/amf/amfd/include/sg.h   |   1 +
>  osaf/services/saf/amf/amfd/nodegroup.cc   |  83 
> +++++++++++++++++++++++++++++++
>  osaf/services/saf/amf/amfd/role.cc        |  20 +++---
>  osaf/services/saf/amf/amfd/sg.cc          |  15 ++--
>  osaf/services/saf/amf/amfd/sgproc.cc      |   2 +-
>  osaf/services/saf/amf/amfd/siass.cc       |   4 +-
>  7 files changed, 109 insertions(+), 19 deletions(-)
>
>
> The SG becomes unstable because some variables used in nodegroup operation 
> are not
> restored after headless if this admin operation on nodegroup was interrupted 
> just
> before cluster goes into headless stage.
>
> In order to restore nodegroup operation, AMF needs to know exactly whether 
> nodegroup
> operation was running during headless up on @susi assignment. If susi is in 
> QUIESCED,
> QUIESCING or being removed while its related entities su, si, sg are not in 
> LOCKED
> and SHUTTING_DOWN, that means either node or nodegroup MUST be in LOCKED or 
> SHUTTING
> DOWN. In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know a 
> nodegroup
> operation was running. However, if saAmfNGAdminState is in LOCKED, this case 
> is an
> ambiguity of locking a node. The reason of differentiation of locking a node 
> or node
> group is because 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER 
> for node
> group operation while AVD_SG_FSM_SU_OPER is only used for node operation. 
> When 2N SG
> uses AVD_SG_FSM_SG_ADMIN for nodegroup, the saAmfSGAdminState is borrowed 
> (but not
> updated to IMM) to run the admin operation sequence. Therefore, after 
> headless if
> AVD_SG_FSM_SG_ADMIN was being used for nodegroup then saAmfSGAdminState also 
> needs to
> be set.
>
> Because SG FSM state is used to restore nodegroup during restoring susi 
> assignment,
> the osaAmfSGFsmState (RTA) needs to be read earlier than reading susi 
> assignment. This
> needs active AMFD become implementer earlier than reading sg object. There 
> was a known
> ticket reported in 1720, if only make active AMFD as early implementer than 
> it will
> cause the standby AMFD missing ccb apply callback.This patch also needs to 
> set both
> active and standby AMFD become implementer and applier earlier so that AMFD 
> can read
> osaAmfSGFsmState and do not cause regression of 1720.
>

> diff --git a/osaf/services/saf/amf/amfd/include/node.h 
> b/osaf/services/saf/amf/amfd/include/node.h
> --- a/osaf/services/saf/amf/amfd/include/node.h
> +++ b/osaf/services/saf/amf/amfd/include/node.h
> @@ -45,6 +45,7 @@
>  #include <set>
>  #include <vector>
>  #include <string>
> +#include <susi.h>
>
>  class AVD_SU;
>  struct avd_cluster_tag;
> @@ -232,4 +233,6 @@ extern void ng_complete_admin_op(AVD_AMF
>  extern void avd_ng_admin_state_set(AVD_AMF_NG* ng, SaAmfAdminStateT state);
>  extern bool are_all_ngs_in_unlocked_state(const AVD_AVND *node);
>  extern bool any_ng_in_locked_in_state(const AVD_AVND *node);
> +void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag* 
> susi);
> +
>  #endif
> diff --git a/osaf/services/saf/amf/amfd/include/sg.h 
> b/osaf/services/saf/amf/amfd/include/sg.h
> --- a/osaf/services/saf/amf/amfd/include/sg.h
> +++ b/osaf/services/saf/amf/amfd/include/sg.h
> @@ -46,6 +46,7 @@
>  class AVD_SU;
>  class AVD_SI;
>  class AVD_APP;
> +class AVD_AMF_NG;
>
>  /* The valid SG FSM states. */
>  typedef enum {
> diff --git a/osaf/services/saf/amf/amfd/nodegroup.cc 
> b/osaf/services/saf/amf/amfd/nodegroup.cc
> --- a/osaf/services/saf/amf/amfd/nodegroup.cc
> +++ b/osaf/services/saf/amf/amfd/nodegroup.cc
> @@ -1356,3 +1356,86 @@ void avd_ng_constructor(void)
>                       ng_ccb_apply_cb);
>  }
>
> +/*
> + * @brief: Restore nodegroup related variables which are used in nodegroup
> + *         operations before headless. The main variables are @admin_ng,
> + *         @ng_using_saAmfSGAdminState, and the saAmfSGAdminState which
> + *         is borrowed by nodegroup operation on 2N SG
> + * @param: control block, and susi is being read after headless
> + * @return: void
> + */
> +
> +void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag* 
> susi) {
> +
> +     TRACE_ENTER();
> +     osafassert(cb->scs_absence_max_duration > 0);
> +     // In order to restore nodegroup operation, AMF needs to know exactly
> +     // whether nodegroup operation was running during headless up on @susi
> +     // If susi is in QUIESCED/QUIESCING or being removed while its related
> +     // entities su, si, sg are not in LOCKED and SHUTTING_DOWN, that means
> +     // either node or nodegroup MUST be in LOCKED or SHUTTING_DOWN.
> +     // In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know
> +     // a nodegroup operation was running. However, if saAmfNGAdminState is
> +     // in LOCKED, this case is an ambiguity of locking a node.
> +     // The reason of differentiation of locking a node or nodegroup is 
> because
> +     // 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER for 
> nodegroup
> +     // operation while AVD_SG_FSM_SU_OPER is only used for node operation.
> +     // When 2N SG uses AVD_SG_FSM_SG_ADMIN for nodegroup, the 
> saAmfSGAdminState
> +     // is borrowed (but not updated to IMM) to run the admin operation 
> sequence.
> +     // Therefore, after headless if AVD_SG_FSM_SG_ADMIN was being used for 
> nodegroup
> +     // then saAmfSGAdminState also needs to be set.
> +
> +     // Make sure su, si, sg are not in LOCKED, SHUTTING_DOWN
> +     if (susi->su->saAmfSUAdminState != SA_AMF_ADMIN_LOCKED &&
> +                     susi->su->saAmfSUAdminState != 
> SA_AMF_ADMIN_SHUTTING_DOWN &&
> +                     susi->si->saAmfSIAdminState != SA_AMF_ADMIN_LOCKED &&
> +                     susi->si->saAmfSIAdminState != 
> SA_AMF_ADMIN_SHUTTING_DOWN &&
> +                     susi->su->sg_of_su->saAmfSGAdminState != 
> SA_AMF_ADMIN_LOCKED &&
> +                     susi->su->sg_of_su->saAmfSGAdminState != 
> SA_AMF_ADMIN_SHUTTING_DOWN) {
> +
> +             // susi are in state transition
> +             if (susi->state == SA_AMF_HA_QUIESCED || susi->state == 
> SA_AMF_HA_QUIESCING ||
> +                             susi->fsm == AVD_SU_SI_STATE_UNASGN) {
> +
> +             for (std::map<std::string, AVD_AMF_NG*>::const_iterator it = 
> nodegroup_db->begin();
> +                     it != nodegroup_db->end(); it++) {
> +                     AVD_AMF_NG *ng = it->second;
> +                     for (std::set<std::string>::const_iterator iter = 
> ng->saAmfNGNodeList.begin();
> +                                     iter != ng->saAmfNGNodeList.end(); 
> ++iter) {
> +                             AVD_AVND *node = avd_node_get(*iter);
> +
> +                             if (susi->su->su_on_node == node) {
> +                                     // Nodegroup was shutting down, also 
> needs to check whether nodegroup
> +                                     // operation was borrowing SG ADMIN FSM 
> code (for 2N only)
> +                                     if (ng->saAmfNGAdminState == 
> SA_AMF_ADMIN_SHUTTING_DOWN) {
> +                                             node->admin_ng = ng;
> +                                             if 
> (susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
> +                                                     
> osafassert(susi->su->sg_of_su->sg_redundancy_model == 
> SA_AMF_2N_REDUNDANCY_MODEL);
> +                                                             
> susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
> +                                                             
> susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
> +                                                             
> m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su, 
> AVSV_CKPT_SG_ADMIN_STATE);
> +                                             }
> +                                     }
> +                                     // Node group was LOCKED, AMFD treat 
> nodegroup and node operation in the same view,
> +                                     // except nodegroup was borrowing SG 
> ADMIN FSM. This is because the other case that
> +                                     // nodegroup operation was only SG 
> SU_OPER FSM, which is the same as of node operation
> +                                     if (ng->saAmfNGAdminState == 
> SA_AMF_ADMIN_LOCKED) {
> +                                             
> if(susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
> +                                                     
> osafassert(susi->su->sg_of_su->sg_redundancy_model == 
> SA_AMF_2N_REDUNDANCY_MODEL);
> +                                                             node->admin_ng 
> = ng;
> +                                                             
> susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
> +                                                             
> susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
> +                                                             
> m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su, 
> AVSV_CKPT_SG_ADMIN_STATE);
> +                                             }
> +                                     }
> +                                             TRACE("Restore nodegroup 
> operation, SG(%s), ng_using_saAmfSGAdminState: %u, saAmfSGAdminState:%u",
> +                                                             
> susi->su->sg_of_su->name.c_str(), 
> susi->su->sg_of_su->ng_using_saAmfSGAdminState,
> +                                                             
> susi->su->sg_of_su->saAmfSGAdminState);
> +
> +                             }
> +                     }
> +             }
> +             }
> +     }
> +     TRACE_LEAVE();
> +}
> diff --git a/osaf/services/saf/amf/amfd/role.cc 
> b/osaf/services/saf/amf/amfd/role.cc
> --- a/osaf/services/saf/amf/amfd/role.cc
> +++ b/osaf/services/saf/amf/amfd/role.cc
> @@ -260,6 +260,11 @@ uint32_t avd_active_role_initialization(
>
>       TRACE_ENTER();
>
> +     if (avd_imm_impl_set() != SA_AIS_OK) {
> +             LOG_ER("avd_imm_impl_set FAILED");
> +             goto done;
> +     }
> +
[Praveen] I do not know whether IMM waits for full cluster start up, but 
both IMMD and IMMND comes before AMF.If AMF becomes implementer before 
reading the objects, Will the CCB operations allowed by IMM?
If IMM does not wait for full cluster startup (AMF bcoming ready) then 
it can lead to accumulation of CCB callbacks in AMF mailbox waiting for 
Dispatch() being called from AMF.
Also what is the possibility that IMM will not reject CCBs when 
implementer is slow to respond.


>       if (avd_imm_config_get() != NCSCC_RC_SUCCESS) {
>               LOG_ER("avd_imm_config_get FAILED");
>               goto done;
> @@ -267,11 +272,6 @@ uint32_t avd_active_role_initialization(
>
>       cb->init_state = AVD_CFG_DONE;
>
> -     if (avd_imm_impl_set() != SA_AIS_OK) {
> -             LOG_ER("avd_imm_impl_set FAILED");
> -             goto done;
> -     }
> -
>       avd_imm_update_runtime_attrs();
>
>       status = NCSCC_RC_SUCCESS;
> @@ -294,6 +294,11 @@ uint32_t avd_standby_role_initialization
>
>       TRACE_ENTER();
>
> +     if (avd_imm_applier_set() != SA_AIS_OK) {
> +             LOG_ER("avd_imm_applier_set FAILED");
> +             goto done;
> +     }
> +
>       if (avd_imm_config_get() != NCSCC_RC_SUCCESS) {
>               LOG_ER("avd_imm_config_get FAILED");
>               goto done;
> @@ -301,11 +306,6 @@ uint32_t avd_standby_role_initialization
>
>       cb->init_state = AVD_CFG_DONE;
>
> -     if (avd_imm_applier_set() != SA_AIS_OK) {
> -             LOG_ER("avd_imm_applier_set FAILED");
> -             goto done;
> -     }
> -
>       status = NCSCC_RC_SUCCESS;
>  done:
>       TRACE_LEAVE();
> diff --git a/osaf/services/saf/amf/amfd/sg.cc 
> b/osaf/services/saf/amf/amfd/sg.cc
> --- a/osaf/services/saf/amf/amfd/sg.cc
> +++ b/osaf/services/saf/amf/amfd/sg.cc
> @@ -374,6 +374,13 @@ static AVD_SG *sg_create(const std::stri
>       if (immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfSGAdminState"), 
> attributes, 0, &sg->saAmfSGAdminState) != SA_AIS_OK) {
>               sg->saAmfSGAdminState = SA_AMF_ADMIN_UNLOCKED;
>       }
> +     // only read SG FSM State for non-ncs SG
> +     if (sg_name.find("safApp=OpenSAF") == std::string::npos) {
> +             if 
> (immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSGFsmState"), attributes, 
> 0, &sg->sg_fsm_state) != SA_AIS_OK) {
> +                     sg->sg_fsm_state = AVD_SG_FSM_STABLE;
> +             }
> +             TRACE("sg_fsm_state(%u) read from osafAmfSGFsmState", 
> sg->sg_fsm_state);
> +     }
>
>       /*  TODO use value in type instead? */
>       sg->sg_redundancy_model = sgt->saAmfSgtRedundancyModel;
> @@ -422,6 +429,7 @@ SaAisErrorT avd_sg_config_get(const std:
>               const_cast<SaImmAttrNameT>("saAmfSGSuRestartProb"),
>               const_cast<SaImmAttrNameT>("saAmfSGSuRestartMax"),
>               const_cast<SaImmAttrNameT>("saAmfSGAdminState"),
> +             const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
>               nullptr
>       };
>
> @@ -2094,10 +2102,8 @@ void avd_sg_read_headless_cached_rta(AVD
>       AVD_SG *sg;
>       unsigned int num_of_values = 0;
>       const SaImmAttrValuesT_2 **attributes;
> -     AVD_SG_FSM_STATE imm_sg_fsm_state;
>       const char *className = "SaAmfSG";
>       const SaImmAttrNameT searchAttributes[] = {
> -             const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
>               const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
>               NULL
>       };
> @@ -2122,11 +2128,6 @@ void avd_sg_read_headless_cached_rta(AVD
>                                       (SaImmAttrValuesT_2 ***)&attributes)) 
> == SA_AIS_OK) {
>               sg = sg_db->find(Amf::to_string(&sg_dn));
>               if (sg && sg->sg_ncs_spec == false) {
> -                     // Read sg fsm state
> -                     rc = 
> immutil_getAttr(const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
> -                                     attributes, 0, &imm_sg_fsm_state);
> -                     osafassert(rc == SA_AIS_OK);
> -                     sg->set_fsm_state(imm_sg_fsm_state, false);
>                       // Read sg operation list
>                       if 
> (immutil_getAttrValuesNumber(const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
>  attributes, &num_of_values) == SA_AIS_OK) {
>                               unsigned int i;
> diff --git a/osaf/services/saf/amf/amfd/sgproc.cc 
> b/osaf/services/saf/amf/amfd/sgproc.cc
> --- a/osaf/services/saf/amf/amfd/sgproc.cc
> +++ b/osaf/services/saf/amf/amfd/sgproc.cc
> @@ -1112,7 +1112,7 @@ void avd_su_si_assign_evh(AVD_CL_CB *cb,
>               }
>
>               if (su->list_of_susi == AVD_SU_SI_REL_NULL) {
> -                     LOG_ER("%s: no susis", __FUNCTION__);
> +                     LOG_WA("%s: no susis", __FUNCTION__);
>                       goto done;
>               }
>
> diff --git a/osaf/services/saf/amf/amfd/siass.cc 
> b/osaf/services/saf/amf/amfd/siass.cc
> --- a/osaf/services/saf/amf/amfd/siass.cc
> +++ b/osaf/services/saf/amf/amfd/siass.cc
> @@ -261,7 +261,9 @@ void avd_susi_read_headless_cached_rta(A
>                                               m_AVD_SET_SG_ADMIN_SI(cb, si);
>                               }
>                       }
> -
> +                     // only restore if not done
> +                     if (susi->su->su_on_node->admin_ng == nullptr)
> +                             avd_ng_restore_headless_states(cb, susi);
>               } else {
>                       // This susi does not exist after headless, but it's 
> still in IMM
>                       // delete it for now
>

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to