Hi Minh,
One minor comment, patch restores node->admin_ng pointers and for
clearing these pointers it relies on process_su_si_response_for_ng().
But process_su_si_response_for_ng() will clear them for all nodes in
first assignment response after headless. Because of there will problem
in cases in which su/node faults with node-failover recovery after
headless state. Functions avd_node_down_appl_susi_failover() and
sg_su_failover_func() calls process_su_si_response_for_ng() based on
admin_ng pointer which has been cleared in first assignment response.
So SG will remain in LOCKED state.
This can be reproduced by following test case:
1)Bring one controller and 1 payload up.
2)Host amf demo on PL-3 with sufailover recovery.
3)lock the ng but do not respond for callbacks on both SUs.
4)Restart SC-1.
5)First respond for standby SU. In process_su_si_response_for_ng(),
AMFD will clear all the admin_ng pointers.
6)Now kill comp in SU1. It will fail with SUfailover recovery.
7)From sg_su_failover_func(), AMFD will never call
process_su_si_response_for_ng() because of if condition not satisfied:
/* If nodegroup level operation is finished on all the nodes,
reply to imm.*/
if (su->su_on_node->admin_ng != nullptr)
process_su_si_response_for_ng(su, res);
I think this case we had discussed.
Anyways, I think this can be fixed in other phases where admin
operations + escalations are considered. I think as a fix only ORing
with ng_using_sgadmin is required.
Ack for the patch with some minor comments below.
Thanks,
Praveen
On 12-Sep-16 6:23 PM, Minh Hon Chau wrote:
> osaf/services/saf/amf/amfd/include/node.h | 3 +
> osaf/services/saf/amf/amfd/include/sg.h | 5 +-
> osaf/services/saf/amf/amfd/ndfsm.cc | 3 +-
> osaf/services/saf/amf/amfd/nodegroup.cc | 83
> +++++++++++++++++++++++++++++++
> osaf/services/saf/amf/amfd/sg.cc | 52 +++++++++++++++++-
> osaf/services/saf/amf/amfd/sgproc.cc | 2 +-
> osaf/services/saf/amf/amfd/siass.cc | 4 +-
> 7 files changed, 143 insertions(+), 9 deletions(-)
>
>
> The SG becomes unstable because some variables used in nodegroup operation
> are not
> restored after headless if this admin operation on nodegroup was interrupted
> just
> before cluster goes into headless stage.
>
> In order to restore nodegroup operation, AMF needs to know exactly whether
> nodegroup
> operation was running during headless up on @susi assignment. If susi is in
> QUIESCED,
> QUIESCING or being removed while its related entities su, si, sg are not in
> LOCKED
> and SHUTTING_DOWN, that means either node or nodegroup MUST be in LOCKED or
> SHUTTING
> DOWN. In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know a
> nodegroup
> operation was running. However, if saAmfNGAdminState is in LOCKED, this case
> is an
> ambiguity of locking a node. The reason of differentiation of locking a node
> or node
> group is because 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER
> for node
> group operation while AVD_SG_FSM_SU_OPER is only used for node operation.
> When 2N SG
> uses AVD_SG_FSM_SG_ADMIN for nodegroup, the saAmfSGAdminState is borrowed
> (but not
> updated to IMM) to run the admin operation sequence. Therefore, after
> headless if
> AVD_SG_FSM_SG_ADMIN was being used for nodegroup then saAmfSGAdminState also
> needs to
> be set.
>
> diff --git a/osaf/services/saf/amf/amfd/include/node.h
> b/osaf/services/saf/amf/amfd/include/node.h
> --- a/osaf/services/saf/amf/amfd/include/node.h
> +++ b/osaf/services/saf/amf/amfd/include/node.h
> @@ -45,6 +45,7 @@
> #include <set>
> #include <vector>
> #include <string>
> +#include <susi.h>
>
> class AVD_SU;
> struct avd_cluster_tag;
> @@ -232,4 +233,6 @@ extern void ng_complete_admin_op(AVD_AMF
> extern void avd_ng_admin_state_set(AVD_AMF_NG* ng, SaAmfAdminStateT state);
> extern bool are_all_ngs_in_unlocked_state(const AVD_AVND *node);
> extern bool any_ng_in_locked_in_state(const AVD_AVND *node);
> +void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag*
> susi);
> +
> #endif
> diff --git a/osaf/services/saf/amf/amfd/include/sg.h
> b/osaf/services/saf/amf/amfd/include/sg.h
> --- a/osaf/services/saf/amf/amfd/include/sg.h
> +++ b/osaf/services/saf/amf/amfd/include/sg.h
> @@ -46,6 +46,7 @@
> class AVD_SU;
> class AVD_SI;
> class AVD_APP;
> +class AVD_AMF_NG;
>
> /* The valid SG FSM states. */
> typedef enum {
> @@ -576,8 +577,8 @@ private:
> #define m_AVD_SET_SG_ADMIN_SI(cb,si) (si)->sg_of_si->set_admin_si((si))
> #define m_AVD_CLEAR_SG_ADMIN_SI(cb,sg) (sg)->clear_admin_si()
> #define m_AVD_CHK_OPLIST(i_su,flag) (flag) =
> (i_su)->sg_of_su->in_su_oper_list(i_su)
> -
> -void avd_sg_read_headless_cached_rta(AVD_CL_CB *cb);
> +void avd_sg_read_headless_fsm_state_cached_rta(AVD_CL_CB *cb);
> +void avd_sg_read_headless_su_oper_list_cached_rta(AVD_CL_CB *cb);
>
> extern void avd_sg_delete(AVD_SG *sg);
> extern void avd_sg_db_add(AVD_SG *sg);
> diff --git a/osaf/services/saf/amf/amfd/ndfsm.cc
> b/osaf/services/saf/amf/amfd/ndfsm.cc
> --- a/osaf/services/saf/amf/amfd/ndfsm.cc
> +++ b/osaf/services/saf/amf/amfd/ndfsm.cc
> @@ -129,8 +129,9 @@ void avd_process_state_info_queue(AVD_CL
> // Reading sg must be after reading susi
> if (found_state_info == true) {
> avd_compcsi_cleanup_imm_object(cb);
> + avd_sg_read_headless_fsm_state_cached_rta(cb);
> avd_susi_read_headless_cached_rta(cb);
> - avd_sg_read_headless_cached_rta(cb);
> + avd_sg_read_headless_su_oper_list_cached_rta(cb);
> avd_su_read_headless_cached_rta(cb);
> }
> done:
> diff --git a/osaf/services/saf/amf/amfd/nodegroup.cc
> b/osaf/services/saf/amf/amfd/nodegroup.cc
> --- a/osaf/services/saf/amf/amfd/nodegroup.cc
> +++ b/osaf/services/saf/amf/amfd/nodegroup.cc
> @@ -1356,3 +1356,86 @@ void avd_ng_constructor(void)
> ng_ccb_apply_cb);
> }
>
> +/*
> + * @brief: Restore nodegroup related variables which are used in nodegroup
> + * operations before headless. The main variables are @admin_ng,
> + * @ng_using_saAmfSGAdminState, and the saAmfSGAdminState which
> + * is borrowed by nodegroup operation on 2N SG
> + * @param: control block, and susi is being read after headless
> + * @return: void
> + */
> +
> +void avd_ng_restore_headless_states(AVD_CL_CB *cb, struct avd_su_si_rel_tag*
> susi) {
> +
> + TRACE_ENTER();
> + osafassert(cb->scs_absence_max_duration > 0);
> + // In order to restore nodegroup operation, AMF needs to know exactly
> + // whether nodegroup operation was running during headless up on @susi
> + // If susi is in QUIESCED/QUIESCING or being removed while its related
> + // entities su, si, sg are not in LOCKED and SHUTTING_DOWN, that means
> + // either node or nodegroup MUST be in LOCKED or SHUTTING_DOWN.
> + // In case of SHUTTING_DOWN saAmfNGAdminState, that's enough to know
> + // a nodegroup operation was running. However, if saAmfNGAdminState is
> + // in LOCKED, this case is an ambiguity of locking a node.
> + // The reason of differentiation of locking a node or nodegroup is
> because
> + // 2N SG uses both AVD_SG_FSM_SG_ADMIN and AVD_SG_FSM_SU_OPER for
> nodegroup
> + // operation while AVD_SG_FSM_SU_OPER is only used for node operation.
> + // When 2N SG uses AVD_SG_FSM_SG_ADMIN for nodegroup, the
> saAmfSGAdminState
> + // is borrowed (but not updated to IMM) to run the admin operation
> sequence.
> + // Therefore, after headless if AVD_SG_FSM_SG_ADMIN was being used for
> nodegroup
> + // then saAmfSGAdminState also needs to be set.
> +
> + // Make sure su, si, sg are not in LOCKED, SHUTTING_DOWN
> + if (susi->su->saAmfSUAdminState != SA_AMF_ADMIN_LOCKED &&
> + susi->su->saAmfSUAdminState !=
> SA_AMF_ADMIN_SHUTTING_DOWN &&
> + susi->si->saAmfSIAdminState != SA_AMF_ADMIN_LOCKED &&
> + susi->si->saAmfSIAdminState !=
> SA_AMF_ADMIN_SHUTTING_DOWN &&
> + susi->su->sg_of_su->saAmfSGAdminState !=
> SA_AMF_ADMIN_LOCKED &&
> + susi->su->sg_of_su->saAmfSGAdminState !=
> SA_AMF_ADMIN_SHUTTING_DOWN) {
> +
> + // susi are in state transition
> + if (susi->state == SA_AMF_HA_QUIESCED || susi->state ==
> SA_AMF_HA_QUIESCING ||
> + susi->fsm == AVD_SU_SI_STATE_UNASGN) {
> +
> + for (std::map<std::string, AVD_AMF_NG*>::const_iterator it =
> nodegroup_db->begin();
> + it != nodegroup_db->end(); it++) {
> + AVD_AMF_NG *ng = it->second;
> + for (std::set<std::string>::const_iterator iter =
> ng->saAmfNGNodeList.begin();
> + iter != ng->saAmfNGNodeList.end();
> ++iter) {
> + AVD_AVND *node = avd_node_get(*iter);
> +
> + if (susi->su->su_on_node == node) {
> + // Nodegroup was shutting down, also
> needs to check whether nodegroup
> + // operation was borrowing SG ADMIN FSM
> code (for 2N only)
> + if (ng->saAmfNGAdminState ==
> SA_AMF_ADMIN_SHUTTING_DOWN) {
> + node->admin_ng = ng;
> + if
> (susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
> +
> osafassert(susi->su->sg_of_su->sg_redundancy_model ==
> SA_AMF_2N_REDUNDANCY_MODEL);
> +
> susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
> +
> susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
> +
> m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su,
> AVSV_CKPT_SG_ADMIN_STATE);
> + }
> + }
> + // Node group was LOCKED, AMFD treat
> nodegroup and node operation in the same view,
> + // except nodegroup was borrowing SG
> ADMIN FSM. This is because the other case that
> + // nodegroup operation was only SG
> SU_OPER FSM, which is the same as of node operation
> + if (ng->saAmfNGAdminState ==
> SA_AMF_ADMIN_LOCKED) {
> +
> if(susi->su->sg_of_su->sg_fsm_state == AVD_SG_FSM_SG_ADMIN) {
> +
> osafassert(susi->su->sg_of_su->sg_redundancy_model ==
> SA_AMF_2N_REDUNDANCY_MODEL);
> + node->admin_ng
> = ng;
> +
> susi->su->sg_of_su->ng_using_saAmfSGAdminState = true;
> +
> susi->su->sg_of_su->saAmfSGAdminState = ng->saAmfNGAdminState;
> +
> m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, susi->su->sg_of_su,
> AVSV_CKPT_SG_ADMIN_STATE);
> + }
> + }
> + TRACE("Restore nodegroup
> operation, SG(%s), ng_using_saAmfSGAdminState: %u, saAmfSGAdminState:%u",
> +
> susi->su->sg_of_su->name.c_str(),
> susi->su->sg_of_su->ng_using_saAmfSGAdminState,
> +
> susi->su->sg_of_su->saAmfSGAdminState);
[Praveen] If we get one NG which was undergoing admin operation, then I
think we should break from both the loops.
Admin operations are sequential.
It can be done as a part of refactoring later.
> +
> + }
> + }
> + }
> + }
> + }
> + TRACE_LEAVE();
> +}
> diff --git a/osaf/services/saf/amf/amfd/sg.cc
> b/osaf/services/saf/amf/amfd/sg.cc
> --- a/osaf/services/saf/amf/amfd/sg.cc
> +++ b/osaf/services/saf/amf/amfd/sg.cc
> @@ -374,7 +374,6 @@ static AVD_SG *sg_create(const std::stri
> if (immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfSGAdminState"),
> attributes, 0, &sg->saAmfSGAdminState) != SA_AIS_OK) {
> sg->saAmfSGAdminState = SA_AMF_ADMIN_UNLOCKED;
> }
> -
> /* TODO use value in type instead? */
> sg->sg_redundancy_model = sgt->saAmfSgtRedundancyModel;
>
> @@ -2083,7 +2082,7 @@ uint32_t AVD_SG::curr_non_instantiated_s
> (su->saAmfSUPresenceState ==
> SA_AMF_PRESENCE_UNINSTANTIATED));}));
> }
>
> -void avd_sg_read_headless_cached_rta(AVD_CL_CB *cb)
> +void avd_sg_read_headless_fsm_state_cached_rta(AVD_CL_CB *cb)
> {
>
> SaAisErrorT rc;
> @@ -2092,13 +2091,11 @@ void avd_sg_read_headless_cached_rta(AVD
>
> SaNameT sg_dn;
> AVD_SG *sg;
> - unsigned int num_of_values = 0;
> const SaImmAttrValuesT_2 **attributes;
> AVD_SG_FSM_STATE imm_sg_fsm_state;
> const char *className = "SaAmfSG";
> const SaImmAttrNameT searchAttributes[] = {
> const_cast<SaImmAttrNameT>("osafAmfSGFsmState"),
> - const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
> NULL
> };
>
> @@ -2127,6 +2124,53 @@ void avd_sg_read_headless_cached_rta(AVD
> attributes, 0, &imm_sg_fsm_state);
> osafassert(rc == SA_AIS_OK);
> sg->set_fsm_state(imm_sg_fsm_state, false);
> + }
> + }
> +
> + (void)immutil_saImmOmSearchFinalize(searchHandle);
> +
> +done:
> + TRACE_LEAVE();
> +
> +}
> +
> +void avd_sg_read_headless_su_oper_list_cached_rta(AVD_CL_CB *cb)
> +{
> +
> + SaAisErrorT rc;
> + SaImmSearchHandleT searchHandle;
> + SaImmSearchParametersT_2 searchParam;
> +
> + SaNameT sg_dn;
> + AVD_SG *sg;
> + unsigned int num_of_values = 0;
> + const SaImmAttrValuesT_2 **attributes;
> + const char *className = "SaAmfSG";
> + const SaImmAttrNameT searchAttributes[] = {
> + const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
> + NULL
> + };
> +
> + TRACE_ENTER();
> +
> + osafassert(cb->scs_absence_max_duration > 0);
> +
> + searchParam.searchOneAttr.attrName =
> const_cast<SaImmAttrNameT>("SaImmAttrClassName");
> + searchParam.searchOneAttr.attrValueType = SA_IMM_ATTR_SASTRINGT;
> + searchParam.searchOneAttr.attrValue = &className;
> +
> + if ((rc = immutil_saImmOmSearchInitialize_2(cb->immOmHandle, NULL,
> SA_IMM_SUBTREE,
> + SA_IMM_SEARCH_ONE_ATTR | SA_IMM_SEARCH_GET_SOME_ATTR,
> &searchParam,
> + searchAttributes, &searchHandle)) != SA_AIS_OK) {
> +
> + LOG_ER("%s: saImmOmSearchInitialize_2 failed: %u",
> __FUNCTION__, rc);
> + goto done;
> + }
> +
> + while ((rc = immutil_saImmOmSearchNext_2(searchHandle, &sg_dn,
> + (SaImmAttrValuesT_2 ***)&attributes))
> == SA_AIS_OK) {
> + sg = sg_db->find(Amf::to_string(&sg_dn));
> + if (sg && sg->sg_ncs_spec == false) {
> // Read sg operation list
> if
> (immutil_getAttrValuesNumber(const_cast<SaImmAttrNameT>("osafAmfSGSuOperationList"),
> attributes, &num_of_values) == SA_AIS_OK) {
> unsigned int i;
> diff --git a/osaf/services/saf/amf/amfd/sgproc.cc
> b/osaf/services/saf/amf/amfd/sgproc.cc
> --- a/osaf/services/saf/amf/amfd/sgproc.cc
> +++ b/osaf/services/saf/amf/amfd/sgproc.cc
> @@ -1098,7 +1098,7 @@ void avd_su_si_assign_evh(AVD_CL_CB *cb,
> }
>
> if (su->list_of_susi == AVD_SU_SI_REL_NULL) {
> - LOG_ER("%s: no susis", __FUNCTION__);
> + LOG_WA("%s: no susis", __FUNCTION__);
> goto done;
> }
>
> diff --git a/osaf/services/saf/amf/amfd/siass.cc
> b/osaf/services/saf/amf/amfd/siass.cc
> --- a/osaf/services/saf/amf/amfd/siass.cc
> +++ b/osaf/services/saf/amf/amfd/siass.cc
> @@ -261,7 +261,9 @@ void avd_susi_read_headless_cached_rta(A
> m_AVD_SET_SG_ADMIN_SI(cb, si);
> }
> }
> -
> + // only restore if not done
> + if (susi->su->su_on_node->admin_ng == nullptr)
> + avd_ng_restore_headless_states(cb, susi);
[Praveen][ Since admin operations are sequential, I think
avd_ng_restore_headless_states() function
can return ptr to NG or NULL. Now for setting admin_ng for any other
SUSI when we know admin operated NG we can use it to avoid calls to
avd_ng_restore_headless_states(). Like:
if ((susi->su->su_on_node->admin_ng == nullptr) && (admin_ng ==
nullptr)) {
admin_ng = avd_ng_restore_headless_states(cb, susi);
} else if ((node_in_nodegroup(susi->su->su_on_node->admin_ng ,
admin_ng) == true) &&
(admin_ng->saAmfNGAdminState == LOCKED or SHUTTING_DOWN)) {
susi->su->su_on_node->admin_ng = admin_ng;
}
It can be done as a part of refactoring later.
> } else {
> // This susi does not exist after headless, but it's
> still in IMM
> // delete it for now
>
------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel