Hi Praveen, Ack (reviewed and regression tests run).
Thanks -----Original Message----- From: <[email protected]> Date: Wednesday, 15 February 2017 at 7:39 pm To: <[email protected]>, <[email protected]>, gary <[email protected]>, <[email protected]> Cc: <[email protected]> Subject: [PATCH 1 of 1] amfd: support for cluster reset recovery[#2065] src/amf/amfd/comp.cc | 5 ++- src/amf/amfd/comptype.cc | 5 ++- src/amf/amfd/sgproc.cc | 49 ++++++++++++++++++++++++++++++++++++ src/amf/amfd/util.cc | 40 ++++++++++++++++++++++++++++++ src/amf/amfd/util.h | 2 + src/amf/amfnd/cpm.cc | 3 +- src/amf/amfnd/err.cc | 64 +++++++++++++++++++++++++++++++++++------------ 7 files changed, 146 insertions(+), 22 deletions(-) Support for cluster reset recovery (SA_AMF_CLUSTER_RESET = 7) mentioned in B.04.01 section 3.11.1.3.4 Cluster Reset Recovery Action. -Use this by setting saAmfCompRecoveryOnError or saAmfCtDefRecoveryOnError in application configuration. Or -pass as argument in APIs: saAmfPmStart(), saAmfComponentErrorReport(), saAmfPmStart_3(), saAmfHealthcheckStart(), and saAmfComponentErrorReport_4(). TODO: AMFD will have to raise alarm for cluster reset. diff --git a/src/amf/amfd/comp.cc b/src/amf/amfd/comp.cc --- a/src/amf/amfd/comp.cc +++ b/src/amf/amfd/comp.cc @@ -2,6 +2,7 @@ * * (C) Copyright 2008 The OpenSAF Foundation * (C) Copyright 2017 Ericsson AB - All Rights Reserved. + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. 
* * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -368,7 +369,7 @@ static int is_config_valid(const std::st rc = immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfCompRecoveryOnError"), attributes, 0, &value); if (rc == SA_AIS_OK) { - if ((value < SA_AMF_NO_RECOMMENDATION) || (value > SA_AMF_NODE_FAILFAST)) { + if ((value < SA_AMF_NO_RECOMMENDATION) || (value > SA_AMF_CLUSTER_RESET)) { report_ccb_validation_error(opdata, "Illegal/unsupported saAmfCompRecoveryOnError value %u for '%s'", value, dn.c_str()); return 0; @@ -1186,7 +1187,7 @@ static SaAisErrorT ccb_completed_modify_ if (value_is_deleted == true) continue; uint32_t recovery = *((SaUint32T *)value); - if ((recovery < SA_AMF_NO_RECOMMENDATION) || (recovery > SA_AMF_CONTAINER_RESTART )) { + if ((recovery < SA_AMF_NO_RECOMMENDATION) || (recovery > SA_AMF_CLUSTER_RESET)) { report_ccb_validation_error(opdata, "Modification of saAmfCompRecoveryOnError Fail," " Invalid recovery =%d",recovery); goto done; diff --git a/src/amf/amfd/comptype.cc b/src/amf/amfd/comptype.cc --- a/src/amf/amfd/comptype.cc +++ b/src/amf/amfd/comptype.cc @@ -1,6 +1,7 @@ /* -*- OpenSAF -*- * * (C) Copyright 2008 The OpenSAF Foundation + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. 
* * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -335,7 +336,7 @@ static bool config_is_valid(const std::s rc = immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfCtDefRecoveryOnError"), attributes, 0, &value); osafassert(rc == SA_AIS_OK); - if ((value < SA_AMF_NO_RECOMMENDATION) || (value > SA_AMF_NODE_FAILFAST)) { + if ((value < SA_AMF_NO_RECOMMENDATION) || (value > SA_AMF_CLUSTER_RESET)) { report_ccb_validation_error(opdata, "Illegal/unsupported saAmfCtDefRecoveryOnError value %u for '%s'", value, dn.c_str()); return false; @@ -646,7 +647,7 @@ static SaAisErrorT ccb_completed_modify_ goto done; } uint32_t value = *((SaUint32T *)mod->modAttr.attrValues[0]); - if ((value < SA_AMF_COMPONENT_RESTART) || (value > SA_AMF_NODE_FAILFAST)) { + if ((value < SA_AMF_COMPONENT_RESTART) || (value > SA_AMF_CLUSTER_RESET)) { report_ccb_validation_error(opdata, "Invalid saAmfCtDefRecoveryOnError for '%s'", dn); rc = SA_AIS_ERR_BAD_OPERATION; diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc --- a/src/amf/amfd/sgproc.cc +++ b/src/amf/amfd/sgproc.cc @@ -1,6 +1,7 @@ /* -*- OpenSAF -*- * * (C) Copyright 2008 The OpenSAF Foundation + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -626,6 +627,46 @@ static void perform_nodeswitchover_recov done: TRACE_LEAVE(); } + +/** + * @brief Performs Cluster reset recovery. + **/ +static void perform_cluster_reset_recovery() { + TRACE_ENTER(); + uint32_t rc = NCSCC_RC_SUCCESS; + AVD_AVND *node = nullptr; + for (std::map<uint32_t, AVD_AVND *>::const_iterator it = node_id_db->begin(); + it != node_id_db->end(); it++) { + node = it->second; + //First reboot payloads. 
+ if ((node->node_info.nodeId == avd_cb->node_id_avd) || + (node->node_info.nodeId == avd_cb->node_id_avd_other)) + continue; + TRACE_1("node:'%s', nodeId:%x", node->name.c_str(), node->node_info.nodeId); + rc = avd_send_reboot_msg_directly(node); + if (rc != NCSCC_RC_SUCCESS) + TRACE_1("Send failed fpr Reboot msg to payload."); + } + + //Send for standby. + node = nullptr; + node = avd_node_find_nodeid(avd_cb->node_id_avd_other); + if (node != nullptr) { + rc = avd_send_reboot_msg_directly(node); + if (rc != NCSCC_RC_SUCCESS) + TRACE_1("Send failed for Reboot msg to standby."); + } + + //Send for self. + node = nullptr; + node = avd_node_find_nodeid(avd_cb->node_id_avd); + osafassert(node != nullptr); + rc = avd_send_reboot_msg_directly(node); + if (rc != NCSCC_RC_SUCCESS) + TRACE_1("Send failed for Reboot msg to active."); + + TRACE_LEAVE(); +} /***************************************************************************** * Function: avd_su_oper_state_func * @@ -692,6 +733,8 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb } else if (n2d_msg->msg_info.n2d_opr_state.rec_rcvr.saf_amf == SA_AMF_NODE_FAILOVER) { saflog(LOG_NOTICE, amfSvcUsrName, "Node Fail-Over requested by '%s'", node->name.c_str()); + } else if (n2d_msg->msg_info.n2d_opr_state.rec_rcvr.saf_amf == SA_AMF_CLUSTER_RESET) { + saflog(LOG_NOTICE, amfSvcUsrName, "Cluster reset requested by '%s'", node->name.c_str()); } /* Verify that the SU and node oper state is diabled and rcvr is failfast */ @@ -819,6 +862,12 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb perform_nodeswitchover_recovery(su->su_on_node); goto done; break; + case SA_AMF_CLUSTER_RESET: + perform_cluster_reset_recovery(); + LOG_WA("Wait for reboot"); + for (;;) + sleep(1); + break; default : break; } diff --git a/src/amf/amfd/util.cc b/src/amf/amfd/util.cc --- a/src/amf/amfd/util.cc +++ b/src/amf/amfd/util.cc @@ -1,6 +1,7 @@ /* -*- OpenSAF -*- * * (C) Copyright 2008 The OpenSAF Foundation + * Copyright (C) 2017, Oracle and/or its affiliates. 
All rights reserved. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -2125,3 +2126,42 @@ void avd_association_namet_init(const st child.erase(std::remove(child.begin(), child.end(), '\\'), child.end()); } +/** + * @brief Sends reboot msg to node directly without queueing it up in + * AMFD message queue. + * @param ptr to AVD_AVND. + * @return NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. + **/ +uint32_t avd_send_reboot_msg_directly(AVD_AVND *node) { + NCSMDS_INFO snd_mds = {0}; + uint32_t rc = NCSCC_RC_SUCCESS; + AVD_DND_MSG *d2n_msg = new AVD_DND_MSG(); + + d2n_msg->msg_type = AVSV_D2N_REBOOT_MSG; + d2n_msg->msg_info.d2n_reboot_info.node_id = node->node_info.nodeId; + + if (node->adest == 0) { + LOG_WA("Invalid adest for %x, msg type %u", + node->node_info.nodeId, d2n_msg->msg_type); + rc = NCSCC_RC_FAILURE; + goto done; + } + d2n_msg->msg_info.d2n_reboot_info.msg_id = ++(node->snd_msg_id); + + TRACE("Sending REBOOT MSG to %x", node->node_info.nodeId); + + snd_mds.i_mds_hdl = avd_cb->adest_hdl; + snd_mds.i_svc_id = NCSMDS_SVC_ID_AVD; + snd_mds.i_op = MDS_SEND; + snd_mds.info.svc_send.i_msg = (NCSCONTEXT)d2n_msg; + snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_AVND; + snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH; + snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_SND; + snd_mds.info.svc_send.info.snd.i_to_dest = node->adest; + if ((rc = ncsmds_api(&snd_mds)) != NCSCC_RC_SUCCESS) { + LOG_ER("ncsmds_api failed %u", rc); + } +done: + delete d2n_msg; + return rc; +} diff --git a/src/amf/amfd/util.h b/src/amf/amfd/util.h --- a/src/amf/amfd/util.h +++ b/src/amf/amfd/util.h @@ -1,6 +1,7 @@ /* -*- OpenSAF -*- * * (C) Copyright 2008 The OpenSAF Foundation + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. 
* * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -104,5 +105,6 @@ extern const char *admin_op_name(SaAmfAd int compare_sanamet(const std::string& lhs, const std::string& rhs); uint32_t avd_snd_compcsi_msg(AVD_COMP *comp, AVD_CSI *csi, avd_comp_csi_rel_tag *compcsi, AVSV_COMPCSI_ACT act); +uint32_t avd_send_reboot_msg_directly(AVD_AVND *node); #endif // AMF_AMFD_UTIL_H_ diff --git a/src/amf/amfnd/cpm.cc b/src/amf/amfnd/cpm.cc --- a/src/amf/amfnd/cpm.cc +++ b/src/amf/amfnd/cpm.cc @@ -1,6 +1,7 @@ /* -*- OpenSAF -*- * * (C) Copyright 2008 The OpenSAF Foundation + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -556,7 +557,7 @@ void avnd_comp_pm_param_val(AVND_CB *cb, *o_amf_rc = SA_AIS_ERR_NOT_EXIST; return; } - if ((pm_start->rec_rcvr.saf_amf >= SA_AMF_CLUSTER_RESET) && + if ((pm_start->rec_rcvr.saf_amf > SA_AMF_CLUSTER_RESET) && (pm_start->rec_rcvr.saf_amf <= SA_AMF_CONTAINER_RESTART)) { *o_amf_rc = SA_AIS_ERR_NOT_SUPPORTED; return; diff --git a/src/amf/amfnd/err.cc b/src/amf/amfnd/err.cc --- a/src/amf/amfnd/err.cc +++ b/src/amf/amfnd/err.cc @@ -1,6 +1,7 @@ /* -*- OpenSAF -*- * * (C) Copyright 2008 The OpenSAF Foundation + * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved. 
* * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -189,11 +190,15 @@ uint32_t avnd_evt_ava_err_rep_evh(AVND_C amf_rc = SA_AIS_ERR_INVALID_PARAM; } - if(comp && ((err_rep->rec_rcvr.saf_amf == SA_AMF_CLUSTER_RESET) || - (err_rep->rec_rcvr.saf_amf == SA_AMF_APPLICATION_RESTART)|| + if(comp && ((err_rep->rec_rcvr.saf_amf == SA_AMF_APPLICATION_RESTART)|| (err_rep->rec_rcvr.saf_amf == SA_AMF_CONTAINER_RESTART))) amf_rc = SA_AIS_ERR_NOT_SUPPORTED; + if (comp && (comp->su->is_ncs == true) && + (err_rep->rec_rcvr.saf_amf == SA_AMF_CLUSTER_RESET)) { + LOG_NO("Cluster Reset recovery not supported for MW components '%s'", comp->name.c_str()); + amf_rc = SA_AIS_ERR_NOT_SUPPORTED; + } /* send the response back to AvA */ rc = avnd_amf_resp_send(cb, AVSV_AMF_ERR_REP, amf_rc, 0, &api_info->dest, &evt->mds_ctxt, comp, msg_from_avnd); @@ -296,6 +301,45 @@ uint32_t avnd_evt_ava_err_clear_evh(AVND return rc; } +/** + * @brief Performs cluster reset recovery action. + * + * @param cb: ptr to AvND control block. + * @param su: ptr to the SU which contains the failed component. + * @param comp: ptr to failed component. + * + * @return NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE. + */ +static uint32_t avnd_err_rcvr_cluster_reset(AVND_CB *cb, AVND_SU *failed_su, AVND_COMP *failed_comp) { + uint32_t rc = NCSCC_RC_SUCCESS; + TRACE_ENTER(); + + m_AVND_COMP_FAILED_SET(failed_comp); + m_AVND_SU_FAILED_SET(failed_su); + + m_AVND_COMP_OPER_STATE_SET(failed_comp, SA_AMF_OPERATIONAL_DISABLED); + rc = avnd_comp_oper_state_avd_sync(cb, failed_comp); + if (NCSCC_RC_SUCCESS != rc) + goto done; + + rc = avnd_comp_curr_info_del(cb, failed_comp); + if (NCSCC_RC_SUCCESS != rc) + goto done; + + //AMFD will not send any assignments, so clean up PI/NPI comp. 
+ rc = avnd_comp_clc_fsm_run(cb, failed_comp, AVND_COMP_CLC_PRES_FSM_EV_CLEANUP); + if (NCSCC_RC_SUCCESS != rc) + goto done; + + cb->oper_state = SA_AMF_OPERATIONAL_DISABLED; + m_AVND_SU_OPER_STATE_SET(failed_su, SA_AMF_OPERATIONAL_DISABLED); + rc = avnd_di_oper_send(cb, failed_su, SA_AMF_CLUSTER_RESET); + +done: + TRACE_LEAVE2("%u", rc); + return rc; +} + /**************************************************************************** Name : avnd_err_process @@ -532,20 +576,6 @@ uint32_t avnd_err_recover(AVND_CB *cb, A return rc; } - /* if we are already inst-failed, do nothing */ - if ((su->pres == SA_AMF_PRESENCE_INSTANTIATION_FAILED) && - (comp->pres == SA_AMF_PRESENCE_TERMINATING) && (rcvr != SA_AMF_NODE_FAILOVER) - && (rcvr != SA_AMF_NODE_FAILFAST)) { - rc = avnd_comp_clc_fsm_run(cb, comp, AVND_COMP_CLC_PRES_FSM_EV_CLEANUP); - return rc; - } - - /* if we are already terminating do nothing */ - if ((comp->pres == SA_AMF_PRESENCE_TERMINATING) && (rcvr == SA_AMF_COMPONENT_RESTART)) { - rc = avnd_comp_clc_fsm_run(cb, comp, AVND_COMP_CLC_PRES_FSM_EV_CLEANUP); - return rc; - } - /* When SU is in TERMINATING state, higher level recovery (SA_AMF_NODE_FAILOVER, SA_AMF_NODE_FAILFAST and SA_AMF_NODE_SWITCHOVER) should be processed because higher level recovery will terminate the component. If the faulted component has recovery @@ -595,7 +625,7 @@ uint32_t avnd_err_recover(AVND_CB *cb, A break; case SA_AMF_CLUSTER_RESET: - /* not supported */ + rc = avnd_err_rcvr_cluster_reset(cb, su, comp); break; case AVSV_ERR_RCVR_SU_RESTART: ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
