src/amf/amfd/comp.cc | 5 ++-
src/amf/amfd/comptype.cc | 5 ++-
src/amf/amfd/sgproc.cc | 49 ++++++++++++++++++++++++++++++++++++
src/amf/amfd/util.cc | 40 ++++++++++++++++++++++++++++++
src/amf/amfd/util.h | 2 +
src/amf/amfnd/cpm.cc | 3 +-
src/amf/amfnd/err.cc | 64 +++++++++++++++++++++++++++++++++++------------
7 files changed, 146 insertions(+), 22 deletions(-)
Add support for the cluster reset recovery action (SA_AMF_CLUSTER_RESET = 7)
described in AMF B.04.01, section 3.11.1.3.4 (Cluster Reset Recovery Action).
Cluster reset can be requested in either of two ways:
- by setting saAmfCompRecoveryOnError or saAmfCtDefRecoveryOnError in the
  application configuration, or
- by passing it as an argument to one of these APIs:
  saAmfPmStart(), saAmfComponentErrorReport(), saAmfPmStart_3(),
  saAmfHealthcheckStart(), and saAmfComponentErrorReport_4().
TODO: AMFD still needs to raise an alarm for cluster reset.
diff --git a/src/amf/amfd/comp.cc b/src/amf/amfd/comp.cc
--- a/src/amf/amfd/comp.cc
+++ b/src/amf/amfd/comp.cc
@@ -2,6 +2,7 @@
*
* (C) Copyright 2008 The OpenSAF Foundation
* (C) Copyright 2017 Ericsson AB - All Rights Reserved.
+ * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -368,7 +369,7 @@ static int is_config_valid(const std::st
rc =
immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfCompRecoveryOnError"),
attributes, 0, &value);
if (rc == SA_AIS_OK) {
- if ((value < SA_AMF_NO_RECOMMENDATION) || (value >
SA_AMF_NODE_FAILFAST)) {
+ if ((value < SA_AMF_NO_RECOMMENDATION) || (value >
SA_AMF_CLUSTER_RESET)) {
report_ccb_validation_error(opdata,
"Illegal/unsupported saAmfCompRecoveryOnError value %u for '%s'",
value, dn.c_str());
return 0;
@@ -1186,7 +1187,7 @@ static SaAisErrorT ccb_completed_modify_
if (value_is_deleted == true)
continue;
uint32_t recovery = *((SaUint32T *)value);
- if ((recovery < SA_AMF_NO_RECOMMENDATION) || (recovery
> SA_AMF_CONTAINER_RESTART )) {
+ if ((recovery < SA_AMF_NO_RECOMMENDATION) || (recovery
> SA_AMF_CLUSTER_RESET)) {
report_ccb_validation_error(opdata,
"Modification of saAmfCompRecoveryOnError Fail,"
" Invalid recovery
=%d",recovery);
goto done;
diff --git a/src/amf/amfd/comptype.cc b/src/amf/amfd/comptype.cc
--- a/src/amf/amfd/comptype.cc
+++ b/src/amf/amfd/comptype.cc
@@ -1,6 +1,7 @@
/* -*- OpenSAF -*-
*
* (C) Copyright 2008 The OpenSAF Foundation
+ * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -335,7 +336,7 @@ static bool config_is_valid(const std::s
rc =
immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfCtDefRecoveryOnError"),
attributes, 0, &value);
osafassert(rc == SA_AIS_OK);
- if ((value < SA_AMF_NO_RECOMMENDATION) || (value >
SA_AMF_NODE_FAILFAST)) {
+ if ((value < SA_AMF_NO_RECOMMENDATION) || (value >
SA_AMF_CLUSTER_RESET)) {
report_ccb_validation_error(opdata, "Illegal/unsupported
saAmfCtDefRecoveryOnError value %u for '%s'",
value, dn.c_str());
return false;
@@ -646,7 +647,7 @@ static SaAisErrorT ccb_completed_modify_
goto done;
}
uint32_t value = *((SaUint32T
*)mod->modAttr.attrValues[0]);
- if ((value < SA_AMF_COMPONENT_RESTART) || (value >
SA_AMF_NODE_FAILFAST)) {
+ if ((value < SA_AMF_COMPONENT_RESTART) || (value >
SA_AMF_CLUSTER_RESET)) {
report_ccb_validation_error(opdata,
"Invalid saAmfCtDefRecoveryOnError for
'%s'", dn);
rc = SA_AIS_ERR_BAD_OPERATION;
diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc
--- a/src/amf/amfd/sgproc.cc
+++ b/src/amf/amfd/sgproc.cc
@@ -1,6 +1,7 @@
/* -*- OpenSAF -*-
*
* (C) Copyright 2008 The OpenSAF Foundation
+ * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -626,6 +627,46 @@ static void perform_nodeswitchover_recov
done:
TRACE_LEAVE();
}
+
+/**
+ * @brief Performs Cluster reset recovery.
+ *        Sends a reboot message to every payload node first, then to the
+ *        standby controller (if found) and finally to the active controller.
+ *        A failed send is only traced; the remaining nodes are still tried.
+ **/
+static void perform_cluster_reset_recovery() {
+  TRACE_ENTER();
+
+  // First reboot payloads; both controllers are skipped in this loop.
+  for (auto it = node_id_db->begin(); it != node_id_db->end(); ++it) {
+    AVD_AVND *payload = it->second;
+    if ((payload->node_info.nodeId == avd_cb->node_id_avd) ||
+        (payload->node_info.nodeId == avd_cb->node_id_avd_other)) {
+      continue;
+    }
+    TRACE_1("node:'%s', nodeId:%x", payload->name.c_str(),
+            payload->node_info.nodeId);
+    if (avd_send_reboot_msg_directly(payload) != NCSCC_RC_SUCCESS) {
+      TRACE_1("Send failed for Reboot msg to payload.");
+    }
+  }
+
+  // Send for standby.
+  AVD_AVND *standby = avd_node_find_nodeid(avd_cb->node_id_avd_other);
+  if ((standby != nullptr) &&
+      (avd_send_reboot_msg_directly(standby) != NCSCC_RC_SUCCESS)) {
+    TRACE_1("Send failed for Reboot msg to standby.");
+  }
+
+  // Send for self, after all other nodes have been handled.
+  AVD_AVND *self = avd_node_find_nodeid(avd_cb->node_id_avd);
+  osafassert(self != nullptr);
+  if (avd_send_reboot_msg_directly(self) != NCSCC_RC_SUCCESS) {
+    TRACE_1("Send failed for Reboot msg to active.");
+  }
+
+  TRACE_LEAVE();
+}
/*****************************************************************************
* Function: avd_su_oper_state_func
*
@@ -692,6 +733,8 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb
} else if (n2d_msg->msg_info.n2d_opr_state.rec_rcvr.saf_amf ==
SA_AMF_NODE_FAILOVER) {
saflog(LOG_NOTICE, amfSvcUsrName, "Node Fail-Over requested by
'%s'",
node->name.c_str());
+ } else if (n2d_msg->msg_info.n2d_opr_state.rec_rcvr.saf_amf ==
SA_AMF_CLUSTER_RESET) {
+ saflog(LOG_NOTICE, amfSvcUsrName, "Cluster reset requested by
'%s'", node->name.c_str());
}
/* Verify that the SU and node oper state is diabled and rcvr is
failfast */
@@ -819,6 +862,12 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb
perform_nodeswitchover_recovery(su->su_on_node);
goto done;
break;
+ case SA_AMF_CLUSTER_RESET:
+ perform_cluster_reset_recovery();
+ LOG_WA("Wait for reboot");
+ for (;;)
+ sleep(1);
+ break;
default :
break;
}
diff --git a/src/amf/amfd/util.cc b/src/amf/amfd/util.cc
--- a/src/amf/amfd/util.cc
+++ b/src/amf/amfd/util.cc
@@ -1,6 +1,7 @@
/* -*- OpenSAF -*-
*
* (C) Copyright 2008 The OpenSAF Foundation
+ * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -2125,3 +2126,42 @@ void avd_association_namet_init(const st
child.erase(std::remove(child.begin(), child.end(), '\\'), child.end());
}
+/**
+ * @brief Sends reboot msg to node directly without queueing it up in
+ *        AMFD message queue.
+ * @param node ptr to AVD_AVND; rejected if null or without a valid adest.
+ * @return NCSCC_RC_SUCCESS, NCSCC_RC_FAILURE or the ncsmds_api() error code.
+ **/
+uint32_t avd_send_reboot_msg_directly(AVD_AVND *node) {
+  if (node == nullptr) {
+    LOG_WA("Reboot msg not sent: no node given");
+    return NCSCC_RC_FAILURE;
+  }
+  if (node->adest == 0) {
+    LOG_WA("Invalid adest for %x, msg type %u", node->node_info.nodeId,
+           AVSV_D2N_REBOOT_MSG);
+    return NCSCC_RC_FAILURE;
+  }
+
+  AVD_DND_MSG *d2n_msg = new AVD_DND_MSG();
+  d2n_msg->msg_type = AVSV_D2N_REBOOT_MSG;
+  d2n_msg->msg_info.d2n_reboot_info.node_id = node->node_info.nodeId;
+  d2n_msg->msg_info.d2n_reboot_info.msg_id = ++(node->snd_msg_id);
+  TRACE("Sending REBOOT MSG to %x", node->node_info.nodeId);
+
+  NCSMDS_INFO snd_mds = {0};
+  snd_mds.i_mds_hdl = avd_cb->adest_hdl;
+  snd_mds.i_svc_id = NCSMDS_SVC_ID_AVD;
+  snd_mds.i_op = MDS_SEND;
+  snd_mds.info.svc_send.i_msg = (NCSCONTEXT)d2n_msg;
+  snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_AVND;
+  snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH;
+  snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_SND;
+  snd_mds.info.svc_send.info.snd.i_to_dest = node->adest;
+  const uint32_t rc = ncsmds_api(&snd_mds);
+  if (rc != NCSCC_RC_SUCCESS) {
+    LOG_ER("ncsmds_api failed %u", rc);
+  }
+  // The message was allocated above; release it once ncsmds_api() returned.
+  delete d2n_msg;
+  return rc;
+}
diff --git a/src/amf/amfd/util.h b/src/amf/amfd/util.h
--- a/src/amf/amfd/util.h
+++ b/src/amf/amfd/util.h
@@ -1,6 +1,7 @@
/* -*- OpenSAF -*-
*
* (C) Copyright 2008 The OpenSAF Foundation
+ * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -104,5 +105,6 @@ extern const char *admin_op_name(SaAmfAd
int compare_sanamet(const std::string& lhs, const std::string& rhs);
uint32_t avd_snd_compcsi_msg(AVD_COMP *comp, AVD_CSI *csi,
avd_comp_csi_rel_tag *compcsi, AVSV_COMPCSI_ACT act);
+uint32_t avd_send_reboot_msg_directly(AVD_AVND *node);
#endif // AMF_AMFD_UTIL_H_
diff --git a/src/amf/amfnd/cpm.cc b/src/amf/amfnd/cpm.cc
--- a/src/amf/amfnd/cpm.cc
+++ b/src/amf/amfnd/cpm.cc
@@ -1,6 +1,7 @@
/* -*- OpenSAF -*-
*
* (C) Copyright 2008 The OpenSAF Foundation
+ * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -556,7 +557,7 @@ void avnd_comp_pm_param_val(AVND_CB *cb,
*o_amf_rc = SA_AIS_ERR_NOT_EXIST;
return;
}
- if ((pm_start->rec_rcvr.saf_amf >=
SA_AMF_CLUSTER_RESET) &&
+ if ((pm_start->rec_rcvr.saf_amf > SA_AMF_CLUSTER_RESET)
&&
(pm_start->rec_rcvr.saf_amf <=
SA_AMF_CONTAINER_RESTART)) {
*o_amf_rc = SA_AIS_ERR_NOT_SUPPORTED;
return;
diff --git a/src/amf/amfnd/err.cc b/src/amf/amfnd/err.cc
--- a/src/amf/amfnd/err.cc
+++ b/src/amf/amfnd/err.cc
@@ -1,6 +1,7 @@
/* -*- OpenSAF -*-
*
* (C) Copyright 2008 The OpenSAF Foundation
+ * Copyright (C) 2017, Oracle and/or its affiliates. All rights reserved.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -189,11 +190,15 @@ uint32_t avnd_evt_ava_err_rep_evh(AVND_C
amf_rc = SA_AIS_ERR_INVALID_PARAM;
}
- if(comp && ((err_rep->rec_rcvr.saf_amf == SA_AMF_CLUSTER_RESET) ||
- (err_rep->rec_rcvr.saf_amf ==
SA_AMF_APPLICATION_RESTART)||
+ if(comp && ((err_rep->rec_rcvr.saf_amf == SA_AMF_APPLICATION_RESTART)||
(err_rep->rec_rcvr.saf_amf ==
SA_AMF_CONTAINER_RESTART)))
amf_rc = SA_AIS_ERR_NOT_SUPPORTED;
+ if (comp && (comp->su->is_ncs == true) &&
+ (err_rep->rec_rcvr.saf_amf == SA_AMF_CLUSTER_RESET)) {
+ LOG_NO("Cluster Reset recovery not supported for MW components
'%s'", comp->name.c_str());
+ amf_rc = SA_AIS_ERR_NOT_SUPPORTED;
+ }
/* send the response back to AvA */
rc = avnd_amf_resp_send(cb, AVSV_AMF_ERR_REP, amf_rc, 0,
&api_info->dest, &evt->mds_ctxt, comp, msg_from_avnd);
@@ -296,6 +301,45 @@ uint32_t avnd_evt_ava_err_clear_evh(AVND
return rc;
}
+/**
+ * @brief Performs cluster reset recovery action.
+ *
+ * Marks the component and its SU failed, cleans the component up and sends
+ * the SU oper state with SA_AMF_CLUSTER_RESET. Stops at the first failure.
+ *
+ * @param cb: ptr to AvND control block.
+ * @param failed_su: ptr to the SU which contains the failed component.
+ * @param failed_comp: ptr to failed component.
+ *
+ * @return NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE.
+ */
+static uint32_t avnd_err_rcvr_cluster_reset(AVND_CB *cb, AVND_SU *failed_su,
+                                            AVND_COMP *failed_comp) {
+  TRACE_ENTER();
+
+  m_AVND_COMP_FAILED_SET(failed_comp);
+  m_AVND_SU_FAILED_SET(failed_su);
+  m_AVND_COMP_OPER_STATE_SET(failed_comp, SA_AMF_OPERATIONAL_DISABLED);
+
+  uint32_t rc = avnd_comp_oper_state_avd_sync(cb, failed_comp);
+  if (rc == NCSCC_RC_SUCCESS) {
+    rc = avnd_comp_curr_info_del(cb, failed_comp);
+  }
+  if (rc == NCSCC_RC_SUCCESS) {
+    // AMFD will not send any assignments, so clean up PI/NPI comp.
+    rc = avnd_comp_clc_fsm_run(cb, failed_comp,
+                               AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
+  }
+  if (rc == NCSCC_RC_SUCCESS) {
+    cb->oper_state = SA_AMF_OPERATIONAL_DISABLED;
+    m_AVND_SU_OPER_STATE_SET(failed_su, SA_AMF_OPERATIONAL_DISABLED);
+    rc = avnd_di_oper_send(cb, failed_su, SA_AMF_CLUSTER_RESET);
+  }
+
+  TRACE_LEAVE2("%u", rc);
+  return rc;
+}
+
/****************************************************************************
Name : avnd_err_process
@@ -532,20 +576,6 @@ uint32_t avnd_err_recover(AVND_CB *cb, A
return rc;
}
- /* if we are already inst-failed, do nothing */
- if ((su->pres == SA_AMF_PRESENCE_INSTANTIATION_FAILED) &&
- (comp->pres == SA_AMF_PRESENCE_TERMINATING) && (rcvr !=
SA_AMF_NODE_FAILOVER)
- && (rcvr != SA_AMF_NODE_FAILFAST)) {
- rc = avnd_comp_clc_fsm_run(cb, comp,
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
- return rc;
- }
-
- /* if we are already terminating do nothing */
- if ((comp->pres == SA_AMF_PRESENCE_TERMINATING) && (rcvr ==
SA_AMF_COMPONENT_RESTART)) {
- rc = avnd_comp_clc_fsm_run(cb, comp,
AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
- return rc;
- }
-
/* When SU is in TERMINATING state, higher level recovery
(SA_AMF_NODE_FAILOVER,
SA_AMF_NODE_FAILFAST and SA_AMF_NODE_SWITCHOVER) should be processed
because higher
level recovery will terminate the component. If the faulted
component has recovery
@@ -595,7 +625,7 @@ uint32_t avnd_err_recover(AVND_CB *cb, A
break;
case SA_AMF_CLUSTER_RESET:
- /* not supported */
+ rc = avnd_err_rcvr_cluster_reset(cb, su, comp);
break;
case AVSV_ERR_RCVR_SU_RESTART:
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel