osaf/services/saf/amf/amfd/cluster.cc | 69 ++++- osaf/services/saf/amf/amfd/comp.cc | 8 +- osaf/services/saf/amf/amfd/csi.cc | 107 +++++++ osaf/services/saf/amf/amfd/imm.cc | 58 ++++ osaf/services/saf/amf/amfd/include/cb.h | 5 + osaf/services/saf/amf/amfd/include/cluster.h | 1 + osaf/services/saf/amf/amfd/include/csi.h | 2 + osaf/services/saf/amf/amfd/include/db_template.h | 1 + osaf/services/saf/amf/amfd/include/evt.h | 3 + osaf/services/saf/amf/amfd/include/mds.h | 7 +- osaf/services/saf/amf/amfd/include/msg.h | 2 +- osaf/services/saf/amf/amfd/include/node.h | 3 + osaf/services/saf/amf/amfd/include/proc.h | 7 + osaf/services/saf/amf/amfd/include/sg.h | 16 +- osaf/services/saf/amf/amfd/include/si.h | 1 + osaf/services/saf/amf/amfd/include/susi.h | 3 + osaf/services/saf/amf/amfd/include/timer.h | 1 + osaf/services/saf/amf/amfd/include/util.h | 2 +- osaf/services/saf/amf/amfd/main.cc | 24 + osaf/services/saf/amf/amfd/mds.cc | 4 +- osaf/services/saf/amf/amfd/ndfsm.cc | 325 ++++++++++++++++++++++- osaf/services/saf/amf/amfd/ndmsg.cc | 18 +- osaf/services/saf/amf/amfd/ndproc.cc | 103 +++++++- osaf/services/saf/amf/amfd/node.cc | 17 +- osaf/services/saf/amf/amfd/sg.cc | 57 ++++ osaf/services/saf/amf/amfd/sg_2n_fsm.cc | 140 +++++++++ osaf/services/saf/amf/amfd/sg_nored_fsm.cc | 6 + osaf/services/saf/amf/amfd/sg_npm_fsm.cc | 24 + osaf/services/saf/amf/amfd/sg_nway_fsm.cc | 24 + osaf/services/saf/amf/amfd/sg_nwayact_fsm.cc | 6 + osaf/services/saf/amf/amfd/sgproc.cc | 47 ++- osaf/services/saf/amf/amfd/si.cc | 43 ++- osaf/services/saf/amf/amfd/siass.cc | 121 ++++++++ osaf/services/saf/amf/amfd/su.cc | 19 +- 34 files changed, 1207 insertions(+), 67 deletions(-)
Outlined changes: . node_up_msg event handling has changed so that amfd can collect the sync information sent from amfnd . A Node Sync timer is introduced as the window in which amfnd can sync after headless . Failover may happen during headless; adjust_delayed_failover() balances the assignments in terms of active/standby availability . SI dependencies can also change due to assignment removal during headless; adjust_delayed_sidep() updates the SI dependencies diff --git a/osaf/services/saf/amf/amfd/cluster.cc b/osaf/services/saf/amf/amfd/cluster.cc --- a/osaf/services/saf/amf/amfd/cluster.cc +++ b/osaf/services/saf/amf/amfd/cluster.cc @@ -25,6 +25,7 @@ #include <imm.h> #include <evt.h> #include <proc.h> +#include <si_dep.h> /* Singleton cluster object */ static AVD_CLUSTER _avd_cluster; @@ -52,6 +53,7 @@ AVD_CLUSTER *avd_cluster = &_avd_cluster void avd_cluster_tmr_init_evh(AVD_CL_CB *cb, AVD_EVT *evt) { TRACE_ENTER(); + AVD_SU *su = nullptr; saflog(LOG_NOTICE, amfSvcUsrName, "Cluster startup timeout, assigning SIs to SUs"); osafassert(evt->info.tmr.type == AVD_TMR_CL_INIT); @@ -74,19 +76,84 @@ void avd_cluster_tmr_init_evh(AVD_CL_CB * system that are not NCS specific. */ + /* The SI Dependency could be broken due to failover or instantiation/ + * termination failure during headless. + * adjust_delayed_sidep() removes SI(s) assignment which has any + * unassigned sponsored SI. + * + */ + if (cb->scs_absence_max_duration > 0) { + adjust_delayed_sidep(); + } + for (std::map<std::string, AVD_SG*>::const_iterator it = sg_db->begin(); it != sg_db->end(); it++) { AVD_SG *i_sg = it->second; if ((i_sg->list_of_su.empty() == true) || (i_sg->sg_ncs_spec == true)) { continue; } - i_sg->realign(cb, i_sg); + + /* If hydra is enabled and su failover happened during headless, + * currently only the active assignment is removed but the standby + * assignment has not been switched to active. + * adjust_delayed_failover() finds the standby assignment being + * moved to active. 
+ * Next, the realign() will find the remaining possible su-si + * to satisfy the number of assignment configuration. + */ + if (cb->scs_absence_max_duration > 0) { + i_sg->adjust_delayed_failover(cb); + } + if (i_sg->sg_fsm_state == AVD_SG_FSM_STABLE) + i_sg->realign(cb, i_sg); + } + + if (cb->scs_absence_max_duration > 0) { + TRACE("check if any SU is auto repair enabled"); + + for (std::map<std::string, AVD_SU*>::const_iterator it = su_db->begin(); + it != su_db->end(); it++) { + + su = it->second; + + if (su->list_of_susi == nullptr && + su->su_on_node != nullptr && + su->su_on_node->saAmfNodeOperState == SA_AMF_OPERATIONAL_ENABLED) { + su_try_repair(su); + } + } } done: TRACE_LEAVE(); } +/**************************************************************************** + * Name : avd_node_sync_tmr_evh + * + * Description : This is the node sync timer expiry handler routine + * + * Arguments : cb - AvD cb + * evt - ptr to the received event + * + * Return Values : None (the handler is declared void) + * + * Notes : None. 
+ ***************************************************************************/ +void avd_node_sync_tmr_evh(AVD_CL_CB *cb, AVD_EVT *evt) +{ + TRACE_ENTER(); + + osafassert(evt->info.tmr.type == AVD_TMR_NODE_SYNC); + LOG_NO("NodeSync timeout"); + + // Setting true here to indicate the node sync window has closed + // Further node up message will be treated specially + cb->node_sync_window_closed = true; + + TRACE_LEAVE(); +} + static void ccb_apply_modify_hdlr(struct CcbUtilOperationData *opdata) { const SaImmAttrModificationT_2 *attr_mod; diff --git a/osaf/services/saf/amf/amfd/comp.cc b/osaf/services/saf/amf/amfd/comp.cc --- a/osaf/services/saf/amf/amfd/comp.cc +++ b/osaf/services/saf/amf/amfd/comp.cc @@ -161,7 +161,13 @@ void avd_comp_pres_state_set(AVD_COMP *c node->name.value); LOG_NO("Node Failfast for '%s' as '%s' enters Term/Inst Failed state", node->name.value,comp->comp_info.name.value); - avd_d2n_reboot_snd(node); + + if (node->node_state < AVD_AVND_STATE_PRESENT) { + // reboot when node_up is processed + node->reboot = true; + } else { + avd_d2n_reboot_snd(node); + } } TRACE_LEAVE(); } diff --git a/osaf/services/saf/amf/amfd/csi.cc b/osaf/services/saf/amf/amfd/csi.cc --- a/osaf/services/saf/amf/amfd/csi.cc +++ b/osaf/services/saf/amf/amfd/csi.cc @@ -1409,3 +1409,110 @@ bool are_sponsor_csis_assigned_in_su(AVD return true; } +/** + * Clean up COMPCSI objects by searching for SaAmfCSIAssignment instances in IMM + * @return SA_AIS_OK when OK + */ +SaAisErrorT avd_compcsi_cleanup(void) +{ + SaAisErrorT rc; + SaImmSearchHandleT searchHandle; + SaImmSearchParametersT_2 searchParam; + const char *className = "SaAmfCSIAssignment"; + + TRACE_ENTER(); + + searchParam.searchOneAttr.attrName = const_cast<SaImmAttrNameT>("SaImmAttrClassName"); + searchParam.searchOneAttr.attrValueType = SA_IMM_ATTR_SASTRINGT; + searchParam.searchOneAttr.attrValue = &className; + + if ((rc = immutil_saImmOmSearchInitialize_2(avd_cb->immOmHandle, nullptr, SA_IMM_SUBTREE, + 
SA_IMM_SEARCH_ONE_ATTR | SA_IMM_SEARCH_GET_NO_ATTR, &searchParam, + nullptr, &searchHandle)) != SA_AIS_OK) { + LOG_ER("%s: saImmOmSearchInitialize_2 failed: %u", __FUNCTION__, rc); + goto done; + } + + SaNameT csiass_name; + const SaImmAttrValuesT_2 **attributes; + while ((rc = immutil_saImmOmSearchNext_2(searchHandle, &csiass_name, + (SaImmAttrValuesT_2 ***)&attributes)) == SA_AIS_OK) { + avd_saImmOiRtObjectDelete(&csiass_name); + } + + (void)immutil_saImmOmSearchFinalize(searchHandle); + +done: + TRACE_LEAVE(); + return SA_AIS_OK; +} + +/** + * Re-create csi assignment and update comp related states, which are + * collected after headless + * Update relevant runtime attributes + * @return SA_AIS_OK when OK + */ +SaAisErrorT avd_compcsi_recreate(AVSV_N2D_ND_CSICOMP_STATE_MSG_INFO *info) +{ + AVD_SU_SI_REL *susi; + const AVD_SI *si; + AVD_CSI *csi; + AVD_COMP *comp; + const AVSV_CSICOMP_STATE_MSG *csicomp; + const AVSV_COMP_STATE_MSG *comp_state; + + TRACE_ENTER(); + + for (csicomp = info->csicomp_list; csicomp != nullptr; csicomp=csicomp->next) { + csi = csi_db->find(Amf::to_string(&csicomp->safCSI)); + osafassert(csi); + + comp = comp_db->find(Amf::to_string(&csicomp->safComp)); + osafassert(comp); + + TRACE("Received CSICOMP state msg: csi %s, comp %s", + (char*)&csicomp->safCSI.value, (char*)&csicomp->safComp.value); + + si = csi->si; + osafassert(si); + + susi = avd_susi_find(avd_cb, &comp->su->name, &si->name); + if (susi == 0) { + LOG_ER("SU_SI_REL record for SU '%s' and SI '%s' was not found", + comp->su->name.value, si->name.value); + return SA_AIS_ERR_NOT_EXIST; + } + + AVD_COMP_CSI_REL *compcsi = avd_compcsi_create(susi, csi, comp, true); + osafassert(compcsi); + } + + for (comp_state = info->comp_list; comp_state != nullptr; comp_state = comp_state->next) { + comp = comp_db->find(Amf::to_string(&comp_state->safComp)); + osafassert(comp); + + // operation state + avd_comp_oper_state_set(comp, + 
static_cast<SaAmfOperationalStateT>(comp_state->comp_oper_state)); + + // . update saAmfCompReadinessState after SU:saAmfSuReadinessState + // . saAmfCompCurrProxyName and saAmfCompCurrProxiedNames wouldn't change during headless + // so they need not to update + + // presense state + avd_comp_pres_state_set(comp, + static_cast<SaAmfPresenceStateT>(comp_state->comp_pres_state)); + + // restart count + comp->saAmfCompRestartCount = comp_state->comp_restart_cnt; + m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, comp, AVSV_CKPT_COMP_RESTART_COUNT); + avd_saImmOiRtObjectUpdate(&comp->comp_info.name, + const_cast<SaImmAttrNameT>("saAmfCompRestartCount"), SA_IMM_ATTR_SAUINT32T, + &comp->saAmfCompRestartCount); + } + + TRACE_LEAVE(); + return SA_AIS_OK; +} + diff --git a/osaf/services/saf/amf/amfd/imm.cc b/osaf/services/saf/amf/amfd/imm.cc --- a/osaf/services/saf/amf/amfd/imm.cc +++ b/osaf/services/saf/amf/amfd/imm.cc @@ -1245,6 +1245,60 @@ static const SaImmOiCallbacksT_2 avd_cal }; /***************************************************************************** + * Function: hydra_config_get + * + * Purpose: This function checks if Hydra configuration is enabled in IMM + * then set the corresponding value to scs_absence_max_duration variable in + * avd_cb. + * + * Input: None. + * + * Returns: SaAisErrorT + * + * NOTES: If IMM attribute fetching fails that means Hydra + * configuration is disabled. 
+ * + **************************************************************************/ +static SaAisErrorT hydra_config_get(void) +{ + SaAisErrorT rc = SA_AIS_OK; + const SaImmAttrValuesT_2 **attributes; + SaImmAccessorHandleT accessorHandle; + SaNameT dn = {0, "opensafImm=opensafImm,safApp=safImmService"}; + SaImmAttrNameT attrName = const_cast<SaImmAttrNameT>("scAbsenceAllowed"); + SaImmAttrNameT attributeNames[] = {attrName, nullptr}; + const SaUint32T *value = nullptr; + + TRACE_ENTER(); + + dn.length = strlen((char *)dn.value); + + immutil_saImmOmAccessorInitialize(avd_cb->immOmHandle, &accessorHandle); + rc = immutil_saImmOmAccessorGet_2(accessorHandle, &dn, attributeNames, + (SaImmAttrValuesT_2 ***)&attributes); + + if (rc != SA_AIS_OK) { + LOG_WA("saImmOmAccessorGet_2 FAILED %u for %s", rc, dn.value); + goto done; + } + + value = immutil_getUint32Attr(attributes, attrName, 0); + if (value == nullptr) { + LOG_WA("immutil_getUint32Attr FAILED for %s", dn.value); + goto done; + } + + avd_cb->scs_absence_max_duration = *value; + +done: + immutil_saImmOmAccessorFinalize(accessorHandle); + LOG_IN("scs_absence_max_duration: %d", avd_cb->scs_absence_max_duration); + + TRACE_LEAVE(); + return SA_AIS_OK; +} + +/***************************************************************************** * Function: avd_imm_init * * Purpose: This function Initialize the OI interface and get a selection @@ -1501,6 +1555,10 @@ unsigned int avd_imm_config_get(void) if (avd_sidep_config_get() != SA_AIS_OK) goto done; + /* retrieve hydra configuration from IMM */ + if (hydra_config_get() != SA_AIS_OK) + goto done; + // SGs needs to adjust configuration once all instances have been added { for (std::map<std::string, AVD_SG*>::const_iterator it = sg_db->begin(); diff --git a/osaf/services/saf/amf/amfd/include/cb.h b/osaf/services/saf/amf/amfd/include/cb.h --- a/osaf/services/saf/amf/amfd/include/cb.h +++ b/osaf/services/saf/amf/amfd/include/cb.h @@ -182,6 +182,7 @@ typedef struct cl_cb_tag { 
SaClmNodeIdT node_avd_failed; /* node id where AVD is down */ AVD_TMR amf_init_tmr; /* The timer for amf initialisation. */ + AVD_TMR node_sync_tmr; /* The timer for reception of all node_up from all PLs. */ AVD_TMR heartbeat_tmr; /* The timer for sending heart beats to nd. */ SaTimeT heartbeat_tmr_period; @@ -215,6 +216,8 @@ typedef struct cl_cb_tag { * Used to skip usage of dependent services in the no-active state */ bool active_services_exist; + bool all_nodes_synced; + bool node_sync_window_closed; /* A list of those SIs for which SI dep tolerance timer is running. @@ -228,6 +231,8 @@ typedef struct cl_cb_tag { */ std::list<AVD_SI*> sis_in_Tolerance_Timer_state; + /* The duration that amfd should tolerate the absence of SCs */ + uint32_t scs_absence_max_duration; } AVD_CL_CB; extern AVD_CL_CB *avd_cb; diff --git a/osaf/services/saf/amf/amfd/include/cluster.h b/osaf/services/saf/amf/amfd/include/cluster.h --- a/osaf/services/saf/amf/amfd/include/cluster.h +++ b/osaf/services/saf/amf/amfd/include/cluster.h @@ -38,6 +38,7 @@ extern AVD_CLUSTER *avd_cluster; extern SaAisErrorT avd_cluster_config_get(void); extern void avd_cluster_tmr_init_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); +extern void avd_node_sync_tmr_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); extern void avd_cluster_constructor(void); #endif diff --git a/osaf/services/saf/amf/amfd/include/csi.h b/osaf/services/saf/amf/amfd/include/csi.h --- a/osaf/services/saf/amf/amfd/include/csi.h +++ b/osaf/services/saf/amf/amfd/include/csi.h @@ -184,5 +184,7 @@ extern AVD_CSI *csi_create(const SaNameT extern bool csi_assignment_validate(AVD_SG *sg); extern SaAisErrorT csi_assign_hdlr(AVD_CSI *csi); extern bool are_sponsor_csis_assigned_in_su(AVD_CSI *dep_csi, AVD_SU *su); +SaAisErrorT avd_compcsi_cleanup(void); +SaAisErrorT avd_compcsi_recreate(AVSV_N2D_ND_CSICOMP_STATE_MSG_INFO *info); #endif diff --git a/osaf/services/saf/amf/amfd/include/db_template.h b/osaf/services/saf/amf/amfd/include/db_template.h --- 
a/osaf/services/saf/amf/amfd/include/db_template.h +++ b/osaf/services/saf/amf/amfd/include/db_template.h @@ -44,6 +44,7 @@ class AmfDb { const_iterator begin() const {return db.begin();} const_iterator end() const {return db.end();} + typename AmfDbMap::size_type size() const {return db.size();} const_reverse_iterator rbegin() const {return db.rbegin();} const_reverse_iterator rend() const {return db.rend();} diff --git a/osaf/services/saf/amf/amfd/include/evt.h b/osaf/services/saf/amf/amfd/include/evt.h --- a/osaf/services/saf/amf/amfd/include/evt.h +++ b/osaf/services/saf/amf/amfd/include/evt.h @@ -51,10 +51,13 @@ typedef enum avd_evt_type { AVD_EVT_SHUTDOWN_APP_SU_MSG, AVD_EVT_VERIFY_ACK_NACK_MSG, AVD_EVT_COMP_VALIDATION_MSG, + AVD_EVT_ND_SISU_STATE_INFO_MSG, + AVD_EVT_ND_CSICOMP_STATE_INFO_MSG, AVD_EVT_MSG_MAX, AVD_EVT_TMR_SND_HB = AVD_EVT_MSG_MAX, AVD_EVT_TMR_CL_INIT, AVD_EVT_TMR_SI_DEP_TOL, + AVD_EVT_TMR_NODE_SYNC, AVD_EVT_TMR_MAX, AVD_EVT_MDS_AVD_UP = AVD_EVT_TMR_MAX, AVD_EVT_MDS_AVD_DOWN, diff --git a/osaf/services/saf/amf/amfd/include/mds.h b/osaf/services/saf/amf/amfd/include/mds.h --- a/osaf/services/saf/amf/amfd/include/mds.h +++ b/osaf/services/saf/amf/amfd/include/mds.h @@ -33,13 +33,13 @@ /* In Service upgrade support */ #define AVD_MDS_SUB_PART_VERSION_4 4 -#define AVD_MDS_SUB_PART_VERSION 5 +#define AVD_MDS_SUB_PART_VERSION 6 #define AVD_AVND_SUBPART_VER_MIN 1 -#define AVD_AVND_SUBPART_VER_MAX 5 +#define AVD_AVND_SUBPART_VER_MAX 6 #define AVD_AVD_SUBPART_VER_MIN 1 -#define AVD_AVD_SUBPART_VER_MAX 5 +#define AVD_AVD_SUBPART_VER_MAX 6 /* Message format versions */ #define AVD_AVD_MSG_FMT_VER_1 1 @@ -47,6 +47,7 @@ #define AVD_AVD_MSG_FMT_VER_3 3 #define AVD_AVD_MSG_FMT_VER_4 4 #define AVD_AVD_MSG_FMT_VER_5 5 +#define AVD_AVD_MSG_FMT_VER_6 6 uint32_t avd_mds_set_vdest_role(struct cl_cb_tag *cb, SaAmfHAStateT role); uint32_t avd_mds_init(struct cl_cb_tag *cb); diff --git a/osaf/services/saf/amf/amfd/include/msg.h 
b/osaf/services/saf/amf/amfd/include/msg.h --- a/osaf/services/saf/amf/amfd/include/msg.h +++ b/osaf/services/saf/amf/amfd/include/msg.h @@ -37,7 +37,7 @@ #include <amf_d2nmsg.h> typedef enum { - AVD_D2D_CHANGE_ROLE_REQ = AVSV_DND_MSG_MAX, + AVD_D2D_CHANGE_ROLE_REQ = AVSV_D2D_CHANGE_ROLE_REQ, AVD_D2D_CHANGE_ROLE_RSP, AVD_D2D_MSG_MAX, } AVD_D2D_MSG_TYPE; diff --git a/osaf/services/saf/amf/amfd/include/node.h b/osaf/services/saf/amf/amfd/include/node.h --- a/osaf/services/saf/amf/amfd/include/node.h +++ b/osaf/services/saf/amf/amfd/include/node.h @@ -144,6 +144,8 @@ class AVD_AVND { bool clm_change_start_preceded; /* to indicate there was CLM start cbk before CLM completed cb. */ bool recvr_fail_sw; /* to indicate there was node reboot because of node failover/switchover.*/ AVD_AMF_NG *admin_ng; /* points to the nodegroup on which admin operation is going on.*/ + uint16_t node_up_msg_count; /* to count of node_up msg that director had received from this node */ + bool reboot; private: void initialize(); // disallow copy and assign @@ -206,6 +208,7 @@ extern AVD_AVND *avd_node_getnext(const extern uint32_t avd_node_add_nodeid(AVD_AVND *avnd); extern void avd_node_delete_nodeid(AVD_AVND *node); extern AVD_AVND *avd_node_find_nodeid(SaClmNodeIdT node_id); +extern AVD_AVND *avd_node_get(const SaNameT *dn); extern SaAisErrorT avd_node_config_get(void); extern void avd_node_state_set(AVD_AVND *node, AVD_AVND_STATE node_state); extern void avd_node_oper_state_set(AVD_AVND *node, SaAmfOperationalStateT oper_state); diff --git a/osaf/services/saf/amf/amfd/include/proc.h b/osaf/services/saf/amf/amfd/include/proc.h --- a/osaf/services/saf/amf/amfd/include/proc.h +++ b/osaf/services/saf/amf/amfd/include/proc.h @@ -43,6 +43,7 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb void avd_su_si_assign_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); uint32_t avd_new_assgn_susi(AVD_CL_CB *cb, AVD_SU *su, AVD_SI *si, SaAmfHAStateT role, bool ckpt, AVD_SU_SI_REL **ret_ptr); +void 
su_try_repair(const AVD_SU *su); void avd_sg_app_node_su_inst_func(AVD_CL_CB *cb, AVD_AVND *avnd); uint32_t avd_sg_app_su_inst_func(AVD_CL_CB *cb, AVD_SG *sg); uint32_t avd_sg_su_oper_list_add(AVD_CL_CB *cb, AVD_SU *su, bool ckpt); @@ -58,6 +59,10 @@ uint32_t avd_sg_nway_si_assign(AVD_CL_CB /* The following are for N-way Active redundancy model */ AVD_SU *avd_sg_nacvred_su_chose_asgn(AVD_CL_CB *cb, AVD_SG *sg); +uint32_t avd_count_node_up(AVD_CL_CB *cb); +uint32_t avd_evt_queue_count(AVD_CL_CB *cb); +uint32_t avd_count_sync_node_size(AVD_CL_CB *cb); +void avd_process_state_info_queue(AVD_CL_CB *cb); void avd_node_up_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); void avd_reg_su_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); void avd_oper_req_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); @@ -72,6 +77,8 @@ void avd_mds_qsd_role_evh(AVD_CL_CB *cb, void avd_node_down_appl_susi_failover(AVD_CL_CB *cb, AVD_AVND *avnd); void avd_node_down_mw_susi_failover(AVD_CL_CB *cb, AVD_AVND *avnd); void avd_node_down_func(AVD_CL_CB *cb, AVD_AVND *avnd); +void avd_nd_sisu_state_info_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); +void avd_nd_compcsi_state_info_evh(AVD_CL_CB *cb, struct avd_evt_tag *evt); uint32_t avd_node_down(AVD_CL_CB *cb, SaClmNodeIdT node_id); AVD_AVND *avd_msg_sanity_chk(AVD_EVT *evt, SaClmNodeIdT node_id, AVSV_DND_MSG_TYPE msg_typ, uint32_t msg_id); diff --git a/osaf/services/saf/amf/amfd/include/sg.h b/osaf/services/saf/amf/amfd/include/sg.h --- a/osaf/services/saf/amf/amfd/include/sg.h +++ b/osaf/services/saf/amf/amfd/include/sg.h @@ -288,6 +288,15 @@ public: virtual void node_fail(AVD_CL_CB *cb, AVD_SU *su) = 0; /** + * Run the delayed su failover that happened during headless + * Called when cluster init finishes and hydra option is enabled + * + * @param cb + * @return + */ + virtual void adjust_delayed_failover(AVD_CL_CB *cb) = 0; + + /** * Handle SG realign * Assign SIs if needed. 
If any assigning is gets done it adds * the SUs to the operation list and sets the SG FSM state to SG realign. @@ -439,6 +448,7 @@ class SG_2N : public AVD_SG { public: ~SG_2N(); void node_fail(AVD_CL_CB*, AVD_SU*); + void adjust_delayed_failover(AVD_CL_CB *cb); uint32_t realign(AVD_CL_CB *cb, AVD_SG *sg); uint32_t si_assign(AVD_CL_CB *cb, AVD_SI *si); uint32_t si_admin_down(AVD_CL_CB *cb, AVD_SI *si); @@ -472,6 +482,7 @@ class SG_NORED : public AVD_SG { public: ~SG_NORED(); void node_fail(AVD_CL_CB*, AVD_SU*); + void adjust_delayed_failover(AVD_CL_CB *cb); uint32_t realign(AVD_CL_CB *cb, AVD_SG *sg); uint32_t si_assign(AVD_CL_CB *cb, AVD_SI *si); uint32_t si_admin_down(AVD_CL_CB *cb, AVD_SI *si); @@ -495,6 +506,7 @@ class SG_NPM : public AVD_SG { public: ~SG_NPM(); void node_fail(AVD_CL_CB*, AVD_SU*); + void adjust_delayed_failover(AVD_CL_CB *cb); uint32_t realign(AVD_CL_CB *cb, AVD_SG *sg); uint32_t si_assign(AVD_CL_CB *cb, AVD_SI *si); uint32_t si_admin_down(AVD_CL_CB *cb, AVD_SI *si); @@ -526,6 +538,7 @@ class SG_NACV : public AVD_SG { public: ~SG_NACV(); void node_fail(AVD_CL_CB*, AVD_SU*); + void adjust_delayed_failover(AVD_CL_CB *cb); uint32_t realign(AVD_CL_CB *cb, AVD_SG *sg); uint32_t si_assign(AVD_CL_CB *cb, AVD_SI *si); uint32_t si_admin_down(AVD_CL_CB *cb, AVD_SI *si); @@ -547,6 +560,7 @@ class SG_NWAY : public AVD_SG { public: ~SG_NWAY(); void node_fail(AVD_CL_CB*, AVD_SU*); + void adjust_delayed_failover(AVD_CL_CB *cb); uint32_t realign(AVD_CL_CB *cb, AVD_SG *sg); uint32_t si_assign(AVD_CL_CB *cb, AVD_SI *si); uint32_t si_admin_down(AVD_CL_CB *cb, AVD_SI *si); @@ -592,6 +606,6 @@ extern void avd_sg_adjust_config(AVD_SG extern uint32_t sg_instantiated_su_count(const AVD_SG *sg); extern bool sg_stable_after_lock_in_or_unlock_in(AVD_SG *sg); extern void process_su_si_response_for_ng(AVD_SU *su, SaAisErrorT res); - +extern void adjust_delayed_sidep(void); #endif diff --git a/osaf/services/saf/amf/amfd/include/si.h 
b/osaf/services/saf/amf/amfd/include/si.h --- a/osaf/services/saf/amf/amfd/include/si.h +++ b/osaf/services/saf/amf/amfd/include/si.h @@ -140,6 +140,7 @@ public: void arrange_dep_csi(AVD_CSI* csi); void add_csi_db(AVD_CSI* csi); bool is_sirank_valid(uint32_t newSiRank) const; + void update_alarm_state(bool alarm_state, bool sent_notification = true); void update_sirank(uint32_t newSiRank); bool si_dep_states_check(); const AVD_SIRANKEDSU *get_si_ranked_su(const std::string &su_name) const; diff --git a/osaf/services/saf/amf/amfd/include/susi.h b/osaf/services/saf/amf/amfd/include/susi.h --- a/osaf/services/saf/amf/amfd/include/susi.h +++ b/osaf/services/saf/amf/amfd/include/susi.h @@ -157,4 +157,7 @@ extern bool si_assignment_state_check(AV extern SaAmfHAStateT avd_su_state_determine(AVD_SU *su); extern AVD_SU_SI_REL *avd_siass_next_susi_to_quiesce(const AVD_SU_SI_REL *susi); extern bool avd_susi_quiesced_canbe_given(const AVD_SU_SI_REL *susi); +SaAisErrorT avd_susi_cleanup(void); +SaAisErrorT avd_susi_recreate(AVSV_N2D_ND_SISU_STATE_MSG_INFO*); + #endif diff --git a/osaf/services/saf/amf/amfd/include/timer.h b/osaf/services/saf/amf/amfd/include/timer.h --- a/osaf/services/saf/amf/amfd/include/timer.h +++ b/osaf/services/saf/amf/amfd/include/timer.h @@ -44,6 +44,7 @@ typedef enum avd_tmr_type { * SIs to application SU. 
*/ AVD_TMR_SI_DEP_TOL, /* SI_SI dependency tolerance timer */ + AVD_TMR_NODE_SYNC, /* node sync timer for all PLs from headless */ AVD_TMR_MAX } AVD_TMR_TYPE; diff --git a/osaf/services/saf/amf/amfd/include/util.h b/osaf/services/saf/amf/amfd/include/util.h --- a/osaf/services/saf/amf/amfd/include/util.h +++ b/osaf/services/saf/amf/amfd/include/util.h @@ -44,7 +44,7 @@ class AVD_SU; typedef enum { - AVD_D2D_CHANGE_ROLE_REQ = AVSV_DND_MSG_MAX, + AVD_D2D_CHANGE_ROLE_REQ = AVSV_D2D_CHANGE_ROLE_REQ, AVD_D2D_CHANGE_ROLE_RSP, AVD_D2D_MSG_MAX, } AVD_D2D_MSG_TYPE; diff --git a/osaf/services/saf/amf/amfd/main.cc b/osaf/services/saf/amf/amfd/main.cc --- a/osaf/services/saf/amf/amfd/main.cc +++ b/osaf/services/saf/amf/amfd/main.cc @@ -100,11 +100,14 @@ static const AVD_EVT_HDLR g_actv_list[AV invalid_evh, /* AVD_EVT_SHUTDOWN_APP_SU_MSG */ avd_ack_nack_evh, /* AVD_EVT_VERIFY_ACK_NACK_MSG */ avd_comp_validation_evh, /* AVD_EVT_COMP_VALIDATION_MSG */ + avd_nd_sisu_state_info_evh, /* AVD_EVT_ND_SISU_STATE_INFO_MSG */ + avd_nd_compcsi_state_info_evh, /* AVD_EVT_ND_COMPCSI_STATE_INFO_MSG */ /* active AvD timer events processing */ avd_tmr_snd_hb_evh, /* AVD_EVT_TMR_SND_HB */ avd_cluster_tmr_init_evh, /* AVD_EVT_TMR_CL_INIT */ avd_sidep_tol_tmr_evh, /* AVD_EVT_TMR_SI_DEP_TOL */ + avd_node_sync_tmr_evh, /* AVD_EVT_TMR_ALL_NODE_UP */ /* active AvD MDS events processing */ avd_mds_avd_up_evh, /* AVD_EVT_MDS_AVD_UP */ @@ -139,11 +142,15 @@ static const AVD_EVT_HDLR g_stndby_list[ standby_invalid_evh, /* AVD_EVT_SHUTDOWN_APP_SU_MSG */ standby_invalid_evh, /* AVD_EVT_VERIFY_ACK_NACK_MSG */ standby_invalid_evh, /* AVD_EVT_COMP_VALIDATION_MSG */ + standby_invalid_evh, /* AVD_EVT_ND_SUSI_STATE_INFO_MSG */ + standby_invalid_evh, /* AVD_EVT_ND_COMPCSI_STATE_INFO_MSG */ + /* standby AvD timer events processing */ avd_tmr_snd_hb_evh, /* AVD_EVT_TMR_SND_HB */ standby_invalid_evh, /* AVD_EVT_TMR_CL_INIT */ avd_sidep_tol_tmr_evh, /* AVD_EVT_TMR_SI_DEP_TOL */ + standby_invalid_evh, /* 
AVD_EVT_TMR_ALL_NODE_UP */ /* standby AvD MDS events processing */ avd_mds_avd_up_evh, /* AVD_EVT_MDS_AVD_UP */ @@ -177,11 +184,14 @@ static const AVD_EVT_HDLR g_quiesc_list[ invalid_evh, /* AVD_EVT_SHUTDOWN_APP_SU_MSG */ qsd_invalid_evh, /* AVD_EVT_VERIFY_ACK_NACK_MSG */ avd_comp_validation_evh, /* AVD_EVT_COMP_VALIDATION_MSG */ + qsd_invalid_evh, /* AVD_EVT_ND_SISU_STATE_INFO_MSG */ + qsd_invalid_evh, /* AVD_EVT_ND_COMPCSI_STATE_INFO_MSG */ /* active AvD timer events processing */ avd_tmr_snd_hb_evh, /* AVD_EVT_TMR_SND_HB */ qsd_ignore_evh, /* AVD_EVT_TMR_CL_INIT */ avd_sidep_tol_tmr_evh, /* AVD_EVT_TMR_SI_DEP_TOL */ + qsd_ignore_evh, /* AVD_EVT_TMR_ALL_NODE_UP */ /* active AvD MDS events processing */ avd_mds_avd_up_evh, /* AVD_EVT_MDS_AVD_UP */ @@ -532,6 +542,9 @@ static uint32_t initialize(void) cb->heartbeat_tmr.is_active = false; cb->heartbeat_tmr.type = AVD_TMR_SND_HB; cb->heartbeat_tmr_period = AVSV_DEF_HB_PERIOD; + cb->all_nodes_synced = false; + cb->node_sync_window_closed = false; + cb->scs_absence_max_duration = 0; if ((val = getenv("AVSV_HB_PERIOD")) != nullptr) { cb->heartbeat_tmr_period = strtoll(val, nullptr, 0); @@ -598,6 +611,13 @@ static uint32_t initialize(void) LOG_ER("avd_active_role_initialization FAILED"); goto done; } + + /* in a normal cluster start there will be no assignments object found so + * nothing happens. Used to cleanup cached RTAs after SCs recover after + * being headless. 
+ */ + avd_susi_cleanup(); + avd_compcsi_cleanup(); } else { rc = avd_standby_role_initialization(cb); @@ -780,6 +800,8 @@ static void main_loop(void) **************************************************************************/ static void process_event(AVD_CL_CB *cb_now, AVD_EVT *evt) { + TRACE_ENTER2("evt->rcv_evt %u", evt->rcv_evt); + /* check the HA state */ if (cb_now->avail_state_avd == SA_AMF_HA_ACTIVE) { /* if active call g_avd_actv_list functions */ @@ -824,6 +846,8 @@ static void process_event(AVD_CL_CB *cb_ cb_now->sync_required = true; delete evt; + + TRACE_LEAVE(); } /** diff --git a/osaf/services/saf/amf/amfd/mds.cc b/osaf/services/saf/amf/amfd/mds.cc --- a/osaf/services/saf/amf/amfd/mds.cc +++ b/osaf/services/saf/amf/amfd/mds.cc @@ -45,13 +45,13 @@ const MDS_CLIENT_MSG_FORMAT_VER avd_avnd_msg_fmt_map_table[] = { AVSV_AVD_AVND_MSG_FMT_VER_1, AVSV_AVD_AVND_MSG_FMT_VER_2, AVSV_AVD_AVND_MSG_FMT_VER_3, AVSV_AVD_AVND_MSG_FMT_VER_4, - AVSV_AVD_AVND_MSG_FMT_VER_5 + AVSV_AVD_AVND_MSG_FMT_VER_5, AVSV_AVD_AVND_MSG_FMT_VER_6 }; const MDS_CLIENT_MSG_FORMAT_VER avd_avd_msg_fmt_map_table[] = { AVD_AVD_MSG_FMT_VER_1, AVD_AVD_MSG_FMT_VER_2, AVD_AVD_MSG_FMT_VER_3, AVD_AVD_MSG_FMT_VER_4, - AVD_AVD_MSG_FMT_VER_5}; + AVD_AVD_MSG_FMT_VER_5, AVD_AVD_MSG_FMT_VER_6}; /* fwd decl */ diff --git a/osaf/services/saf/amf/amfd/ndfsm.cc b/osaf/services/saf/amf/amfd/ndfsm.cc --- a/osaf/services/saf/amf/amfd/ndfsm.cc +++ b/osaf/services/saf/amf/amfd/ndfsm.cc @@ -33,11 +33,221 @@ AmfDb<uint32_t, AVD_FAIL_OVER_NODE> *node_list_db = 0; /* SaClmNodeIdT index */ /***************************************************************************** - * Function: avd_node_up_func + * Function: avd_process_state_info_queue + * + * Purpose: This function will pull out the queue event and looking for sync + * info (sisu, compcsi) event to recover the SI/CSI assignment + * + * Input: cb - the AVD control block + * + * Returns: None. 
+ * + * NOTES: + * + * + **************************************************************************/ +void avd_process_state_info_queue(AVD_CL_CB *cb) +{ + uint32_t i; + const auto queue_size = cb->evt_queue.size(); + AVD_EVT_QUEUE *queue_evt = nullptr; + + TRACE_ENTER(); + + TRACE("queue_size before processing: %lu", queue_size); + + // recover assignments from state info + for(i=0 ; i<queue_size ; i++) { + queue_evt = cb->evt_queue.front(); + osafassert(queue_evt->evt); + cb->evt_queue.pop(); + + TRACE("rcv_evt: %u", queue_evt->evt->rcv_evt); + + if (queue_evt->evt->rcv_evt == AVD_EVT_ND_SISU_STATE_INFO_MSG || + queue_evt->evt->rcv_evt == AVD_EVT_ND_CSICOMP_STATE_INFO_MSG) { + + AVD_DND_MSG* n2d_msg = queue_evt->evt->info.avnd_msg; + + TRACE("msg_type: %u", n2d_msg->msg_type); + + switch(n2d_msg->msg_type) { + case AVSV_N2D_ND_SISU_STATE_INFO_MSG: + avd_susi_recreate(&n2d_msg->msg_info.n2d_nd_sisu_state_info); + break; + case AVSV_N2D_ND_CSICOMP_STATE_INFO_MSG: + avd_compcsi_recreate(&n2d_msg->msg_info.n2d_nd_csicomp_state_info); + break; + default: + break; + } + + avsv_dnd_msg_free(n2d_msg); + + delete queue_evt->evt; + delete queue_evt; + } else { + cb->evt_queue.push(queue_evt); + } + } + + // Once active amfd looks up the state info from queue, that means node sync + // finishes. Therefore, if the queue is empty, this active amfd is coming + // from a cluster restart, the alarm state should be reset. + // Otherwise, amfd is coming from SC recovery from headless, SI alarm state + // should be re-evalutated and raise the alarm in case it's still unassigned. 
+ if (queue_size == 0) { + for (std::map<std::string, AVD_SI*>::const_iterator it = si_db->begin(); + it != si_db->end(); it++) { + AVD_SI *si = it->second; + if (si->alarm_sent == true) { + si->update_alarm_state(false, false); + } + } + } + else { + for (std::map<std::string, AVD_SI*>::const_iterator it = si_db->begin(); + it != si_db->end(); it++) { + AVD_SI *si = it->second; + if (si->alarm_sent == false && + si->saAmfSIAssignmentState == SA_AMF_ASSIGNMENT_UNASSIGNED) { + si->update_alarm_state(true); + } + } + } + TRACE("queue_size after processing: %lu", cb->evt_queue.size()); + TRACE_LEAVE(); +} +/***************************************************************************** + * Function: avd_count_sync_node_size + * + * Purpose: Helper function count the maximum number of node in cluster + * to be synced from headless + * + * Input: cb - the AVD control block + * + * Returns: Number of nd + * + * NOTES: + * + **************************************************************************/ +uint32_t avd_count_sync_node_size(AVD_CL_CB *cb) +{ + uint32_t twon_ncs_su_count = 0; + uint32_t count = 0; + TRACE_ENTER(); + + for (std::map<std::string, AVD_AVND *>::const_iterator it = node_name_db->begin(); + it != node_name_db->end(); it++) { + AVD_AVND *avnd = it->second; + osafassert(avnd); + for (const auto& su :avnd->list_of_ncs_su) { + if (su->sg_of_su->sg_redundancy_model == SA_AMF_2N_REDUNDANCY_MODEL) { + twon_ncs_su_count++; + continue; + } + } + } + // cluster can have 1 SC or more SCs which hosting 2N Opensaf SU + // so twon_ncs_su_count at least is 1 + osafassert(twon_ncs_su_count > 0); + + if (twon_ncs_su_count == 1) { + // 1 SC, the rest of nodes could be in sync from headless + count = node_name_db->size() - 1; + } else { + // >=2 SCs, the rest of nodes could be in sync except active/standby SC + count = node_name_db->size() - 2; + } + + TRACE("sync node size:%d", count); + TRACE_LEAVE(); + return count; +} 
+/***************************************************************************** + * Function: avd_count_node_up + * + * Purpose: Helper function count number of nodes that sent node_up msg to + * director + * + * Input: cb - the AVD control block + * + * Returns: Number of node + * + * NOTES: + * + * + **************************************************************************/ +uint32_t avd_count_node_up(AVD_CL_CB *cb) +{ + uint32_t received_count = 0; + AVD_AVND *node = nullptr; + + TRACE_ENTER(); + + for (std::map<std::string, AVD_AVND *>::const_iterator it = node_name_db->begin(); + it != node_name_db->end(); it++) { + node = it->second; + if (node->node_up_msg_count > 0 + && node->node_info.nodeId != cb->node_id_avd + && node->node_info.nodeId != cb->node_id_avd_other) + ++received_count; + } + TRACE("Number of node director(s) that director received node_up msg:%u", + received_count); + + TRACE_LEAVE(); + return received_count; +} + +/***************************************************************************** + * Function: record_node_up_msg_info + * + * Purpose: Update the rcv_msg_id and adest which are sent from amfnd + * + * Input: avnd - ptr to the appropriate amfnd + * n2d_msg - node_up msg sent from amfnd + * + * Returns: None + * + * NOTES: + * + * + **************************************************************************/ +void record_node_up_msg_info(AVD_AVND *avnd, const AVD_DND_MSG *n2d_msg) +{ + osafassert(avnd != nullptr); + + avnd->adest = n2d_msg->msg_info.n2d_node_up.adest_address; + + if (n2d_msg->msg_info.n2d_node_up.msg_id >= avnd->rcv_msg_id) { + LOG_NO("Received node_up from %x: msg_id %u", + n2d_msg->msg_info.n2d_node_up.node_id, + n2d_msg->msg_info.n2d_node_up.msg_id); + + avnd->rcv_msg_id = n2d_msg->msg_info.n2d_node_up.msg_id; + } else { + // This is expected after recovering from a headless state. + // NODE_UPs will not be processed until all PLs are up. 
+ // In the mean time, we may get other messages from amfnd + // that pushes up rcv_msg_id + LOG_NO("NODE UP from %x: msg_id out of order. rcv_msg_id %u, msg_id %u", + n2d_msg->msg_info.n2d_node_up.node_id, + avnd->rcv_msg_id, + n2d_msg->msg_info.n2d_node_up.msg_id); + } +} + + + +/***************************************************************************** + * Function: avd_node_up_evh * * Purpose: This function is the handler for node up event indicating * the arrival of the node_up message. Based on the state machine either - * It will ignore the message or send all the reg messages to the node. + * It will ignore the message or send all the reg messages to the node + * or order the node reboot if the node_up message arrives after the node + * sync window has closed. * * Input: cb - the AVD control block * evt - The event information. @@ -54,8 +264,61 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_ AVD_AVND *avnd = nullptr; AVD_DND_MSG *n2d_msg = evt->info.avnd_msg; uint32_t rc = NCSCC_RC_SUCCESS; + uint32_t sync_nd_size = avd_count_sync_node_size(cb); + bool act_nd; - TRACE_ENTER2("from %x", n2d_msg->msg_info.n2d_node_up.node_id); + TRACE_ENTER2("from %x, %s", n2d_msg->msg_info.n2d_node_up.node_id, + n2d_msg->msg_info.n2d_node_up.node_name.value); + + act_nd = n2d_msg->msg_info.n2d_node_up.node_id == cb->node_id_avd; + if (cb->scs_absence_max_duration > 0 && + cb->all_nodes_synced == false && + cb->node_sync_window_closed == false) { + avnd = avd_node_get(&n2d_msg->msg_info.n2d_node_up.node_name); + if (avnd == nullptr) { + LOG_ER("Invalid node_name. 
Check node_id"); + + // perhaps this is a node_up from an old version of amfnd without headless support + // let's check if the node_id is valid + if ((avnd = avd_node_find_nodeid(n2d_msg->msg_info.n2d_node_up.node_id)) == nullptr) { + LOG_ER("invalid node ID (%x)", n2d_msg->msg_info.n2d_node_up.node_id); + goto done; + } + } + uint32_t rc_node_up; + avnd->node_up_msg_count++; + rc_node_up = avd_count_node_up(cb); + if (rc_node_up == sync_nd_size) { + if (cb->node_sync_tmr.is_active) { + avd_stop_tmr(cb, &cb->node_sync_tmr); + TRACE("stop NodeSync timer"); + } + cb->all_nodes_synced = true; + LOG_NO("Received node_up_msg from all nodes"); + } else { + if (avnd->node_up_msg_count == 1 && + (act_nd || n2d_msg->msg_info.n2d_node_up.leds_set)) { + + // start (or restart) timer if this is the first message + // from amfnd-active-SC or amfnd-green-leds-PL + cb->node_sync_tmr.type = AVD_TMR_NODE_SYNC; + avd_start_tmr(cb, &(cb->node_sync_tmr), AVSV_DEF_NODE_SYNC_PERIOD); + + TRACE("Received node_up_msg from node:%s. Start/Restart " + " NodeSync timer waiting for remaining (%d) node(s)", + n2d_msg->msg_info.n2d_node_up.node_name.value, + sync_nd_size - rc_node_up); + goto done; + } + if (cb->node_sync_tmr.is_active == true) { + if (n2d_msg->msg_info.n2d_node_up.leds_set == false) { + TRACE("NodeSync timer is active, ignore this node_up msg (nodeid:%x)", + n2d_msg->msg_info.n2d_node_up.node_id); + goto done; + } + } + } + } /* Cannot use avd_msg_sanity_chk here since this is a special case */ if ((avnd = avd_node_find_nodeid(n2d_msg->msg_info.n2d_node_up.node_id)) == nullptr) { @@ -63,6 +326,9 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_ goto done; } + /* Retrieve the information from the message */ + record_node_up_msg_info(avnd, n2d_msg); + /* Check the AvD FSM state process node up only if AvD is in init done or * APP init state for all nodes except the primary system controller * whose node up is accepted in config done state. 
@@ -73,16 +339,11 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_ goto done; } - if (avnd->node_state != AVD_AVND_STATE_ABSENT) { - LOG_WA("invalid node state %u for node %x", - avnd->node_state, n2d_msg->msg_info.n2d_node_up.node_id); - goto done; + if ((n2d_msg->msg_info.n2d_node_up.node_id == cb->node_id_avd) && (cb->init_state < AVD_INIT_DONE)) { + // node up from local AVND + avd_process_state_info_queue(cb); } - /* Retrive the information from the message */ - avnd->adest = n2d_msg->msg_info.n2d_node_up.adest_address; - avnd->rcv_msg_id = n2d_msg->msg_info.n2d_node_up.msg_id; - if (avnd->node_info.member != SA_TRUE) { LOG_WA("Not a Cluster Member dropping the msg"); goto done; @@ -108,7 +369,9 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_ } /* send the Ack message to the node. */ - if (avd_snd_node_ack_msg(cb, avnd, avnd->rcv_msg_id) != NCSCC_RC_SUCCESS) { + // note: it's important to ack the msg_id received in n2d_msg, rather than avnd->rcv_msg_id. + // They will not always be same when headless option is enabled. 
+ if (avd_snd_node_ack_msg(cb, avnd, n2d_msg->msg_info.n2d_node_up.msg_id) != NCSCC_RC_SUCCESS) { /* log error that the director is not able to send the message */ LOG_ER("%s:%u: %u", __FILE__, __LINE__, avnd->node_info.nodeId); @@ -119,6 +382,39 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_ goto done; } + if (n2d_msg->msg_info.n2d_node_up.leds_set == true) { + TRACE("node %x is already up", avnd->node_info.nodeId); + + if (cb->node_sync_window_closed == true && avnd->node_up_msg_count == 0) { + LOG_WA("Received new node_up_msg from node:%s after node sync window, " + "sending node reboot order to target node", + n2d_msg->msg_info.n2d_node_up.node_name.value); + avd_d2n_reboot_snd(avnd); + goto done; + } else if (avnd->reboot) { + // delayed node failfast + avd_d2n_reboot_snd(avnd); + avnd->reboot = false; + goto done; + } + else { + // this node is already up + avd_node_state_set(avnd, AVD_AVND_STATE_PRESENT); + avd_node_oper_state_set(avnd, SA_AMF_OPERATIONAL_ENABLED); + + // Update readiness state of all SUs which are waiting for node + // oper state + for (const auto& su :avnd->list_of_ncs_su) { + su->set_readiness_state(SA_AMF_READINESS_IN_SERVICE); + } + for (const auto& su :avnd->list_of_su) { + if (su->is_in_service()) + su->set_readiness_state(SA_AMF_READINESS_IN_SERVICE); + } + goto node_joined; + } + } + /* Send role change to this controller AvND */ if (avnd->node_info.nodeId == cb->node_id_avd) { /* Here obviously the role will be ACT. */ @@ -153,10 +449,11 @@ void avd_node_up_evh(AVD_CL_CB *cb, AVD_ goto done; } + avd_node_state_set(avnd, AVD_AVND_STATE_NO_CONFIG); + +node_joined: LOG_NO("Node '%s' joined the cluster", avnd->node_name); - avd_node_state_set(avnd, AVD_AVND_STATE_NO_CONFIG); - /* checkpoint the node. 
*/ m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(cb, avnd, AVSV_CKPT_AVD_NODE_CONFIG); diff --git a/osaf/services/saf/amf/amfd/ndmsg.cc b/osaf/services/saf/amf/amfd/ndmsg.cc --- a/osaf/services/saf/amf/amfd/ndmsg.cc +++ b/osaf/services/saf/amf/amfd/ndmsg.cc @@ -357,7 +357,23 @@ uint32_t avd_n2d_msg_rcv(AVD_DND_MSG *rc cb->peer_msg_fmt_ver = msg_fmt_ver; } - evt->rcv_evt = static_cast<AVD_EVT_TYPE>((rcv_msg->msg_type - AVSV_N2D_NODE_UP_MSG) + AVD_EVT_NODE_UP_MSG); + switch (rcv_msg->msg_type) { + case AVSV_N2D_ND_SISU_STATE_INFO_MSG: + // 'offset lookup' can't be used for this + evt->rcv_evt = AVD_EVT_ND_SISU_STATE_INFO_MSG; + break; + case AVSV_N2D_ND_CSICOMP_STATE_INFO_MSG: + // 'offset lookup' can't be used for this + evt->rcv_evt = AVD_EVT_ND_CSICOMP_STATE_INFO_MSG; + break; + default: + evt->rcv_evt = static_cast<AVD_EVT_TYPE>((rcv_msg->msg_type - AVSV_N2D_NODE_UP_MSG) + AVD_EVT_NODE_UP_MSG); + break; + } + + osafassert((AVD_EVT_INVALID < evt->rcv_evt) + && (evt->rcv_evt < AVD_EVT_MAX)); + evt->info.avnd_msg = rcv_msg; if (m_NCS_IPC_SEND(&cb->avd_mbx, evt, NCS_IPC_PRIORITY_HIGH) != NCSCC_RC_SUCCESS) { diff --git a/osaf/services/saf/amf/amfd/ndproc.cc b/osaf/services/saf/amf/amfd/ndproc.cc --- a/osaf/services/saf/amf/amfd/ndproc.cc +++ b/osaf/services/saf/amf/amfd/ndproc.cc @@ -68,8 +68,9 @@ AVD_AVND *avd_msg_sanity_chk(AVD_EVT* ev } if ((node->rcv_msg_id + 1) != msg_id) { - LOG_WA("%s: invalid msg id %u, from %x should be %u", - __FUNCTION__, msg_id, node_id, node->rcv_msg_id + 1); + LOG_WA("%s: invalid msg id %u, msg type %u, from %x should be %u", + __FUNCTION__, msg_id, evt->info.avnd_msg->msg_type, + node_id, node->rcv_msg_id + 1); return nullptr; } @@ -278,6 +279,94 @@ void avd_oper_req_evh(AVD_CL_CB *cb, AVD TRACE_LEAVE(); } +/***************************************************************************** + * Function: avd_nd_sisu_state_info_evh + * + * Purpose: This function is the handler for the sync sisu_state_info event. 
+ * No process on the event, push in queue or ignore it if the node sync window + * has closed. + * + * Input: cb - the AVD control block + * evt - The event information. + * + * Returns: None. + * + * NOTES: + * + * + **************************************************************************/ + +void avd_nd_sisu_state_info_evh(AVD_CL_CB *cb, AVD_EVT *evt) +{ + AVD_DND_MSG *n2d_msg = evt->info.avnd_msg; + AVD_EVT_QUEUE* state_info_evt; + + TRACE_ENTER(); + LOG_NO("Receive message with event type:%u, msg_type:%u, from node:%x, msg_id:%u", + evt->rcv_evt, + evt->info.avnd_msg->msg_type, + evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.node_id, + evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.msg_id); + + if (cb->node_sync_window_closed == false) { + state_info_evt = new AVD_EVT_QUEUE(); + state_info_evt->evt = new AVD_EVT(); + memcpy(state_info_evt->evt, evt, sizeof(AVD_EVT)); + state_info_evt->evt->info.avnd_msg = n2d_msg; + cb->evt_queue.push(state_info_evt); + } + else { + LOG_WA("Ignore this sisu_state_info message since node sync window has closed"); + avsv_dnd_msg_free(n2d_msg); + } + + TRACE_LEAVE(); +} + +/***************************************************************************** + * Function: avd_nd_compcsi_state_info_evh + * + * Purpose: This function is the handler for the sync compcsi_state_info event. + * No process on the event, push in queue or ignore it if the node sync window + * has closed. + * + * Input: cb - the AVD control block + * evt - The event information. + * + * Returns: None. 
+ * + * NOTES: + * + * + **************************************************************************/ + +void avd_nd_compcsi_state_info_evh(AVD_CL_CB *cb, AVD_EVT *evt) +{ + AVD_DND_MSG *n2d_msg = evt->info.avnd_msg; + AVD_EVT_QUEUE* state_info_evt; + + TRACE_ENTER(); + LOG_NO("Receive message with event type:%u, msg_type:%u, from node:%x, msg_id:%u", + evt->rcv_evt, + evt->info.avnd_msg->msg_type, + evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.node_id, + evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.msg_id); + + if (cb->node_sync_window_closed == false) { + state_info_evt = new AVD_EVT_QUEUE(); + state_info_evt->evt = new AVD_EVT(); + memcpy(state_info_evt->evt, evt, sizeof(AVD_EVT)); + state_info_evt->evt->info.avnd_msg = n2d_msg; + cb->evt_queue.push(state_info_evt); + } + else { + LOG_WA("Ignore this compcsi_state_info message since node sync window has closed"); + avsv_dnd_msg_free(n2d_msg); + } + + TRACE_LEAVE(); +} + /** * handler to report error response to imm for any pending admin operation on comp * @@ -721,7 +810,10 @@ void avd_data_update_req_evh(AVD_CL_CB * } if ((node->node_state == AVD_AVND_STATE_ABSENT) || (node->node_state == AVD_AVND_STATE_GO_DOWN)) { - LOG_ER("%s: invalid node state %u", __FUNCTION__, node->node_state); + LOG_ER("%s: node %x, receive msg_id(%u) in invalid node state %u", + __FUNCTION__, node->node_info.nodeId, + n2d_msg->msg_info.n2d_data_req.msg_id, node->node_state); + goto done; } @@ -883,6 +975,11 @@ void avd_data_update_req_evh(AVD_CL_CB * l_val = ntohl(*((uint32_t *)&n2d_msg->msg_info.n2d_data_req.param_info.value[0])); su->set_oper_state(l_val); } + + if (su->is_in_service() == true) { + su->set_readiness_state(SA_AMF_READINESS_IN_SERVICE); + } + break; case saAmfSUPresenceState_ID: TRACE("su pres state"); diff --git a/osaf/services/saf/amf/amfd/node.cc b/osaf/services/saf/amf/amfd/node.cc --- a/osaf/services/saf/amf/amfd/node.cc +++ b/osaf/services/saf/amf/amfd/node.cc @@ -57,6 +57,7 @@ uint32_t 
avd_node_add_nodeid(AVD_AVND *n if ((node_id_db->find(node->node_info.nodeId) == nullptr) && (node->node_info.nodeId != 0)) { + TRACE("added node %d", node->node_info.nodeId); rc = node_id_db->insert(node->node_info.nodeId, node); osafassert(rc == NCSCC_RC_SUCCESS); } @@ -161,11 +162,11 @@ AVD_AVND::~AVD_AVND() { // AVD_AVND *avd_node_new(const SaNameT *dn) { - AVD_AVND *node; - - node = new AVD_AVND(dn); - - return node; + AVD_AVND *node; + node = new AVD_AVND(dn); + node->node_up_msg_count = 0; + node->reboot = false; + return node; } void avd_node_delete(AVD_AVND *node) @@ -409,8 +410,10 @@ void avd_node_state_set(AVD_AVND *node, osafassert(node_state <= AVD_AVND_STATE_NCS_INIT); TRACE_ENTER2("'%s' %s => %s", node->name.value, node_state_name[node->node_state], node_state_name[node_state]); - node->node_state = node_state; - m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVND_NODE_STATE); + if (node->node_state != node_state) { + node->node_state = node_state; + m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVND_NODE_STATE); + } TRACE_LEAVE(); } diff --git a/osaf/services/saf/amf/amfd/sg.cc b/osaf/services/saf/amf/amfd/sg.cc --- a/osaf/services/saf/amf/amfd/sg.cc +++ b/osaf/services/saf/amf/amfd/sg.cc @@ -2000,6 +2000,63 @@ done: return rc; } +/** + * @Brief Check if SI Dependency relations between SIs are still valid + * as cluster has just being came from headless. The dependent SIs + * which have any unassigned sponsor SI will remove the assignments. + * + * @param None + * @Return None +*/ +void adjust_delayed_sidep(void) +{ + AVD_SU_SI_REL *curr_susi; + bool rescan = true; + TRACE_ENTER(); + + // Searching for the dependent SIs which's sponsored SI(s) + // had removed assignment in headless. 
+ while (rescan) { + rescan = false; + for (std::map<std::string, AVD_SG*>::const_iterator it = sg_db->begin(); + it != sg_db->end(); it++) { + AVD_SG *i_sg = it->second; + if (i_sg->list_of_su.empty() || (i_sg->sg_ncs_spec == true)) { + continue; + } + for (const auto& si : i_sg->list_of_si) { + if (si->list_of_sisu && + si->saAmfSIAssignmentState != SA_AMF_ASSIGNMENT_UNASSIGNED) { + bool any_unassign_spsi = false; + AVD_SPONS_SI_NODE *spsi_node = si->spons_si_list; + for(; spsi_node != nullptr && !any_unassign_spsi; + spsi_node = spsi_node->next) { + // if any of sponsored si has no assignment + // @si will remove its all assignments + if ((spsi_node->si->curr_standby_assignments() + + spsi_node->si->curr_active_assignments()) == 0) { + // remove all sisu of @si + for(curr_susi = si->list_of_sisu; curr_susi; + curr_susi = curr_susi->si_next) { + if (curr_susi->fsm == AVD_SU_SI_STATE_ASGND) { + LOG_NO("Remove '%s' from '%s' due to sponsor SI %s is unassigned", + curr_susi->si->name.value, curr_susi->su->name.value, + spsi_node->si->name.value); + avd_susi_del_send(curr_susi); + any_unassign_spsi = true; + // Need to rescan due to recursive dependencies + rescan = true; + } + } + } + } + } + } + } + } + + TRACE_LEAVE(); +} AVD_SU* AVD_SG::first_su() { diff --git a/osaf/services/saf/amf/amfd/sg_2n_fsm.cc b/osaf/services/saf/amf/amfd/sg_2n_fsm.cc --- a/osaf/services/saf/amf/amfd/sg_2n_fsm.cc +++ b/osaf/services/saf/amf/amfd/sg_2n_fsm.cc @@ -3534,6 +3534,146 @@ done: TRACE_LEAVE(); } +void SG_2N::adjust_delayed_failover(AVD_CL_CB *cb) { + AVD_SU_SI_REL *curr_susi; + TRACE_ENTER(); + + // Check AdminState of node/sg/su whether is LOCKED or SHUTTING_DOWN + // which states will cause removal of assignment + for (const auto& su : list_of_su) { + SaAmfHAStateT su_ha_state; + TRACE("Check AdminState of SU/SG/Node, SU:'%s', saAmfSUAdminState:%u, " + "saAmfSGAdminState:%u, saAmfNodeAdminState:%u, " + "saAmfSUNumCurrActiveSIs:%u, saAmfSUNumCurrStandbySIs:%u", + su->name.value, 
+ su->saAmfSUAdminState, + su->sg_of_su->saAmfSGAdminState, + su->su_on_node->saAmfNodeAdminState, + su->saAmfSUNumCurrActiveSIs, + su->saAmfSUNumCurrStandbySIs); + + if (su->saAmfSUAdminState == SA_AMF_ADMIN_LOCKED || + su->sg_of_su->saAmfSGAdminState == SA_AMF_ADMIN_LOCKED || + su->su_on_node->saAmfNodeAdminState == SA_AMF_ADMIN_LOCKED || + su->saAmfSUAdminState == SA_AMF_ADMIN_SHUTTING_DOWN || + su->sg_of_su->saAmfSGAdminState == SA_AMF_ADMIN_SHUTTING_DOWN || + su->su_on_node->saAmfNodeAdminState == SA_AMF_ADMIN_SHUTTING_DOWN) { + + if (su->list_of_susi) { + su_ha_state = avd_su_state_determine(su); + if (su_ha_state == SA_AMF_HA_QUIESCED || + su_ha_state == SA_AMF_HA_STANDBY || + su_ha_state == SA_AMF_HA_QUIESCING) { + // remove all susi belong to this su + avd_sg_su_si_del_snd(cb, su); + } else if (su_ha_state == SA_AMF_HA_ACTIVE) { + // quiesced this su + avd_sg_su_si_mod_snd(cb, su, SA_AMF_HA_QUIESCED); + } + avd_sg_su_oper_list_add(cb, su, false); + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } + // directly move from SHUTTING_DOWN to LOCKED + if (su->saAmfSUAdminState == SA_AMF_ADMIN_SHUTTING_DOWN) + su->set_admin_state(SA_AMF_ADMIN_LOCKED); + if (su->sg_of_su->saAmfSGAdminState == SA_AMF_ADMIN_SHUTTING_DOWN) + avd_sg_admin_state_set(su->sg_of_su, SA_AMF_ADMIN_LOCKED); + if (su->su_on_node->saAmfNodeAdminState == SA_AMF_ADMIN_SHUTTING_DOWN) + node_admin_state_set(su->su_on_node, SA_AMF_ADMIN_LOCKED); + } + } + + // Check AdminState of si whether is LOCKED or SHUTTING_DOWN + for (const auto& si : list_of_si) { + + AVD_SU_SI_REL *curr_active_susi = nullptr; + AVD_SU_SI_REL *curr_quiesce_susi = nullptr; + AVD_SU_SI_REL *curr_standby_susi = nullptr; + + TRACE("Check SI:'%s', saAmfSIAdminState:%u, saAmfSINumCurrActiveAssignments:%u, " + "saAmfSINumCurrStandbyAssignments:%u", + si->name.value, + si->saAmfSIAdminState, + si->saAmfSINumCurrActiveAssignments, + si->saAmfSINumCurrStandbyAssignments); + + for (curr_susi = si->list_of_sisu; curr_susi; + curr_susi = 
curr_susi->si_next) { + + TRACE("Check SUSI:'%s,%s', HaState:%u", curr_susi->su->name.value, + curr_susi->si->name.value, + curr_susi->state); + + if (si->saAmfSIAdminState == SA_AMF_ADMIN_SHUTTING_DOWN || + si->saAmfSIAdminState == SA_AMF_ADMIN_LOCKED) { + // only process assigned susi, ignore the others due to + // being modified or unassigned, ... + if (curr_susi->fsm == AVD_SU_SI_STATE_ASGND) { + if (curr_susi->state == SA_AMF_HA_STANDBY || + curr_susi->state == SA_AMF_HA_QUIESCED || + curr_susi->state == SA_AMF_HA_QUIESCING) { + // remove one susi + avd_susi_del_send(curr_susi); + } else if (curr_susi->state == SA_AMF_HA_ACTIVE) { + // quiesced one susi + avd_susi_mod_send(curr_susi, SA_AMF_HA_QUIESCED); + } + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + avd_sg_su_oper_list_add(cb, curr_susi->su, false); + } + // directly move from SHUTTING_DOWN to LOCKED + if (si->saAmfSIAdminState == SA_AMF_ADMIN_SHUTTING_DOWN) + si->set_admin_state(SA_AMF_ADMIN_LOCKED); + } + if (curr_susi->fsm != AVD_SU_SI_STATE_ASGND) + continue; + + if (curr_susi->state == SA_AMF_HA_ACTIVE) + curr_active_susi = curr_susi; + else if (curr_susi->state == SA_AMF_HA_STANDBY) + curr_standby_susi = curr_susi; + else + curr_quiesce_susi = curr_susi; + } + + if (!curr_active_susi && !curr_quiesce_susi && !curr_standby_susi) + continue; + + // at this point in time, there could be inappropriate susi in term of + // HA state due to uncompleted failover/si-swap around the time cluster + // was going into headless. + // Adjust the HA state if neccessary. 
+ if (curr_active_susi) { + if (!curr_standby_susi && curr_quiesce_susi) { + avd_sg_su_si_mod_snd(cb, curr_quiesce_susi->su, SA_AMF_HA_STANDBY); + avd_sg_su_oper_list_add(cb, curr_quiesce_susi->su, false); + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } + } else { + if (curr_standby_susi && curr_quiesce_susi) { + avd_sg_su_si_mod_snd(cb, curr_quiesce_susi->su, SA_AMF_HA_STANDBY); + avd_sg_su_oper_list_add(cb, curr_quiesce_susi->su, false); + + avd_sg_su_si_mod_snd(cb, curr_standby_susi->su, SA_AMF_HA_ACTIVE); + avd_sg_su_oper_list_add(cb, curr_standby_susi->su, false); + + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } else if (!curr_standby_susi && curr_quiesce_susi) { + avd_sg_su_si_mod_snd(cb, curr_quiesce_susi->su, SA_AMF_HA_ACTIVE); + avd_sg_su_oper_list_add(cb, curr_quiesce_susi->su, false); + + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } else if (curr_standby_susi && !curr_quiesce_susi) { + avd_sg_su_si_mod_snd(cb, curr_standby_susi->su, SA_AMF_HA_ACTIVE); + avd_sg_su_oper_list_add(cb, curr_standby_susi->su, false); + + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } + } + } + TRACE_LEAVE(); +} + uint32_t SG_2N::su_admin_down(AVD_CL_CB *cb, AVD_SU *su, AVD_AVND *avnd) { uint32_t rc = NCSCC_RC_FAILURE; diff --git a/osaf/services/saf/amf/amfd/sg_nored_fsm.cc b/osaf/services/saf/amf/amfd/sg_nored_fsm.cc --- a/osaf/services/saf/amf/amfd/sg_nored_fsm.cc +++ b/osaf/services/saf/amf/amfd/sg_nored_fsm.cc @@ -949,6 +949,12 @@ void SG_NORED::node_fail(AVD_CL_CB *cb, TRACE_LEAVE(); } +void SG_NORED::adjust_delayed_failover(AVD_CL_CB *cb) { + TRACE_ENTER(); + TRACE("Currently not applicable"); + TRACE_LEAVE(); +} + uint32_t SG_NORED::su_admin_down(AVD_CL_CB *cb, AVD_SU *su, AVD_AVND *avnd) { TRACE_ENTER2("%u", su->sg_of_su->sg_fsm_state); diff --git a/osaf/services/saf/amf/amfd/sg_npm_fsm.cc b/osaf/services/saf/amf/amfd/sg_npm_fsm.cc --- a/osaf/services/saf/amf/amfd/sg_npm_fsm.cc +++ b/osaf/services/saf/amf/amfd/sg_npm_fsm.cc @@ -4200,6 +4200,30 @@ void SG_NPM::node_fail(AVD_CL_CB *cb, 
AV return; } +void SG_NPM::adjust_delayed_failover(AVD_CL_CB *cb) { + uint32_t stb_si_count = 0; + uint32_t act_si_count = 0; + TRACE_ENTER(); + + for (const auto& su : list_of_su) { + stb_si_count += su->saAmfSUNumCurrStandbySIs; + act_si_count += su->saAmfSUNumCurrActiveSIs; + } + + if (stb_si_count > 0 && act_si_count == 0) { + // TODO (minhchau): smoothly transfer the assignment from STANDBY to ACTIVE + // as well as satisfying the configurations by: + // saAmfSGNumPrefAssignedSUs, saAmfSGMaxActiveSIsperSU, + // saAmfSGNumPrefActiveSUs, saAmfSIPrefActiveAssignments + for (const auto& si : list_of_si) { + si->delete_assignments(cb); + } + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } + + TRACE_LEAVE(); +} + uint32_t SG_NPM::su_admin_down(AVD_CL_CB *cb, AVD_SU *su, AVD_AVND *avnd) { uint32_t rc; diff --git a/osaf/services/saf/amf/amfd/sg_nway_fsm.cc b/osaf/services/saf/amf/amfd/sg_nway_fsm.cc --- a/osaf/services/saf/amf/amfd/sg_nway_fsm.cc +++ b/osaf/services/saf/amf/amfd/sg_nway_fsm.cc @@ -521,6 +521,30 @@ done: TRACE_LEAVE(); } +void SG_NWAY::adjust_delayed_failover(AVD_CL_CB *cb) { + uint32_t stb_si_count = 0; + uint32_t act_si_count = 0; + TRACE_ENTER(); + + for (const auto& su : list_of_su) { + stb_si_count += su->saAmfSUNumCurrStandbySIs; + act_si_count += su->saAmfSUNumCurrActiveSIs; + } + + if (stb_si_count > 0 && act_si_count == 0) { + // TODO(minhchau): smoothly transfer the assignment from STANDBY to ACTIVE + // as well as satisfying the configurations by: + // saAmfSGNumPrefAssignedSUs, saAmfSGMaxActiveSIsperSU, + // saAmfSGNumPrefActiveSUs, saAmfSIPrefActiveAssignments + for (const auto& si : list_of_si) { + si->delete_assignments(cb); + } + set_fsm_state(AVD_SG_FSM_SG_REALIGN); + } + + TRACE_LEAVE(); +} + uint32_t SG_NWAY::su_admin_down(AVD_CL_CB *cb, AVD_SU *su, AVD_AVND *avnd) { AVD_SU_SI_REL *curr_susi = 0; SaAmfHAStateT state; diff --git a/osaf/services/saf/amf/amfd/sg_nwayact_fsm.cc b/osaf/services/saf/amf/amfd/sg_nwayact_fsm.cc --- 
a/osaf/services/saf/amf/amfd/sg_nwayact_fsm.cc +++ b/osaf/services/saf/amf/amfd/sg_nwayact_fsm.cc @@ -1507,6 +1507,12 @@ void SG_NACV::node_fail(AVD_CL_CB *cb, A return; } +void SG_NACV::adjust_delayed_failover(AVD_CL_CB *cb) { + TRACE_ENTER(); + TRACE("Currently not applicable"); + TRACE_LEAVE(); +} + uint32_t SG_NACV::su_admin_down(AVD_CL_CB *cb, AVD_SU *su, AVD_AVND *avnd) { TRACE_ENTER2("%u", su->sg_of_su->sg_fsm_state); diff --git a/osaf/services/saf/amf/amfd/sgproc.cc b/osaf/services/saf/amf/amfd/sgproc.cc --- a/osaf/services/saf/amf/amfd/sgproc.cc +++ b/osaf/services/saf/amf/amfd/sgproc.cc @@ -278,7 +278,7 @@ done: * @param[in] su * **/ -static void su_try_repair(const AVD_SU *su) +void su_try_repair(const AVD_SU *su) { TRACE_ENTER2("Repair for SU:'%s'", su->name.value); @@ -288,11 +288,11 @@ static void su_try_repair(const AVD_SU * (su->saAmfSUPresenceState != SA_AMF_PRESENCE_TERMINATION_FAILED)) { saflog(LOG_NOTICE, amfSvcUsrName, "Ordering Auto repair of '%s' as sufailover repair action", - su->sg_of_su->name.value); + su->name.value); avd_admin_op_msg_snd(&su->name, AVSV_SA_AMF_SU, static_cast<SaAmfAdminOperationIdT>(SA_AMF_ADMIN_REPAIRED), su->su_on_node); } else { - saflog(LOG_NOTICE, amfSvcUsrName, "Autorepair not done for '%s'", su->sg_of_su->name.value); + saflog(LOG_NOTICE, amfSvcUsrName, "Autorepair not done for '%s'", su->name.value); } TRACE_LEAVE(); @@ -480,8 +480,11 @@ static uint32_t sg_su_failover_func(AVD_ } } + TRACE("init_state %u", avd_cb->init_state); + /*If the AvD is in AVD_APP_STATE then reassign all the SUSI assignments for this SU */ - if (avd_cb->init_state == AVD_APP_STATE) { + if (avd_cb->init_state == AVD_APP_STATE || + avd_cb->scs_absence_max_duration > 0) { /* Unlike active, quiesced and standby HA states, assignment counters in quiescing HA state are updated when AMFD receives assignment response from AMFND. 
During sufailover amfd will not receive @@ -489,6 +492,9 @@ static uint32_t sg_su_failover_func(AVD_ So if any SU is under going modify operation then update assignment counters for those SUSIs which are in quiescing state in the SU. */ + TRACE("Reassign SUSI assignments for %s, init_state %u", + su->name.value, avd_cb->init_state); + for (AVD_SU_SI_REL *susi = su->list_of_susi; susi; susi = susi->su_next) { if ((susi->fsm == AVD_SU_SI_STATE_MODIFY) && (susi->state == SA_AMF_HA_QUIESCING)) { @@ -738,7 +744,9 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb * disabled. */ - if (cb->init_state == AVD_INIT_DONE) { + TRACE("init_state %u", cb->init_state); + + if (cb->init_state == AVD_INIT_DONE && cb->scs_absence_max_duration == 0) { su->set_oper_state(SA_AMF_OPERATIONAL_DISABLED); su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE); if (n2d_msg->msg_info.n2d_opr_state.node_oper_state == SA_AMF_OPERATIONAL_DISABLED) { @@ -752,7 +760,11 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb } } /* if (n2d_msg->msg_info.n2d_opr_state.node_oper_state == SA_AMF_OPERATIONAL_DISABLED) */ } /* if(cb->init_state == AVD_INIT_DONE) */ - else if (cb->init_state == AVD_APP_STATE) { + else if (cb->init_state == AVD_APP_STATE || + (cb->init_state == AVD_INIT_DONE && cb->scs_absence_max_duration > 0)) { + + TRACE("Setting SU to disabled in init_state %u", cb->init_state); + su->set_oper_state(SA_AMF_OPERATIONAL_DISABLED); su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE); if (n2d_msg->msg_info.n2d_opr_state.node_oper_state == SA_AMF_OPERATIONAL_DISABLED) { @@ -873,6 +885,20 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb } } } else { /* if(su->sg_of_su->sg_ncs_spec == true) */ + if (avd_cb->scs_absence_max_duration > 0 && + su->saAmfSUPresenceState == SA_AMF_PRESENCE_UNINSTANTIATED && + su->saAmfSUPreInstantiable == false) { + // this is to allow non NPI SUs to be repaired if + // headless mode is enabled. 
Otherwise, the code + // following will assume the SU is already in service + // when it failed to instantiate while headless + + if (cb->init_state == AVD_APP_STATE) { + LOG_NO("Setting NPI SU '%s' to OOS after headless state", su->name.value); + su->set_readiness_state(SA_AMF_READINESS_OUT_OF_SERVICE); + } + } + /* If oper state of Uninstantiated SU got ENABLED so try to instantiate it after evaluating SG. */ if (su->saAmfSUPresenceState == SA_AMF_PRESENCE_UNINSTANTIATED) { @@ -1726,6 +1752,14 @@ uint32_t avd_sg_app_su_inst_func(AVD_CL_ TRACE_ENTER2("'%s'", sg->name.value); for (const auto& i_su : sg->list_of_su) { + TRACE("Checking '%s'", i_su->name.value); + + TRACE("saAmfSuReadinessState: %u", i_su->saAmfSuReadinessState); + TRACE("saAmfSUPreInstantiable: %u", i_su->saAmfSUPreInstantiable); + TRACE("saAmfSUPresenceState: %u", i_su->saAmfSUPresenceState); + TRACE("saAmfSUOperState: %u", i_su->saAmfSUOperState); + TRACE("term_state: %u", i_su->term_state); + su_node_ptr = i_su->get_node_ptr(); num_su++; /* Check if the SU is inservice */ @@ -1746,6 +1780,7 @@ uint32_t avd_sg_app_su_inst_func(AVD_CL_ (any_ng_in_locked_in_state(su_node_ptr) == false)) { if (i_su->is_in_service() == true) { + TRACE("Calling su_insvc() for '%s'", i_su->name.value); i_su->set_readiness_state(SA_AMF_READINESS_IN_SERVICE); i_su->sg_of_su->su_insvc(cb, i_su); diff --git a/osaf/services/saf/amf/amfd/si.cc b/osaf/services/saf/amf/amfd/si.cc --- a/osaf/services/saf/amf/amfd/si.cc +++ b/osaf/services/saf/amf/amfd/si.cc @@ -614,6 +614,10 @@ static AVD_SI *si_create(SaNameT *si_nam si->saAmfSIAdminState = SA_AMF_ADMIN_UNLOCKED; } + if (immutil_getAttr(const_cast<SaImmAttrNameT>("saAmfUnassignedAlarmStatus"), attributes, 0, &si->alarm_sent) != SA_AIS_OK) { + /* Empty, assign default value */ + si->alarm_sent = false; + } rc = 0; done: @@ -653,6 +657,7 @@ SaAisErrorT avd_si_config_get(AVD_APP *a const_cast<SaImmAttrNameT>("saAmfSIPrefActiveAssignments"), 
const_cast<SaImmAttrNameT>("saAmfSIPrefStandbyAssignments"), const_cast<SaImmAttrNameT>("saAmfSIAdminState"), + const_cast<SaImmAttrNameT>("saAmfUnassignedAlarmStatus"), nullptr }; @@ -1273,8 +1278,6 @@ void AVD_SI::update_ass_state() if (saAmfSINumCurrActiveAssignments == 0) { newState = SA_AMF_ASSIGNMENT_UNASSIGNED; } else { - osafassert(saAmfSINumCurrActiveAssignments == 1); - osafassert(saAmfSINumCurrStandbyAssignments == 0); newState = SA_AMF_ASSIGNMENT_FULLY_ASSIGNED; } break; @@ -1294,21 +1297,14 @@ void AVD_SI::update_ass_state() /* alarm & notifications */ if (saAmfSIAssignmentState == SA_AMF_ASSIGNMENT_UNASSIGNED) { - avd_send_si_unassigned_alarm(&name); - alarm_sent = true; - m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, this, AVSV_CKPT_SI_ALARM_SENT); + update_alarm_state(true); } else { avd_send_si_assigned_ntf(&name, oldState, saAmfSIAssignmentState); - /* Clear of alarm */ if ((oldState == SA_AMF_ASSIGNMENT_UNASSIGNED) && alarm_sent) { - avd_alarm_clear(&name, SA_AMF_NTFID_SI_UNASSIGNED, SA_NTF_SOFTWARE_ERROR); - m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, this, AVSV_CKPT_SI_ALARM_SENT); + update_alarm_state(false); } - - /* always reset in case the SI has been recycled */ - alarm_sent = false; } avd_saImmOiRtObjectUpdate(&name, "saAmfSIAssignmentState", @@ -1499,4 +1495,29 @@ const AVD_SIRANKEDSU *AVD_SI::get_si_ran } return sirankedsu; +} + +/* + * @brief Update alarm_sent by new value of @alarm_state, + * then update saAmfUnassignedAlarmStatus IMM attribute + * and raise/clear SI unassigned alarm (if specified) accordingly + * @param [in] @alarm_state: Indication of alarm raising/clearing + * @param [in] @sent_notification: Indication of sending alarm + * raising/clearing notification + */ +void AVD_SI::update_alarm_state(bool alarm_state, bool sent_notification) +{ + alarm_sent = alarm_state; + avd_saImmOiRtObjectUpdate(&name, "saAmfUnassignedAlarmStatus", + SA_IMM_ATTR_SAUINT32T, &alarm_sent); + m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, this, 
AVSV_CKPT_SI_ALARM_SENT); + + if (sent_notification == true) { + if (alarm_sent == true) { + avd_send_si_unassigned_alarm(&name); + } + else { + avd_alarm_clear(&name, SA_AMF_NTFID_SI_UNASSIGNED, SA_NTF_SOFTWARE_ERROR); + } + } } \ No newline at end of file diff --git a/osaf/services/saf/amf/amfd/siass.cc b/osaf/services/saf/amf/amfd/siass.cc --- a/osaf/services/saf/amf/amfd/siass.cc +++ b/osaf/services/saf/amf/amfd/siass.cc @@ -775,3 +775,124 @@ done: TRACE_LEAVE2("quiesc_role:%u",quiesc_role); return quiesc_role; } + +/** + * Clean up SUSI objects by searching for SaAmfSIAssignment instances in IMM + * @return SA_AIS_OK when OK + */ +SaAisErrorT avd_susi_cleanup(void) +{ + SaAisErrorT rc; + SaImmSearchHandleT searchHandle; + SaImmSearchParametersT_2 searchParam; + const char *className = "SaAmfSIAssignment"; + + TRACE_ENTER(); + + searchParam.searchOneAttr.attrName = const_cast<SaImmAttrNameT>("SaImmAttrClassName"); + searchParam.searchOneAttr.attrValueType = SA_IMM_ATTR_SASTRINGT; + searchParam.searchOneAttr.attrValue = &className; + + if ((rc = immutil_saImmOmSearchInitialize_2(avd_cb->immOmHandle, nullptr, SA_IMM_SUBTREE, + SA_IMM_SEARCH_ONE_ATTR | SA_IMM_SEARCH_GET_NO_ATTR, &searchParam, + nullptr, &searchHandle)) != SA_AIS_OK) { + LOG_ER("%s: saImmOmSearchInitialize_2 failed: %u", __FUNCTION__, rc); + goto done; + } + + SaNameT siass_name; + const SaImmAttrValuesT_2 **attributes; + while ((rc = immutil_saImmOmSearchNext_2(searchHandle, &siass_name, + (SaImmAttrValuesT_2 ***)&attributes)) == SA_AIS_OK) { + avd_saImmOiRtObjectDelete(&siass_name); + } + + (void)immutil_saImmOmSearchFinalize(searchHandle); + +done: + TRACE_LEAVE(); + return SA_AIS_OK; +} + +/** + * Recreates SUSI objects by with information retrieved from node directors. 
+ * Update relevant runtime attributes + * @return SA_AIS_OK when OK, SA_AIS_ERR_NOT_EXIST if the node is not found + */ +SaAisErrorT avd_susi_recreate(AVSV_N2D_ND_SISU_STATE_MSG_INFO* info) +{ + TRACE_ENTER2("msg_id: %u node_id: %u num_sisu: %u", info->msg_id, + info->node_id, info->num_sisu); + AVD_SU_SI_REL *susi = nullptr; + AVD_AVND *node = nullptr; + + const AVSV_SISU_STATE_MSG *susi_state = nullptr; + const AVSV_SU_STATE_MSG *su_state = nullptr; + + node = avd_node_find_nodeid(info->node_id); + if (node == 0) { + LOG_ER("Node %" PRIx32 " has left the cluster", info->node_id); + return SA_AIS_ERR_NOT_EXIST; + } + + for (su_state = info->su_list; su_state != nullptr; + su_state = su_state->next) { + + AVD_SU *su = su_db->find(Amf::to_string(&su_state->safSU)); + osafassert(su); + + // presence state + su->set_pres_state(static_cast<SaAmfPresenceStateT>(su_state->su_pres_state)); + + // oper state + su->set_oper_state(su_state->su_oper_state); + + // . readiness state is updated when node_up of PL is accepted + // . saAmfSUHostedByNode does not need to be updated since mapping + // su to node should preserve the same order + // . saAmfSUPreInstantiable wouldn't change during headless + // so it need not be updated + // . 
saAmfSUNumCurrActiveSIs & saAmfSUNumCurrStandbySIs to be updated + // during avd_susi_create() + + // restart count + su->saAmfSURestartCount = su_state->su_restart_cnt; + avd_saImmOiRtObjectUpdate(&su->name, + const_cast<SaImmAttrNameT>("saAmfSURestartCount"), SA_IMM_ATTR_SAUINT32T, + &su->saAmfSURestartCount); + m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, su, AVSV_CKPT_SU_RESTART_COUNT); + } + + for (susi_state = info->sisu_list; susi_state != nullptr; + susi_state = susi_state->next) { + + assert(susi_state->safSI.length > 0); + AVD_SI *si = si_db->find(Amf::to_string(&susi_state->safSI)); + osafassert(si); + + AVD_SU *su = su_db->find(Amf::to_string(&susi_state->safSU)); + osafassert(su); + + SaAmfHAStateT ha_state = susi_state->saAmfSISUHAState; + + susi = avd_su_susi_find(avd_cb, su, &susi_state->safSI); + if (susi == nullptr) { + susi = avd_susi_create(avd_cb, si, su, ha_state, false); + osafassert(susi); + } else { + avd_susi_ha_state_set(susi, ha_state); + } + susi->fsm = AVD_SU_SI_STATE_ASGND; + + if (susi->state == SA_AMF_HA_QUIESCING) { + susi->su->inc_curr_act_si(); + susi->si->inc_curr_act_ass(); + } + + m_AVSV_SEND_CKPT_UPDT_ASYNC_ADD(avd_cb, susi, AVSV_CKPT_AVD_SI_ASS); + } + + + TRACE_LEAVE(); + return SA_AIS_OK; +} diff --git a/osaf/services/saf/amf/amfd/su.cc b/osaf/services/saf/amf/amfd/su.cc --- a/osaf/services/saf/amf/amfd/su.cc +++ b/osaf/services/saf/amf/amfd/su.cc @@ -1368,21 +1368,14 @@ static SaAisErrorT su_ccb_completed_modi SaAisErrorT rc = SA_AIS_OK; const SaImmAttrModificationT_2 *attr_mod; int i = 0; - bool value_is_deleted = false; while ((attr_mod = opdata->param.modify.attrMods[i++]) != nullptr) { - if ((attr_mod->modType == SA_IMM_ATTR_VALUES_DELETE) || - (attr_mod->modAttr.attrValues == nullptr)) { - /* Attribute value is deleted, revert to default value if applicable*/ - value_is_deleted = true; - } else { - /* Attribute value is modified */ - value_is_deleted = false; - } + /* Attribute value removed */ + if ((attr_mod->modType 
== SA_IMM_ATTR_VALUES_DELETE) || (attr_mod->modAttr.attrValues == nullptr)) + continue; + if (!strcmp(attr_mod->modAttr.attrName, "saAmfSUFailover")) { - if (value_is_deleted == true) - continue; AVD_SU *su = su_db->find(Amf::to_string(&opdata->objectName)); uint32_t su_failover = *((SaUint32T *)attr_mod->modAttr.attrValues[0]); @@ -1403,8 +1396,6 @@ static SaAisErrorT su_ccb_completed_modi goto done; } } else if (!strcmp(attr_mod->modAttr.attrName, "saAmfSUMaintenanceCampaign")) { - if (value_is_deleted == true) - continue; AVD_SU *su = su_db->find(Amf::to_string(&opdata->objectName)); if (su->saAmfSUMaintenanceCampaign.length > 0) { @@ -1414,8 +1405,6 @@ static SaAisErrorT su_ccb_completed_modi goto done; } } else if (!strcmp(attr_mod->modAttr.attrName, "saAmfSUType")) { - if (value_is_deleted == true) - continue; AVD_SU *su; SaNameT sutype_name = *(SaNameT*) attr_mod->modAttr.attrValues[0]; su = su_db->find(Amf::to_string(&opdata->objectName)); ------------------------------------------------------------------------------ Site24x7 APM Insight: Get Deep Visibility into Application Performance APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month Monitor end-to-end web transactions and take corrective actions now Troubleshoot faster and improve end-user experience. Signup Now! http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel