osaf/services/saf/amf/amfnd/di.cc | 7 +++++-- osaf/services/saf/amf/amfnd/susm.cc | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-)
This case of SC failover causes the new active AMFD to get stuck in node_up messages. Say the first active controller is SC1, which goes down during headless sync. The amfnd on SC2 therefore receives mds_down of AVD, and both is_avd_down and amfd_sync_required are set to true. When SC2 takes over the active role, amfnd on SC2 receives mds_up, but only is_avd_down is set to false; the variable amfd_sync_required remains true. When amfnd on SC2 finishes initiating the middleware SUs, it needs to send the su_oper message to AMFD, but the send fails because amfd_sync_required is still true. In this SC failover scenario, amfd_sync_required needs to be set to false when amfnd on SC2 receives the su_pres message for middleware SUs. That means amfnd on the active controller does not need to wait for the set_leds message, which informs it that cluster initiation is done, before it can send su_oper messages to AMFD. This logic also aligns with the normal headless scenario, where amfnd on the active controller has amfd_sync_required initially marked as false because no middleware SUs are initiated. When amfd_sync_required is true, it means that all of amfnd's middleware SUs were initiated and assigned before headless, and thus amfnd needs to wait for cluster initiation after headless.
diff --git a/osaf/services/saf/amf/amfnd/di.cc b/osaf/services/saf/amf/amfnd/di.cc --- a/osaf/services/saf/amf/amfnd/di.cc +++ b/osaf/services/saf/amf/amfnd/di.cc @@ -748,7 +748,8 @@ uint32_t avnd_di_oper_send(AVND_CB *cb, if (avnd_diq_rec_add(cb, &msg) == nullptr) { rc = NCSCC_RC_FAILURE; } - LOG_NO("avnd_di_oper_send() deferred as AMF director is offline"); + LOG_NO("avnd_di_oper_send() deferred as AMF director is offline(%d)," + " or sync is required(%d)", cb->is_avd_down, cb->amfd_sync_required); } else { // We are in normal cluster, send msg to director msg.info.avd->msg_info.n2d_opr_state.msg_id = ++(cb->snd_msg_id); @@ -881,7 +882,9 @@ uint32_t avnd_di_susi_resp_send(AVND_CB rc = NCSCC_RC_FAILURE; } m_AVND_SU_ALL_SI_RESET(su); - LOG_NO("avnd_di_susi_resp_send() deferred as AMF director is offline"); + LOG_NO("avnd_di_susi_resp_send() deferred as AMF director is offline(%d)," + " or sync is required(%d)", cb->is_avd_down, cb->amfd_sync_required); + } else { // We are in normal cluster, send msg to director msg.info.avd->msg_info.n2d_su_si_assign.msg_id = ++(cb->snd_msg_id); diff --git a/osaf/services/saf/amf/amfnd/susm.cc b/osaf/services/saf/amf/amfnd/susm.cc --- a/osaf/services/saf/amf/amfnd/susm.cc +++ b/osaf/services/saf/amf/amfnd/susm.cc @@ -1345,6 +1345,12 @@ uint32_t avnd_evt_avd_su_pres_evh(AVND_C goto done; } } else { /* => instantiate the su */ + // Do not need to wait for headless sync if there is no application SUs + // initiated. This is known because here we are receiving su_pres message + // for NCS SUs + if (su->is_ncs == true) + cb->amfd_sync_required = false; + AVND_EVT *evt_ir = 0; TRACE("Sending to Imm thread."); evt_ir = avnd_evt_create(cb, AVND_EVT_IR, 0, nullptr, &info->su_name, 0, 0); ------------------------------------------------------------------------------ Developer Access Program for Intel Xeon Phi Processors Access to Intel Xeon Phi processor-based developer platforms. With one year of Intel Parallel Studio XE. 
Training and support from Colfax. Order your platform today. http://sdm.link/xeonphi _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel