osaf/services/saf/amf/amfd/csi.cc | 14 +++++++++++++- osaf/services/saf/amf/amfd/include/su.h | 2 +- osaf/services/saf/amf/amfd/sg.cc | 6 +++++- osaf/services/saf/amf/amfd/su.cc | 1 + 4 files changed, 20 insertions(+), 3 deletions(-)
The problem happens if a csi is deleted and the component delays its response to the csi_remove_callback until after the SC comes back from headless. On the standby SU, this csi has not been removed. This is because the standby SU still sends the assignment info as recovery data, since the component in the active SU still has the csi_remove_callback pending. Logically, amfnd should verify every csi that it sends to amfd as recovery data: if a csi has been deleted, amfnd would issue the remove callback and not send the deleted csi. However, verifying a csi requires initializing an IMM handle, which could hang amfnd (if IMMND dies) and eventually cause a node sync timeout. The patch therefore treats this scenario as a csi inconsistency between amfd and amfnd: the standby SU's assignment (including the deleted csi) is removed, and the standby assignment is then re-assigned (excluding the deleted csi). diff --git a/osaf/services/saf/amf/amfd/csi.cc b/osaf/services/saf/amf/amfd/csi.cc --- a/osaf/services/saf/amf/amfd/csi.cc +++ b/osaf/services/saf/amf/amfd/csi.cc @@ -1467,7 +1467,19 @@ SaAisErrorT avd_compcsi_recreate(AVSV_N2 for (csicomp = info->csicomp_list; csicomp != nullptr; csicomp=csicomp->next) { csi = csi_db->find(Amf::to_string(&csicomp->safCSI)); - osafassert(csi); + if (csi == nullptr) { + // CSI may be not found in csi_db. csi is deleted that trigger + // csi_remove_callback, but amf component hasn't responded to callback + // and at the time cluster goes headless. 
Therefore, amfd gets csi + // assignment for non-existed csi + LOG_WA("CSI: %s not found in csi_db, it's currently assigned to comp: %s", + Amf::to_string(&csicomp->safCSI).c_str(), + Amf::to_string(&csicomp->safComp).c_str()); + comp = comp_db->find(Amf::to_string(&csicomp->safComp)); + osafassert(comp); + comp->su->assignment_out_of_sync = true; + continue; + } comp = comp_db->find(Amf::to_string(&csicomp->safComp)); osafassert(comp); diff --git a/osaf/services/saf/amf/amfd/include/su.h b/osaf/services/saf/amf/amfd/include/su.h --- a/osaf/services/saf/amf/amfd/include/su.h +++ b/osaf/services/saf/amf/amfd/include/su.h @@ -94,7 +94,7 @@ class AVD_SU { AVD_SUTYPE *su_type; AVD_SU *su_list_su_type_next; - + bool assignment_out_of_sync; void set_su_failover(bool value); void dec_curr_stdby_si(); void inc_curr_stdby_si(); diff --git a/osaf/services/saf/amf/amfd/sg.cc b/osaf/services/saf/amf/amfd/sg.cc --- a/osaf/services/saf/amf/amfd/sg.cc +++ b/osaf/services/saf/amf/amfd/sg.cc @@ -2116,7 +2116,11 @@ void AVD_SG::adjust_intermediate_sg(AVD_ su->su_on_node->saAmfNodeAdminState, su->saAmfSUNumCurrActiveSIs, su->saAmfSUNumCurrStandbySIs); - + if (su->assignment_out_of_sync == true) { + su_fault(cb, su); + su->assignment_out_of_sync = false; + continue; + } if (su->saAmfSUAdminState == SA_AMF_ADMIN_LOCKED || su->sg_of_su->saAmfSGAdminState == SA_AMF_ADMIN_LOCKED || su->su_on_node->saAmfNodeAdminState == SA_AMF_ADMIN_LOCKED || diff --git a/osaf/services/saf/amf/amfd/su.cc b/osaf/services/saf/amf/amfd/su.cc --- a/osaf/services/saf/amf/amfd/su.cc +++ b/osaf/services/saf/amf/amfd/su.cc @@ -64,6 +64,7 @@ void AVD_SU::initialize() { pend_cbk.invocation = 0; pend_cbk.admin_oper = (SaAmfAdminOperationIdT)0; surestart = false; + assignment_out_of_sync = false; } AVD_SU::AVD_SU() { ------------------------------------------------------------------------------ Site24x7 APM Insight: Get Deep Visibility into Application Performance APM + Mobile APM + RUM: Monitor 3 App instances at 
just $35/Month Monitor end-to-end web transactions and take corrective actions now Troubleshoot faster and improve end-user experience. Signup Now! http://pubads.g.doubleclick.net/gampad/clk?id=272487151&iu=/4140 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel