Hi Praveen Ack (review only + regression tests run)
thanks On 27/3/17, 8:58 pm, "[email protected]" <[email protected]> wrote: src/amf/amfd/clm.cc | 43 ++++++++++++++++++++++++++++++------------- src/amf/amfnd/clm.cc | 6 ++++-- 2 files changed, 34 insertions(+), 15 deletions(-) In the reported issue, two CLM nodes are locked simultaneously. For one of the nodes, the CLM lock times out and the user gets REPAIR_PENDING as the return code. The two payloads being locked host amf_demo with the 2N model. When AMFD gets the CLM track callback for PL-3, it starts terminating amf_demo on PL-3. While termination of amf_demo is still going on, the user CLM-locks PL-4 and AMF gets another track callback with rootCauseEntity as PL-4. This callback also contains information about PL-3, as that node is still in the pending change phase. AMFD starts terminating amf_demo on PL-4, but at the same time it incorrectly responds for PL-3 with the invocationId of the PL-4 callback. CLM assumes that change_started has completed for PL-4 and sends the completion callback for PL-4. In this callback, AMF clears the internal flags which monitor the graceful removal of nodes. Since AMF never responds to the PL-3 callback, the node lock timer expires in CLMD, which then sends the complete callback to AMF and responds to the user with REPAIR_PENDING. AMF thinks this is a case of node failover and tries to fail over PL-3. The patch fixes this problem in both AMFD and AMFND: -to act on the CHANGE_START step only once for a node (amfd). -to act on the COMPLETE step only when rootCauseEntity matches and it is a graceful removal of the node (amfd). -to act only once in the track callback for the COMPLETE step (amfnd). 
diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc --- a/src/amf/amfd/clm.cc +++ b/src/amf/amfd/clm.cc @@ -203,6 +203,7 @@ static void clm_node_exit_complete(SaClm avd_node_failover(node); m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVD_NODE_CONFIG); + node->clm_change_start_preceded = false; done: TRACE_LEAVE(); @@ -218,7 +219,7 @@ static void clm_track_cb(const SaClmClus AVD_AVND *node; TRACE_ENTER2("'%llu' '%u' '%u'", invocation, step, error); - + if (error != SA_AIS_OK) { LOG_ER("ClmTrackCallback received in error"); goto done; @@ -232,10 +233,13 @@ static void clm_track_cb(const SaClmClus ** The CLM cluster can be larger than the AMF cluster thus it is not an ** error if the corresponding AMF node cannot be found. */ + TRACE("numberOfMembers:'%u', numberOfItems:'%u'", numberOfMembers, + notificationBuffer->numberOfItems); for (i = 0; i < notificationBuffer->numberOfItems; i++) { notifItem = &notificationBuffer->notification[i]; const std::string node_name(Amf::to_string(&notifItem->clusterNode.nodeName)); + TRACE("i=%u, node:'%s', clusterChange:%u",i, node_name.c_str(), notifItem->clusterChange); switch(step) { case SA_CLM_CHANGE_VALIDATE: if(notifItem->clusterChange == SA_CLM_NODE_LEFT) { @@ -264,6 +268,10 @@ static void clm_track_cb(const SaClmClus } if ( notifItem->clusterChange == SA_CLM_NODE_LEFT || notifItem->clusterChange == SA_CLM_NODE_SHUTDOWN ) { + if (node->clm_change_start_preceded == true) { + TRACE_3("Already got callback for start of this change."); + continue; + } /* invocation to be used by pending clm response */ node->clm_pend_inv = invocation; clm_node_exit_start(node, notifItem->clusterChange); @@ -298,25 +306,34 @@ static void clm_track_cb(const SaClmClus } clm_node_exit_complete(notifItem->clusterNode.nodeId); } else if (strncmp(osaf_extended_name_borrow(rootCauseEntity), "safNode=", 8) == 0) { + const std::string rootCause_clm_node(Amf::to_string(rootCauseEntity)); /* This callback is because of operation on CLM.*/ - if(true 
== node->clm_change_start_preceded) { + if (true == node->clm_change_start_preceded) { /* We have got a completed callback with start cbk step before, so already locking applications might have been done. So, no action - is needed.*/ - node->clm_change_start_preceded = false; - node->node_info.member = SA_FALSE; - m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVD_NODE_CONFIG); - } - else - { + is needed. + Completed callback for a node going CLM admin operation may + come with completed callabck when some other CLM node leaves + cluster membership becuase of OpenSAF stop or one more node is + CLM locked and this second nodes exits first. + Act only when callback comes for this admin op node. + */ + if (rootCause_clm_node.compare(node->saAmfNodeClmNode) == 0) { + node->clm_change_start_preceded = false; + node->node_info.member = SA_FALSE; + m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVD_NODE_CONFIG); + } else { + TRACE("'%s' not mapped to rootCauseEntity '%s'.", + node->name.c_str(), rootCause_clm_node.c_str()); + continue; + } + + } else { /* We have encountered a completed callback without start step, there seems error condition, node would have gone down suddenly. 
*/ clm_node_exit_complete(notifItem->clusterNode.nodeId); } - - - } - else { + } else { /* We shouldn't get into this situation.*/ LOG_ER("Wrong rootCauseEntity %s", osaf_extended_name_borrow(rootCauseEntity)); osafassert(0); diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc --- a/src/amf/amfnd/clm.cc +++ b/src/amf/amfnd/clm.cc @@ -52,7 +52,7 @@ static void clm_node_left(SaClmNodeIdT n TRACE_ENTER2("%u", node_id); - if(node_id == avnd_cb->node_info.nodeId) { + if (node_id == avnd_cb->node_info.nodeId) { /* if you received a node left indication from clm for self node terminate all the non_ncs components; ncs components :-TBD */ @@ -218,7 +218,9 @@ static void clm_track_cb(const SaClmClus if(false == avnd_cb->first_time_up) { /* When node reboots, we will get an exit cbk, so ignore if avnd_cb->first_time_up is false. */ - if(notifItem->clusterNode.nodeId == avnd_cb->node_info.nodeId) { + if ((notifItem->clusterNode.nodeId == avnd_cb->node_info.nodeId) && + (avnd_cb->node_info.member == SA_TRUE)) { + //Act only once on CLM callback. clm_node_left(notifItem->clusterNode.nodeId); } } ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
