Hi Praveen

Ack (review only + regression tests run)

thanks

On 27/3/17, 8:58 pm, "[email protected]" <[email protected]> 
wrote:

     src/amf/amfd/clm.cc  |  43 ++++++++++++++++++++++++++++++-------------
     src/amf/amfnd/clm.cc |   6 ++++--
     2 files changed, 34 insertions(+), 15 deletions(-)
    
    
    In the reported issue, two CLM nodes are locked simultaneously. For one of the 
nodes,
    CLM lock gets timed out and user gets REPAIR_PENDING as return code. The 
two payloads
    being locked host Amf_demo with 2N model.
    
    When AMFD gets CLM track callback for PL-3 it starts terminating amf demo 
on PL-3. When
    termination of amf_demo is still going on, user clm locks PL-4 and AMF gets 
another track callback
    with rootCauseEntity as PL-4. Callback contains information of PL-3 also 
as this node is still
    in pending change phase. AMFD starts terminating amf_demo on PL-4 but at 
the same time it
    incorrectly responds for PL-3 with invocationId of PL-4 callback. CLM 
assumes that for PL-4
    change_started completed and sends completion callback for PL-4. In this 
callback,
    AMF clears internal flags which monitor the graceful removal of nodes.
    Since AMF never responds for PL-3 callback, node lock timer expires in CLMD 
and it sends
    complete callback to AMF and responds user with REPAIR_PENDING. AMF thinks 
this is
    the case of nodefailover and tries to failover PL-3.
    
    Patch fixes this problem in both AMFD and AMFND:
    -to act on CHANGE_START step only once for a node (amfd).
    -to act on COMPLETE step only when rootCauseEntity matches and if it
     is graceful removal of node(amfd).
    -to act only once in track callback for COMPLETE step(amfnd).
    
    diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc
    --- a/src/amf/amfd/clm.cc
    +++ b/src/amf/amfd/clm.cc
    @@ -203,6 +203,7 @@ static void clm_node_exit_complete(SaClm
     
        avd_node_failover(node);
        m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, 
AVSV_CKPT_AVD_NODE_CONFIG);
    +   node->clm_change_start_preceded = false;
     
     done:
        TRACE_LEAVE();
    @@ -218,7 +219,7 @@ static void clm_track_cb(const SaClmClus
        AVD_AVND *node;
     
        TRACE_ENTER2("'%llu' '%u' '%u'", invocation, step, error);
    -
    +   
        if (error != SA_AIS_OK) {
                LOG_ER("ClmTrackCallback received in error");
                goto done;
    @@ -232,10 +233,13 @@ static void clm_track_cb(const SaClmClus
        ** The CLM cluster can be larger than the AMF cluster thus it is not an
        ** error if the corresponding AMF node cannot be found.
        */
    +   TRACE("numberOfMembers:'%u', numberOfItems:'%u'", numberOfMembers,
    +                   notificationBuffer->numberOfItems);
        for (i = 0; i < notificationBuffer->numberOfItems; i++)
        {
                notifItem = &notificationBuffer->notification[i];
                const std::string 
node_name(Amf::to_string(&notifItem->clusterNode.nodeName));
    +           TRACE("i=%u, node:'%s', clusterChange:%u",i, node_name.c_str(), 
notifItem->clusterChange);
                switch(step) {
                case SA_CLM_CHANGE_VALIDATE:
                        if(notifItem->clusterChange == SA_CLM_NODE_LEFT) {
    @@ -264,6 +268,10 @@ static void clm_track_cb(const SaClmClus
                        }
                        if ( notifItem->clusterChange == SA_CLM_NODE_LEFT ||
                             notifItem->clusterChange == SA_CLM_NODE_SHUTDOWN ) 
{
    +                           if (node->clm_change_start_preceded == true) {
    +                                   TRACE_3("Already got callback for start 
of this change.");
    +                                   continue;
    +                           }
                                /* invocation to be used by pending clm 
response */ 
                                node->clm_pend_inv = invocation;
                                clm_node_exit_start(node, 
notifItem->clusterChange);
    @@ -298,25 +306,34 @@ static void clm_track_cb(const SaClmClus
                                        }
                                        
clm_node_exit_complete(notifItem->clusterNode.nodeId);
                                } else if 
(strncmp(osaf_extended_name_borrow(rootCauseEntity), "safNode=", 8) == 0) {
    +                                   const std::string 
rootCause_clm_node(Amf::to_string(rootCauseEntity));
                                        /* This callback is because of 
operation on CLM.*/
    -                                   if(true == 
node->clm_change_start_preceded) {
    +                                   if (true == 
node->clm_change_start_preceded) {
                                                /* We have got a completed 
callback with start cbk step before, so 
                                                   already locking applications 
might have been done. So, no action
    -                                              is needed.*/
    -                                           node->clm_change_start_preceded 
= false; 
    -                                           node->node_info.member = 
SA_FALSE;
    -                                           
m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVD_NODE_CONFIG);
    -                                   }
    -                                   else
    -                                   {
    +                                              is needed.
    +                                              Completed callback for a 
node going CLM admin operation may 
    +                                              come with completed callabck 
when some other CLM node leaves
    +                                              cluster membership becuase 
of OpenSAF stop or one more node is
    +                                              CLM locked and this second 
nodes exits first.
    +                                              Act only when callback comes 
for this admin op node.
    +                                            */
    +                                           if 
(rootCause_clm_node.compare(node->saAmfNodeClmNode) == 0) {
    +                                                   
node->clm_change_start_preceded = false;
    +                                                   node->node_info.member 
= SA_FALSE;
    +                                                   
m_AVSV_SEND_CKPT_UPDT_ASYNC_UPDT(avd_cb, node, AVSV_CKPT_AVD_NODE_CONFIG);
    +                                           } else {
    +                                                   TRACE("'%s' not mapped 
to rootCauseEntity '%s'.",
    +                                                                   
node->name.c_str(), rootCause_clm_node.c_str());
    +                                                   continue;
    +                                           }
    +
    +                                   } else {
                                                /* We have encountered a 
completed callback without start step, there
                                                   seems error condition, node 
would have gone down suddenly. */
                                                
clm_node_exit_complete(notifItem->clusterNode.nodeId);
                                        }
    -
    -
    -                           }
    -                           else {
    +                           } else {
                                        /* We shouldn't get into this 
situation.*/
                                        LOG_ER("Wrong rootCauseEntity %s", 
osaf_extended_name_borrow(rootCauseEntity));
                                        osafassert(0);
    diff --git a/src/amf/amfnd/clm.cc b/src/amf/amfnd/clm.cc
    --- a/src/amf/amfnd/clm.cc
    +++ b/src/amf/amfnd/clm.cc
    @@ -52,7 +52,7 @@ static void clm_node_left(SaClmNodeIdT n
     
        TRACE_ENTER2("%u", node_id);
     
    -   if(node_id == avnd_cb->node_info.nodeId) {
    +   if (node_id == avnd_cb->node_info.nodeId) {
        /* if you received a node left indication from clm for self node
           terminate all the non_ncs components; ncs components :-TBD */
           
    @@ -218,7 +218,9 @@ static void clm_track_cb(const SaClmClus
                        if(false == avnd_cb->first_time_up) {
                                /* When node reboots, we will get an exit cbk, 
so ignore if avnd_cb->first_time_up
                                   is false. */
    -                           if(notifItem->clusterNode.nodeId == 
avnd_cb->node_info.nodeId) {
    +                           if ((notifItem->clusterNode.nodeId == 
avnd_cb->node_info.nodeId) &&
    +                                           (avnd_cb->node_info.member == 
SA_TRUE)) {
    +                                   //Act only once on CLM callback.
                                        
clm_node_left(notifItem->clusterNode.nodeId);   
                                }
                        }
    



------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to