Hi Ravi,

Attach the additional patch (on top of first one) that incorporates to your comments, please have a look if it's ok.

Thanks,

Minh


On 01/11/17 16:11, Ravi Sekhar Reddy Konda wrote:
Hi Minh,

I am fine with the patch, but  have couple of queries, please see my comments 
inline

Regards,
Ravi

-----Original Message-----
From: Minh Chau [mailto:[email protected]]
Sent: Friday, October 20, 2017 4:06 AM
To: [email protected]; [email protected]; 
[email protected]
Cc: [email protected]; Minh Chau <[email protected]>
Subject: [PATCH 1/1] amfd: Add retry mechanism for ClmTrackStart/Stop as job 
queue [#2631]

Currenty amfd does not retry ClmTrackStart/Stop after swicthover if the CLM 
APIs are still unavailable due to high loaded system or connectivity issue. 
This patch adds a retry mechanism for CLM APIs as other existing IMM/NTF APIs
---
  src/amf/amfd/clm.cc       | 83 +++++++++++++++++++++++++++++++++++++++++------
  src/amf/amfd/clm.h        | 27 +++++++++++++--
  src/amf/amfd/imm.h        |  1 +
  src/amf/amfd/role.cc      | 15 +++------
  src/amf/amfd/sg_2n_fsm.cc |  3 +-
  src/amf/amfd/sgproc.cc    |  3 +-
  6 files changed, 108 insertions(+), 24 deletions(-)

diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc index d8342ca..1c82620 
100644
--- a/src/amf/amfd/clm.cc
+++ b/src/amf/amfd/clm.cc
@@ -475,13 +475,16 @@ done:
    return error;
  }
-SaAisErrorT avd_clm_track_start(void) {
+SaAisErrorT avd_clm_track_start(AVD_CL_CB* cb) {
    SaUint8T trackFlags = SA_TRACK_CURRENT | SA_TRACK_CHANGES_ONLY |
                          SA_TRACK_VALIDATE_STEP | SA_TRACK_START_STEP;
TRACE_ENTER();
-  SaAisErrorT error =
-      saClmClusterTrack_4(avd_cb->clmHandle, trackFlags, nullptr);
+  SaAisErrorT error = SA_AIS_ERR_TRY_AGAIN;
+
+  if (cb->is_clm_track_started == true) goto done;
+
+  error = saClmClusterTrack_4(cb->clmHandle, trackFlags, nullptr);
    if (error != SA_AIS_OK) {
      if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT ||
          error == SA_AIS_ERR_UNAVAILABLE) { @@ -490,15 +493,20 @@ SaAisErrorT 
avd_clm_track_start(void) {
        LOG_ER("Failed to start cluster tracking %u", error);
      }
    } else {
-    avd_cb->is_clm_track_started = true;
+    cb->is_clm_track_started = true;
    }
+done:
    TRACE_LEAVE();
    return error;
  }
-SaAisErrorT avd_clm_track_stop(void) {
+SaAisErrorT avd_clm_track_stop(AVD_CL_CB* cb) {
    TRACE_ENTER();
-  SaAisErrorT error = saClmClusterTrackStop(avd_cb->clmHandle);
+  SaAisErrorT error = SA_AIS_ERR_TRY_AGAIN;
+
+  if (cb->is_clm_track_started == false) goto done;
+
+  error = saClmClusterTrackStop(cb->clmHandle);
    if (error != SA_AIS_OK) {
      if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT ||
          error == SA_AIS_ERR_UNAVAILABLE) { @@ -506,15 +514,16 @@ SaAisErrorT 
avd_clm_track_stop(void) {
      } else if (error == SA_AIS_ERR_NOT_EXIST) {
        /* track changes was not started or stopped successfully */
        LOG_WA("Failed to stop cluster tracking %u", error);
-      avd_cb->is_clm_track_started = false;
+      cb->is_clm_track_started = false;
+      error = SA_AIS_OK;
      } else {
        LOG_ER("Failed to stop cluster tracking %u", error);
      }
    } else {
      TRACE("Sucessfully stops cluster tracking");
-    avd_cb->is_clm_track_started = false;
+    cb->is_clm_track_started = false;
    }
-
+done:
    TRACE_LEAVE();
    return error;
  }
@@ -550,7 +559,7 @@ static void *avd_clm_init_thread(void *arg) {
if (cb->avail_state_avd == SA_AMF_HA_ACTIVE) {
      for (;;) {
-      error = avd_clm_track_start();
+      error = avd_clm_track_start(cb);
        if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT ||
            error == SA_AIS_ERR_UNAVAILABLE) {
          osaf_nanosleep(&kHundredMilliseconds);
@@ -584,3 +593,57 @@ SaAisErrorT avd_start_clm_init_bg(void) {
    pthread_attr_destroy(&attr);
    return SA_AIS_OK;
  }
+
+AvdJobDequeueResultT ClmTrackStart::exec(const AVD_CL_CB* cb) {
+  AvdJobDequeueResultT res;
+  TRACE_ENTER();
+
+  SaAisErrorT rc = avd_clm_track_start(const_cast<AVD_CL_CB*>(cb));
+  if (rc == SA_AIS_OK) {
+    delete Fifo::dequeue();
+    res = JOB_EXECUTED;
+  } else if (rc == SA_AIS_ERR_TRY_AGAIN) {
+    TRACE("TRY-AGAIN");
+    res = JOB_ETRYAGAIN;
+  } else if (rc == SA_AIS_ERR_TIMEOUT) {
+    TRACE("TIMEOUT");
+    res = JOB_ETRYAGAIN;
+  } else if (rc == SA_AIS_ERR_UNAVAILABLE) {
+    TRACE("UNAVAILABLE");
+    res = JOB_ETRYAGAIN;
+  } else {
+    delete Fifo::dequeue();
+    LOG_ER("%s: ClmTrackStart FAILED %u", __FUNCTION__, rc);
+    res = JOB_ERR;
+  }
+
+  TRACE_LEAVE();
+  return res;
+}
+
+AvdJobDequeueResultT ClmTrackStop::exec(const AVD_CL_CB* cb) {
+  AvdJobDequeueResultT res;
+  TRACE_ENTER();
+
+  SaAisErrorT rc = avd_clm_track_stop(const_cast<AVD_CL_CB*>(cb));
+  if (rc == SA_AIS_OK) {
+    delete Fifo::dequeue();
+    res = JOB_EXECUTED;
+  } else if (rc == SA_AIS_ERR_TRY_AGAIN) {
+    TRACE("TRY-AGAIN");
+    res = JOB_ETRYAGAIN;
+  } else if (rc == SA_AIS_ERR_TIMEOUT) {
+    TRACE("TIMEOUT");
+    res = JOB_ETRYAGAIN;
+  } else if (rc == SA_AIS_ERR_UNAVAILABLE) {
+    TRACE("UNAVAILABLE");
+    res = JOB_ETRYAGAIN;
+  } else {
+    delete Fifo::dequeue();
+    LOG_ER("%s: ClmTrackStop FAILED %u", __FUNCTION__, rc);
+    res = JOB_ERR;
+  }
+
+  TRACE_LEAVE();
+  return res;
+}
diff --git a/src/amf/amfd/clm.h b/src/amf/amfd/clm.h index 6adf796..2bbe320 
100644
--- a/src/amf/amfd/clm.h
+++ b/src/amf/amfd/clm.h
@@ -1,6 +1,7 @@
  /*      -*- OpenSAF  -*-
   *
   * (C) Copyright 2010 The OpenSAF Foundation
+ * Copyright Ericsson AB 2017 - All Rights Reserved.
   *
   * This program is distributed in the hope that it will be useful, but
   * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
@@ -16,6 +17,7 @@
   */
#include <saClm.h>
+#include "amf/amfd/imm.h"
#ifndef AMF_AMFD_CLM_H_
  #define AMF_AMFD_CLM_H_
@@ -23,9 +25,30 @@
  struct cl_cb_tag;
extern SaAisErrorT avd_clm_init(struct cl_cb_tag *); -extern SaAisErrorT avd_clm_track_start(void); -extern SaAisErrorT avd_clm_track_stop(void);
+extern SaAisErrorT avd_clm_track_start(struct cl_cb_tag *); extern
+SaAisErrorT avd_clm_track_stop(struct cl_cb_tag *);
  extern void clm_node_terminate(AVD_AVND *node);  extern SaAisErrorT 
avd_start_clm_init_bg(void);
+class ClmJob : public Job {
+public:
+  ClmJob() {}
+  AvdJobTypeT getJobType() { return JOB_TYPE_CLM; }
+  bool isRunnable(const AVD_CL_CB *cb) { return true;} };
+
+class ClmTrackStart : public ClmJob {
+ public:
+  ClmTrackStart() : ClmJob(){};
+  AvdJobDequeueResultT exec(const struct cl_cb_tag *cb);
+  ~ClmTrackStart() {}
+};
+
+class ClmTrackStop : public ClmJob {
+ public:
+  ClmTrackStop() : ClmJob(){};
+  AvdJobDequeueResultT exec(const struct cl_cb_tag *cb);
+  ~ClmTrackStop() {}
+};
+
  #endif  // AMF_AMFD_CLM_H_
diff --git a/src/amf/amfd/imm.h b/src/amf/amfd/imm.h index f0152ac..4c4f88b 
100644
--- a/src/amf/amfd/imm.h
+++ b/src/amf/amfd/imm.h
@@ -56,6 +56,7 @@ typedef enum {
  typedef enum {
    JOB_TYPE_IMM = 1,  /* A IMM job */
    JOB_TYPE_NTF = 2, /* A NTF job */
+  JOB_TYPE_CLM = 3, /* A CLM job */
    JOB_TYPE_ANY
  } AvdJobTypeT;
diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc index ec13c3b..c08033d 100644
--- a/src/amf/amfd/role.cc
+++ b/src/amf/amfd/role.cc
@@ -1066,7 +1066,8 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) {
    /*  Mark AVD as Quiesced. */
    cb->avail_state_avd = SA_AMF_HA_QUIESCED;
- avd_clm_track_stop();
+  if (avd_clm_track_stop(cb) != SA_AIS_OK)
+    Fifo::queue(new ClmTrackStop());
[Ravi]: I think we need to queue only if clm returns TRY_AGAIN or TIMEOUT
        but here, we are generalizing it to "!= SA_AIS_OK", any reason ?
/* Go ahead and set mds role as already the NCS SU has been switched */
    if (NCSCC_RC_SUCCESS !=
@@ -1105,7 +1106,6 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) {
uint32_t amfd_switch_qsd_stdby(AVD_CL_CB *cb) {
    uint32_t status = NCSCC_RC_SUCCESS;
-  SaAisErrorT ais_rc;
    TRACE_ENTER();
    LOG_NO("Switching Quiesced --> StandBy");
@@ -1138,12 +1138,8 @@ uint32_t amfd_switch_qsd_stdby(AVD_CL_CB *cb) {
      avd_pg_node_csi_del_all(cb, avnd);
    }
- if (cb->is_clm_track_started == true) {
-    ais_rc = avd_clm_track_stop();
-    if (ais_rc != SA_AIS_OK && ais_rc != SA_AIS_ERR_NOT_EXIST) {
-      LOG_ER("Failed to stop cluster tracking after switch over");
-    }
-  }
+  if (avd_clm_track_stop(cb) != SA_AIS_OK)
+    Fifo::queue(new ClmTrackStop());
[Ravi]: same as above
LOG_NO("Controller switch over done");
    saflog(LOG_NOTICE, amfSvcUsrName, "Controller switch over done at %x", @@ 
-1274,8 +1270,7 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
    if (NCSCC_RC_SUCCESS != avd_rde_set_role(SA_AMF_HA_ACTIVE)) {
      LOG_ER("rde role change failed from stdy -> Active");
    }
-
-  if (avd_clm_track_start() != SA_AIS_OK) {
+  if (avd_clm_track_start(cb) != SA_AIS_OK)
[Ravi]: any reason why queuing is not done when role changes from standby to 
active
{
      LOG_ER("Switch Standby --> Active, clm track start failed");
      avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
      return NCSCC_RC_FAILURE;
diff --git a/src/amf/amfd/sg_2n_fsm.cc b/src/amf/amfd/sg_2n_fsm.cc index 
3a7609e..44177d2 100644
--- a/src/amf/amfd/sg_2n_fsm.cc
+++ b/src/amf/amfd/sg_2n_fsm.cc
@@ -1767,7 +1767,8 @@ uint32_t SG_2N::susi_success_sg_realign(AVD_SU *su, 
AVD_SU_SI_REL *susi,
        if ((state == SA_AMF_HA_ACTIVE) &&
            (cb->node_id_avd == su->su_on_node->node_info.nodeId)) {
          /* This is as a result of failover, start CLM tracking*/
-        (void)avd_clm_track_start();
+        if (avd_clm_track_start(cb) != SA_AIS_OK)
+          Fifo::queue(new ClmTrackStart());
[Ravi]: same as first comment
        }
// Set active_services_exist at error conditions e.g. controller fail-over diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc index d305b22..5f31194 100644
--- a/src/amf/amfd/sgproc.cc
+++ b/src/amf/amfd/sgproc.cc
@@ -2201,7 +2201,8 @@ void avd_node_down_mw_susi_failover(AVD_CL_CB *cb, 
AVD_AVND *avnd) {
         in avd_sg_2n_susi_sucss_sg_reln, so start here.*/
      if ((i_su->sg_of_su->sg_redundancy_model == SA_AMF_2N_REDUNDANCY_MODEL) &&
          (i_su->sg_of_su->sg_fsm_state == AVD_SG_FSM_STABLE))
-      (void)avd_clm_track_start();
+      if (avd_clm_track_start(cb) != SA_AIS_OK)
+        Fifo::queue(new ClmTrackStart());
      /* Free all the SU SI assignments*/
i_su->delete_all_susis();
--
2.7.4


diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc
index 1c82620..ffef3ab 100644
--- a/src/amf/amfd/clm.cc
+++ b/src/amf/amfd/clm.cc
@@ -489,6 +489,7 @@ SaAisErrorT avd_clm_track_start(AVD_CL_CB* cb) {
     if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT ||
         error == SA_AIS_ERR_UNAVAILABLE) {
       LOG_WA("Failed to start cluster tracking %u", error);
+      error = SA_AIS_ERR_TRY_AGAIN;
     } else {
       LOG_ER("Failed to start cluster tracking %u", error);
     }
@@ -511,6 +512,7 @@ SaAisErrorT avd_clm_track_stop(AVD_CL_CB* cb) {
     if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT ||
         error == SA_AIS_ERR_UNAVAILABLE) {
       LOG_WA("Failed to stop cluster tracking %u", error);
+      error = SA_AIS_ERR_TRY_AGAIN;
     } else if (error == SA_AIS_ERR_NOT_EXIST) {
       /* track changes was not started or stopped successfully */
       LOG_WA("Failed to stop cluster tracking %u", error);
@@ -605,12 +607,6 @@ AvdJobDequeueResultT ClmTrackStart::exec(const AVD_CL_CB* cb) {
   } else if (rc == SA_AIS_ERR_TRY_AGAIN) {
     TRACE("TRY-AGAIN");
     res = JOB_ETRYAGAIN;
-  } else if (rc == SA_AIS_ERR_TIMEOUT) {
-    TRACE("TIMEOUT");
-    res = JOB_ETRYAGAIN;
-  } else if (rc == SA_AIS_ERR_UNAVAILABLE) {
-    TRACE("UNAVAILABLE");
-    res = JOB_ETRYAGAIN;
   } else {
     delete Fifo::dequeue();
     LOG_ER("%s: ClmTrackStart FAILED %u", __FUNCTION__, rc);
@@ -632,12 +628,6 @@ AvdJobDequeueResultT ClmTrackStop::exec(const AVD_CL_CB* cb) {
   } else if (rc == SA_AIS_ERR_TRY_AGAIN) {
     TRACE("TRY-AGAIN");
     res = JOB_ETRYAGAIN;
-  } else if (rc == SA_AIS_ERR_TIMEOUT) {
-    TRACE("TIMEOUT");
-    res = JOB_ETRYAGAIN;
-  } else if (rc == SA_AIS_ERR_UNAVAILABLE) {
-    TRACE("UNAVAILABLE");
-    res = JOB_ETRYAGAIN;
   } else {
     delete Fifo::dequeue();
     LOG_ER("%s: ClmTrackStop FAILED %u", __FUNCTION__, rc);
diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc
index c08033d..865d89d 100644
--- a/src/amf/amfd/role.cc
+++ b/src/amf/amfd/role.cc
@@ -1066,7 +1066,7 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) {
   /*  Mark AVD as Quiesced. */
   cb->avail_state_avd = SA_AMF_HA_QUIESCED;
 
-  if (avd_clm_track_stop(cb) != SA_AIS_OK)
+  if (avd_clm_track_stop(cb) == SA_AIS_ERR_TRY_AGAIN)
     Fifo::queue(new ClmTrackStop());
 
   /* Go ahead and set mds role as already the NCS SU has been switched */
@@ -1138,7 +1138,7 @@ uint32_t amfd_switch_qsd_stdby(AVD_CL_CB *cb) {
     avd_pg_node_csi_del_all(cb, avnd);
   }
 
-  if (avd_clm_track_stop(cb) != SA_AIS_OK)
+  if (avd_clm_track_stop(cb) == SA_AIS_ERR_TRY_AGAIN)
     Fifo::queue(new ClmTrackStop());
 
   LOG_NO("Controller switch over done");
@@ -1270,8 +1270,9 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) {
   if (NCSCC_RC_SUCCESS != avd_rde_set_role(SA_AMF_HA_ACTIVE)) {
     LOG_ER("rde role change failed from stdy -> Active");
   }
-  if (avd_clm_track_start(cb) != SA_AIS_OK) {
+  if (avd_clm_track_start(cb) == SA_AIS_ERR_TRY_AGAIN) {
     LOG_ER("Switch Standby --> Active, clm track start failed");
+    Fifo::queue(new ClmTrackStart());
     avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE);
     return NCSCC_RC_FAILURE;
   }
diff --git a/src/amf/amfd/sg_2n_fsm.cc b/src/amf/amfd/sg_2n_fsm.cc
index 44177d2..c7d5844 100644
--- a/src/amf/amfd/sg_2n_fsm.cc
+++ b/src/amf/amfd/sg_2n_fsm.cc
@@ -1767,7 +1767,7 @@ uint32_t SG_2N::susi_success_sg_realign(AVD_SU *su, AVD_SU_SI_REL *susi,
       if ((state == SA_AMF_HA_ACTIVE) &&
           (cb->node_id_avd == su->su_on_node->node_info.nodeId)) {
         /* This is as a result of failover, start CLM tracking*/
-        if (avd_clm_track_start(cb) != SA_AIS_OK)
+        if (avd_clm_track_start(cb) == SA_AIS_ERR_TRY_AGAIN)
           Fifo::queue(new ClmTrackStart());
       }
 
diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc
index 5f31194..610c205 100644
--- a/src/amf/amfd/sgproc.cc
+++ b/src/amf/amfd/sgproc.cc
@@ -2201,7 +2201,7 @@ void avd_node_down_mw_susi_failover(AVD_CL_CB *cb, AVD_AVND *avnd) {
        in avd_sg_2n_susi_sucss_sg_reln, so start here.*/
     if ((i_su->sg_of_su->sg_redundancy_model == SA_AMF_2N_REDUNDANCY_MODEL) &&
         (i_su->sg_of_su->sg_fsm_state == AVD_SG_FSM_STABLE))
-      if (avd_clm_track_start(cb) != SA_AIS_OK)
+      if (avd_clm_track_start(cb) == SA_AIS_ERR_TRY_AGAIN)
         Fifo::queue(new ClmTrackStart());
     /* Free all the SU SI assignments*/
 
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to