When a network partition triggers an etcd leader change, the 'get' API
in the bigger partition is unavailable for a few seconds. As a result,
the SC in the bigger partition cannot promote and self-fences
instead.

This patch adds etcd_tolerance_timeout so that the SC in the bigger
partition can retry the promotion. However, the SC in the smaller
partition would get the same etcd_tolerance_timeout retries, which
would delay its self-fencing. The patch therefore checks the health of
the SC's own etcd endpoint and applies the etcd_tolerance_timeout
retries only while that endpoint is healthy.
---
 src/osaf/consensus/plugins/etcd3.plugin | 44 +++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/osaf/consensus/plugins/etcd3.plugin b/src/osaf/consensus/plugins/etcd3.plugin
index 6252eedcb..34a975e05 100644
--- a/src/osaf/consensus/plugins/etcd3.plugin
+++ b/src/osaf/consensus/plugins/etcd3.plugin
@@ -23,6 +23,7 @@ readonly directory="/opensaf/"
 readonly etcd_options=""
 readonly etcd_timeout="3s"
 readonly heartbeat_interval=2
+readonly etcd_tolerance_timeout=6
 
 export ETCDCTL_API=3
 
@@ -332,11 +333,10 @@ unlock() {
 #   non-zero - failure
 watch() {
   readonly watch_key="$1"
-
   # get baseline
   orig_value=$(get "$watch_key")
   result=$?
-
+  tol_counter=0
   if [ "$result" -le 1 ]; then
       if [ "$result" -eq 0 ] && [ "$watch_key" == "$takeover_request" ]; then
         state=$(echo $orig_value | awk '{print $4}')
@@ -353,25 +353,33 @@ watch() {
       current_value=$(get "$watch_key")
       result=$?
       if [ "$result" -gt 1 ]; then
-        # etcd down?
-        if [ "$watch_key" == "$takeover_request" ]; then
-          hostname=`cat $node_name_file`
-          echo "$hostname SC-0 10000000 UNDEFINED"
-          return 0
-        else
-          return 1
+        # etcd down?, check the healthiness of self endpoint
+        $(etcdctl endpoint health >/dev/null 2>&1)
+        is_healthy=$?
+        ((tol_counter=tol_counter+heartbeat_interval))
+        if [ $tol_counter -ge $etcd_tolerance_timeout ] || [ $is_healthy -ne 0 ]; then
+          if [ "$watch_key" == "$takeover_request" ]; then
+            hostname=`cat $node_name_file`
+            echo "$hostname SC-0 10000000 UNDEFINED"
+            return 0
+          else
+            return 1
+          fi
         fi
-      elif [ "$orig_value" != "$current_value" ]; then
-        if [ "$watch_key" == "$takeover_request" ]; then
-          state=$(echo $orig_value | awk '{print $4}')
-          if [ "$state" == "REJECTED" ] && [ -z "$current_value" ]; then
-            # value is cleared after lease time, keep watching
-            orig_value=""
-            continue
+      else
+        tol_counter=0
+        if [ "$orig_value" != "$current_value" ]; then
+          if [ "$watch_key" == "$takeover_request" ]; then
+            state=$(echo $orig_value | awk '{print $4}')
+            if [ "$state" == "REJECTED" ] && [ -z "$current_value" ]; then
+              # value is cleared after lease time, keep watching
+              orig_value=""
+              continue
+            fi
           fi
+          echo $current_value
+          return 0
         fi
-        echo $current_value
-        return 0
       fi
     done
   else
-- 
2.20.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to