Sometimes the etcd server takes a long time to handle a request and the
"etcdctl get" command times out. Consensus interprets this as etcd possibly
being down and concludes that it needs to self-fence, even though the etcd
server is still handling the request.

The "etcdctl get" command in "watch" should be retried, and the dial
timeout should be increased.
---
 src/osaf/consensus/plugins/etcd3.plugin | 119 ++++++++++++++++++++------------
 1 file changed, 76 insertions(+), 43 deletions(-)

diff --git a/src/osaf/consensus/plugins/etcd3.plugin b/src/osaf/consensus/plugins/etcd3.plugin
index d92688557..56f15a06a 100644
--- a/src/osaf/consensus/plugins/etcd3.plugin
+++ b/src/osaf/consensus/plugins/etcd3.plugin
@@ -21,7 +21,7 @@ readonly takeover_request="takeover_request"
 readonly node_name_file="/etc/opensaf/node_name"
 readonly directory="/opensaf/"
 readonly etcd_options=""
-readonly etcd_timeout="3s"
+readonly etcd_timeout="5s"
 readonly heartbeat_interval=2
 
 export ETCDCTL_API=3
@@ -131,9 +131,13 @@ create_key() {
     put \""$directory$key"\" \""$value"\" "$lease_param"
 
   "
-  output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
-  if [[ "$output" == *"OK"* ]]; then
-    return 0
+  if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
+  then
+    if [[ "$output" == *"OK"* ]]; then
+      return 0
+    fi
+  else
+    return 3
   fi
 
   if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout get 
"$directory$key" | tail -n1)
@@ -227,9 +231,15 @@ lock() {
     put \""$directory$keyname"\" \""$owner"\"
 
   "
-  output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
-  if [[ "$output" == *"OK"* ]]; then
-    return 0
+  if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
+  then
+    if [[ "$output" == *"OK"* ]]; then
+      return 0
+    fi
+  else
+    # Lost connectivity with etcd server
+    echo "$output"
+    return 2
   fi
 
   # key already exists, make sure it's empty
@@ -238,9 +248,15 @@ lock() {
     put \""$directory$keyname"\" \""$owner"\"
 
   "
-  output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
-  if [[ "$output" == *"OK"* ]]; then
-    return 0
+  if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
+  then
+    if [[ "$output" == *"OK"* ]]; then
+      return 0
+    fi
+  else
+    # Lost connectivity with etcd server
+    echo "$output"
+    return 2
   fi
 
   current_owner=$(etcdctl $etcd_options --dial-timeout $etcd_timeout get 
"$directory$keyname" | tail -n1)
@@ -294,9 +310,15 @@ unlock() {
     put \""$directory$keyname"\" \"\"
 
     "
-    output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
-    if [[ "$output" == *"OK"* ]]; then
-      return 0
+    if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< 
"$transaction")
+    then
+      if [[ "$output" == *"OK"* ]]; then
+        return 0
+      fi
+    else
+      # Lost connectivity with etcd server
+      echo "$output"
+      return 2
     fi
 
     # failed! check we own the lock
@@ -332,41 +354,52 @@ unlock() {
 #   non-zero - failure
 watch() {
   readonly watch_key="$1"
-
-  # get baseline
-  orig_value=$(get "$watch_key")
-  result=$?
-
-  if [ "$result" -le "1" ]; then
-    while true
-    do
-      sleep $heartbeat_interval
-      current_value=$(get "$watch_key")
-      result=$?
-      if [ "$result" -gt "1" ]; then
-        # etcd down?
-        if [ "$watch_key" == "$takeover_request" ]; then
-          hostname=`cat $node_name_file`
-          echo "$hostname SC-0 10000000 UNDEFINED"
+  orig_value=0
+  num_tried=0
+
+  while true
+  do
+    # get baseline
+    output=$(get "$watch_key")
+    result=$?
+
+    if [ "$result" -le "1" ]; then
+      if [ "$orig_value" == "0" ]; then
+        orig_value=$output
+      fi
+      while true
+      do
+        sleep $heartbeat_interval
+        current_value=$(get "$watch_key")
+        result=$?
+        if [ "$result" -gt "1" ]; then
+          # Lost connectivity with etcd server. Try again
+          if [ "$watch_key" == "$takeover_request" ]; then
+            num_tried=1
+            break
+          else
+            return 1
+          fi
+        elif [ "$orig_value" != "$current_value" ]; then
+          echo $current_value
           return 0
-        else
-          return 1
         fi
-      elif [ "$orig_value" != "$current_value" ]; then
-        echo $current_value
+      done
+    else
+      # Lost connectivity with etcd server
+      num_tried=$((num_tried + 1))
+      if [ $num_tried -lt 2 ]; then
+        continue
+      fi
+      if [ "$watch_key" == "$takeover_request" ]; then
+        hostname=`cat $node_name_file`
+        echo "$hostname SC-0 10000000 UNDEFINED"
         return 0
+      else
+        return 1
       fi
-    done
-  else
-    # etcd down?
-    if [ "$watch_key" == "$takeover_request" ]; then
-      hostname=`cat $node_name_file`
-      echo "$hostname SC-0 10000000 UNDEFINED"
-      return 0
-    else
-      return 1
     fi
-  fi
+  done
 }
 
 # argument parsing
-- 
2.15.1



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to