Sometimes the etcd server takes a long time to handle a request, and the "etcdctl get" command times out. Consensus then assumes that etcd may be down and that the node needs to self-fence, even though the etcd server is still handling the request.
The "etcdctl get" command in "watch" should be retried on failure, and the dial timeout should be increased. --- src/osaf/consensus/plugins/etcd3.plugin | 119 ++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/src/osaf/consensus/plugins/etcd3.plugin b/src/osaf/consensus/plugins/etcd3.plugin index d92688557..56f15a06a 100644 --- a/src/osaf/consensus/plugins/etcd3.plugin +++ b/src/osaf/consensus/plugins/etcd3.plugin @@ -21,7 +21,7 @@ readonly takeover_request="takeover_request" readonly node_name_file="/etc/opensaf/node_name" readonly directory="/opensaf/" readonly etcd_options="" -readonly etcd_timeout="3s" +readonly etcd_timeout="5s" readonly heartbeat_interval=2 export ETCDCTL_API=3 @@ -131,9 +131,13 @@ create_key() { put \""$directory$key"\" \""$value"\" "$lease_param" " - output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") - if [[ "$output" == *"OK"* ]]; then - return 0 + if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") + then + if [[ "$output" == *"OK"* ]]; then + return 0 + fi + else + return 3 fi if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout get "$directory$key" | tail -n1) @@ -227,9 +231,15 @@ lock() { put \""$directory$keyname"\" \""$owner"\" " - output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") - if [[ "$output" == *"OK"* ]]; then - return 0 + if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") + then + if [[ "$output" == *"OK"* ]]; then + return 0 + fi + else + # Lost connectivity with etcd server + echo "$output" + return 2 fi # key already exists, make sure it's empty @@ -238,9 +248,15 @@ lock() { put \""$directory$keyname"\" \""$owner"\" " - output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") - if [[ "$output" == *"OK"* ]]; then - return 0 + if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") + 
then + if [[ "$output" == *"OK"* ]]; then + return 0 + fi + else + # Lost connectivity with etcd server + echo "$output" + return 2 fi current_owner=$(etcdctl $etcd_options --dial-timeout $etcd_timeout get "$directory$keyname" | tail -n1) @@ -294,9 +310,15 @@ unlock() { put \""$directory$keyname"\" \"\" " - output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") - if [[ "$output" == *"OK"* ]]; then - return 0 + if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<< "$transaction") + then + if [[ "$output" == *"OK"* ]]; then + return 0 + fi + else + # Lost connectivity with etcd server + echo "$output" + return 2 fi # failed! check we own the lock @@ -332,41 +354,52 @@ unlock() { # non-zero - failure watch() { readonly watch_key="$1" - - # get baseline - orig_value=$(get "$watch_key") - result=$? - - if [ "$result" -le "1" ]; then - while true - do - sleep $heartbeat_interval - current_value=$(get "$watch_key") - result=$? - if [ "$result" -gt "1" ]; then - # etcd down? - if [ "$watch_key" == "$takeover_request" ]; then - hostname=`cat $node_name_file` - echo "$hostname SC-0 10000000 UNDEFINED" + orig_value=0 + num_tried=0 + + while true + do + # get baseline + output=$(get "$watch_key") + result=$? + + if [ "$result" -le "1" ]; then + if [ "$orig_value" == "0" ]; then + orig_value=$output + fi + while true + do + sleep $heartbeat_interval + current_value=$(get "$watch_key") + result=$? + if [ "$result" -gt "1" ]; then + # Lost connectivity with etcd server. 
Try again + if [ "$watch_key" == "$takeover_request" ]; then + num_tried=1 + break + else + return 1 + fi + elif [ "$orig_value" != "$current_value" ]; then + echo $current_value return 0 - else - return 1 fi - elif [ "$orig_value" != "$current_value" ]; then - echo $current_value + done + else + # Lost connectivity with etcd server + num_tried=$((num_tried + 1)) + if [ $num_tried -lt 2 ]; then + continue + fi + if [ "$watch_key" == "$takeover_request" ]; then + hostname=`cat $node_name_file` + echo "$hostname SC-0 10000000 UNDEFINED" return 0 + else + return 1 fi - done - else - # etcd down? - if [ "$watch_key" == "$takeover_request" ]; then - hostname=`cat $node_name_file` - echo "$hostname SC-0 10000000 UNDEFINED" - return 0 - else - return 1 fi - fi + done } # argument parsing -- 2.15.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel