In the event of a network partition that results in an etcd leader change, the 'get' API in the bigger partition is unavailable for a few seconds. Therefore, the SC in the bigger partition cannot promote and self-fences instead.
This patch adds etcd_tolerance_timeout so that the SC in the bigger partition can retry the promotion. However, the SC in the smaller partition meanwhile shares the same etcd_tolerance_timeout retries, hence the etcd_tolerance_timeout delays the self-fencing of the SC in the smaller partition. The patch therefore also checks the health of the SC's own endpoint, so that the etcd_tolerance_timeout retries are applied only while that endpoint is healthy. --- src/osaf/consensus/plugins/etcd3.plugin | 44 +++++++++++++++---------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/osaf/consensus/plugins/etcd3.plugin b/src/osaf/consensus/plugins/etcd3.plugin index 6252eedcb..34a975e05 100644 --- a/src/osaf/consensus/plugins/etcd3.plugin +++ b/src/osaf/consensus/plugins/etcd3.plugin @@ -23,6 +23,7 @@ readonly directory="/opensaf/" readonly etcd_options="" readonly etcd_timeout="3s" readonly heartbeat_interval=2 +readonly etcd_tolerance_timeout=6 export ETCDCTL_API=3 @@ -332,11 +333,10 @@ unlock() { # non-zero - failure watch() { readonly watch_key="$1" - # get baseline orig_value=$(get "$watch_key") result=$? - + tol_counter=0 if [ "$result" -le 1 ]; then if [ "$result" -eq 0 ] && [ "$watch_key" == "$takeover_request" ]; then state=$(echo $orig_value | awk '{print $4}') @@ -353,25 +353,33 @@ watch() { current_value=$(get "$watch_key") result=$? if [ "$result" -gt 1 ]; then - # etcd down? - if [ "$watch_key" == "$takeover_request" ]; then - hostname=`cat $node_name_file` - echo "$hostname SC-0 10000000 UNDEFINED" - return 0 - else - return 1 + # etcd down?, check the healthiness of self endpoint + $(etcdctl endpoint health >/dev/null 2>&1) + is_healthy=$? 
+ ((tol_counter=tol_counter+heartbeat_interval)) + if [ $tol_counter -ge $etcd_tolerance_timeout ] || [ $is_healthy -ne 0 ]; then + if [ "$watch_key" == "$takeover_request" ]; then + hostname=`cat $node_name_file` + echo "$hostname SC-0 10000000 UNDEFINED" + return 0 + else + return 1 + fi fi - elif [ "$orig_value" != "$current_value" ]; then - if [ "$watch_key" == "$takeover_request" ]; then - state=$(echo $orig_value | awk '{print $4}') - if [ "$state" == "REJECTED" ] && [ -z "$current_value" ]; then - # value is cleared after lease time, keep watching - orig_value="" - continue + else + tol_counter=0 + if [ "$orig_value" != "$current_value" ]; then + if [ "$watch_key" == "$takeover_request" ]; then + state=$(echo $orig_value | awk '{print $4}') + if [ "$state" == "REJECTED" ] && [ -z "$current_value" ]; then + # value is cleared after lease time, keep watching + orig_value="" + continue + fi fi + echo $current_value + return 0 fi - echo $current_value - return 0 fi done else -- 2.20.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel