This is an automated email from the ASF dual-hosted git repository.
wutao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pegasus.git
The following commit(s) were added to refs/heads/master by this push:
new 17f6a55 fix(script): operation return before the cluster balance when
using pegasus_rolling_update.sh (#585)
17f6a55 is described below
commit 17f6a5540a9797d6b18e15bb74c067b735b6417d
Author: Wu Tao <[email protected]>
AuthorDate: Tue Aug 25 13:30:50 2020 +0800
fix(script): operation return before the cluster balance when using
pegasus_rolling_update.sh (#585)
---
scripts/pegasus_add_node_list.sh | 72 +++-----------------
scripts/pegasus_check_arguments.sh | 6 --
scripts/pegasus_offline_node_list.sh | 10 ++-
scripts/pegasus_rebalance_cluster.sh | 128 +++++++++++++++++++++++++++++++++++
scripts/pegasus_rolling_update.sh | 43 ++----------
5 files changed, 149 insertions(+), 110 deletions(-)
diff --git a/scripts/pegasus_add_node_list.sh b/scripts/pegasus_add_node_list.sh
index 1f358d8..a8ebaee 100755
--- a/scripts/pegasus_add_node_list.sh
+++ b/scripts/pegasus_add_node_list.sh
@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
exit 1
fi
+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+add_node_start_time=$((`date +%s`))
+echo
+
cluster=$1
meta_list=$2
replica_task_id_list=$3
@@ -49,70 +55,10 @@ do
echo
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
done
-echo "Set meta.lb.only_move_primary true"
-echo "This remote-command tells the meta-server to ignore copying primaries
during rebalancing."
-echo "So the following steps only include move_primary and copy_secondary."
-echo "remote_command -l $pmeta meta.lb.only_move_primary true" | ./run.sh
shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
-set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
-if [ $set_ok -ne 1 ]; then
- echo "ERROR: meta.lb.only_move_primary true"
- exit 1
-fi
-echo
-
-echo "Set meta level to lively..."
-echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
-set_ok=`grep 'control meta level ok'
/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
-if [ $set_ok -ne 1 ]; then
- echo "ERROR: set meta level to lively failed"
- exit 1
-fi
-
-echo "Wait cluster to become balanced..."
-echo "Wait for 3 minutes to do load balance..."
-sleep 180
-while true; do
- op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list |
grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
- if [ -z "op_count" ]; then
- break
- fi
- if [ $op_count -eq 0 ]; then
- echo "Cluster may be balanced, try wait 30 seconds..."
- sleep 30
- op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list |
grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
- if [ $op_count -eq 0 ]; then
- echo "Cluster becomes balanced."
- break
- fi
- else
- echo "Still $op_count balance operations to do..."
- sleep 1
- fi
-done
-echo
-
-
-
-echo "Set meta level to steady..."
-echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level
-set_ok=`grep 'control meta level ok'
/tmp/$UID.$PID.pegasus.add_node_list.set_meta_level | wc -l`
-if [ $set_ok -ne 1 ]; then
- echo "ERROR: set meta level to steady failed"
- exit 1
-fi
-
-echo "Set meta.lb.only_move_primary false"
-echo "This remote-command tells the meta-server to rebalance with copying
primaries."
-echo "remote_command -l $pmeta meta.lb.only_move_primary false" | ./run.sh
shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.add_node_list.only_move_primary
-set_ok=`grep OK /tmp/$UID.$PID.pegasus.add_node_list.only_move_primary | wc -l`
-if [ $set_ok -ne 1 ]; then
- echo "ERROR: meta.lb.only_move_primary false"
- exit 1
-fi
-echo
+./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list true
echo "Finish time: `date`"
-all_finish_time=$((`date +%s`))
-echo "add node list done, elasped time is $((all_finish_time -
all_start_time)) seconds."
+add_node_finish_time=$((`date +%s`))
+echo "add node list done, elasped time is $((add_node_finish_time -
add_node_start_time)) seconds."
rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
diff --git a/scripts/pegasus_check_arguments.sh b/scripts/pegasus_check_arguments.sh
index de22f48..5b3f37b 100755
--- a/scripts/pegasus_check_arguments.sh
+++ b/scripts/pegasus_check_arguments.sh
@@ -31,12 +31,6 @@ if [ $? -ne 0 ]; then
exit 1
fi
-echo "UID=$UID"
-echo "PID=$PID"
-echo "Start time: `date`"
-all_start_time=$((`date +%s`))
-echo
-
id_list_file="/tmp/$UID.$PID.pegasus.$check_type.id_list"
echo "Generating $id_list_file..."
minos_show_replica $cluster $id_list_file
diff --git a/scripts/pegasus_offline_node_list.sh b/scripts/pegasus_offline_node_list.sh
index f8883f2..093fe16 100755
--- a/scripts/pegasus_offline_node_list.sh
+++ b/scripts/pegasus_offline_node_list.sh
@@ -14,6 +14,12 @@ if [ $# -le 2 ]; then
exit 1
fi
+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+offline_node_start_time=$((`date +%s`))
+echo
+
cluster=$1
meta_list=$2
replica_task_id_list=$3
@@ -68,8 +74,8 @@ if [ $set_ok -ne 1 ]; then
exit 1
fi
-all_finish_time=$((`date +%s`))
+offline_finish_time=$((`date +%s`))
echo "Offline replica server task list done."
-echo "Elapsed time is $((all_finish_time - all_start_time)) seconds."
+echo "Elapsed time is $((offline_finish_time - offline_node_start_time))
seconds."
rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
diff --git a/scripts/pegasus_rebalance_cluster.sh b/scripts/pegasus_rebalance_cluster.sh
new file mode 100755
index 0000000..99112fe
--- /dev/null
+++ b/scripts/pegasus_rebalance_cluster.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+#
+# Pegasus cluster rebalance
+#
+
+PID=$$
+
+if [ $# -le 1 ]; then
+ echo "USAGE: $0 <cluster-name> <cluster-meta-list>
<only-move-primary>(default false)"
+ echo
+ echo "for example:"
+ echo " $0 onebox 127.0.0.1:34601,127.0.0.1:34602 true"
+ echo
+ exit 1
+fi
+
+cluster=$1
+meta_list=$2
+
+if [ -z $3 ]; then
+ only_move_primary=false
+else
+ only_move_primary=$3
+fi
+
+pwd="$( cd "$( dirname "$0" )" && pwd )"
+shell_dir="$( cd $pwd/.. && pwd )"
+cd $shell_dir
+
+source ./scripts/minos_common.sh
+find_cluster $cluster
+if [ $? -ne 0 ]; then
+ echo "ERROR: cluster \"$cluster\" not found"
+ exit 1
+fi
+
+echo "UID=$UID"
+echo "PID=$PID"
+echo "Start time: `date`"
+rebalance_start_time=$((`date +%s`))
+echo
+
+echo "Generating /tmp/$UID.$PID.pegasus.rebalance.cluster_info..."
+echo cluster_info | ./run.sh shell --cluster $meta_list 2>&1 | sed 's/ *$//'
>/tmp/$UID.$PID.pegasus.rebalance.cluster_info
+cname=`grep zookeeper_root /tmp/$UID.$PID.pegasus.rebalance.cluster_info |
grep -o '/[^/]*$' | grep -o '[^/]*$'`
+if [ "$cname" != "$cluster" ]; then
+ echo "ERROR: cluster name and meta list not matched"
+ exit 1
+fi
+pmeta=`grep primary_meta_server /tmp/$UID.$PID.pegasus.rebalance.cluster_info
| grep -o '[0-9.:]*$'`
+if [ "$pmeta" == "" ]; then
+ echo "ERROR: extract primary_meta_server by shell failed"
+ exit 1
+fi
+
+if [ "$only_move_primary" == "true" ]; then
+ echo "Set meta.lb.only_move_primary true"
+ echo "This remote-command tells the meta-server to ignore copying primaries
during rebalancing."
+ echo "So the following steps only include move_primary and copy_secondary."
+ echo "remote_command -l $pmeta meta.lb.only_move_primary true" | ./run.sh
shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.only_move_primary
+ set_ok=`grep OK /tmp/$UID.$PID.pegasus.rebalance.only_move_primary | wc -l`
+ if [ $set_ok -ne 1 ]; then
+ echo "ERROR: meta.lb.only_move_primary true"
+ exit 1
+ fi
+fi
+echo
+
+echo "Set meta level to lively..."
+echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.rebalance.set_meta_level
+set_ok=`grep 'control meta level ok'
/tmp/$UID.$PID.pegasus.rebalance.set_meta_level | wc -l`
+if [ $set_ok -ne 1 ]; then
+ echo "ERROR: set meta level to lively failed"
+ exit 1
+fi
+
+echo "Wait cluster to become balanced..."
+echo "Wait for 3 minutes to do load balance..."
+sleep 180
+## Number of extra re-checks to perform once op_count reaches 0, in case
+## the cluster is in fact still unbalanced. Each re-check waits 30 seconds.
+op_count_check_remain_times=1
+while true; do
+ op_count=$(echo "cluster_info" | ./run.sh shell --cluster $meta_list |
grep balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2)
+ if [ -z $op_count ]; then
+ break
+ fi
+
+ if [ $op_count -eq 0 ]; then
+ if [ $op_count_check_remain_times -eq 0 ]; then
+ break
+ else
+ echo "Cluster may be balanced, try wait 30 seconds..."
+ ((op_count_check_remain_times--))
+ sleep 30
+ fi
+ else
+ echo "Still $op_count balance operations to do..."
+ sleep 10
+ fi
+done
+echo
+
+echo "Set meta level to steady..."
+echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.rebalance.set_meta_level
+set_ok=`grep 'control meta level ok'
/tmp/$UID.$PID.pegasus.rebalance.set_meta_level | wc -l`
+if [ $set_ok -ne 1 ]; then
+ echo "ERROR: set meta level to steady failed"
+ exit 1
+fi
+
+if [ "$only_move_primary" == "true" ]; then
+ echo "Set meta.lb.only_move_primary false"
+ echo "This remote-command tells the meta-server to rebalance with copying
primaries."
+ echo "remote_command -l $pmeta meta.lb.only_move_primary false" | ./run.sh
shell --cluster $meta_list &>/tmp/$UID.$PID.pegasus.rebalance.only_move_primary
+ set_ok=`grep OK /tmp/$UID.$PID.pegasus.rebalance.only_move_primary | wc -l`
+ if [ $set_ok -ne 1 ]; then
+ echo "ERROR: meta.lb.only_move_primary false"
+ exit 1
+ fi
+ echo
+fi
+
+echo "Finish time: `date`"
+rebalance_finish_time=$((`date +%s`))
+echo "rebalance done, elasped time is $((rebalance_finish_time -
rebalance_start_time)) seconds."
+
+rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
diff --git a/scripts/pegasus_rolling_update.sh b/scripts/pegasus_rolling_update.sh
index 9523119..a7ba7f5 100755
--- a/scripts/pegasus_rolling_update.sh
+++ b/scripts/pegasus_rolling_update.sh
@@ -46,7 +46,7 @@ fi
echo "UID=$UID"
echo "PID=$PID"
echo "Start time: `date`"
-all_start_time=$((`date +%s`))
+rolling_start_time=$((`date +%s`))
echo
rs_list_file="/tmp/$UID.$PID.pegasus.rolling_update.rs.list"
@@ -279,46 +279,11 @@ if [ "$type" = "all" ]; then
echo "Rolling update collectors done."
echo
- echo "Set meta level to lively..."
- echo "set_meta_level lively" | ./run.sh shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
- set_ok=`grep 'control meta level ok'
/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
- if [ $set_ok -ne 1 ]; then
- echo "ERROR: set meta level to lively failed"
- exit 1
- fi
- echo
-
- echo "Wait cluster to become balanced..."
- echo "Wait for 3 minutes to do load balance..."
- sleep 180
- while true
- do
- op_count=`echo "cluster_info" | ./run.sh shell --cluster $meta_list | grep
balance_operation_count | grep -o 'total=[0-9][0-9]*' | cut -d= -f2`
- if [ -z "op_count" ]; then
- break
- fi
- if [ $op_count -eq 0 ]; then
- echo "Cluster becomes balanced."
- break
- else
- echo "Still $op_count balance operations to do..."
- sleep 10
- fi
- done
- echo
-
- echo "Set meta level to steady..."
- echo "set_meta_level steady" | ./run.sh shell --cluster $meta_list
&>/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level
- set_ok=`grep 'control meta level ok'
/tmp/$UID.$PID.pegasus.rolling_update.set_meta_level | wc -l`
- if [ $set_ok -ne 1 ]; then
- echo "ERROR: set meta level to steady failed"
- exit 1
- fi
- echo
+ ./scripts/pegasus_rebalance_cluster.sh $cluster $meta_list
fi
echo "Finish time: `date`"
-all_finish_time=$((`date +%s`))
-echo "Rolling update $type done, elasped time is $((all_finish_time -
all_start_time)) seconds."
+rolling_finish_time=$((`date +%s`))
+echo "Rolling update $type done, elasped time is $((rolling_finish_time -
rolling_start_time)) seconds."
rm -f /tmp/$UID.$PID.pegasus.* &>/dev/null
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]