[FLINK-9674][tests] Replace hard-coded sleeps in QS E2E test

This closes #6216.
This closes #6025.
This closes #5297.
This closes #6211.
This closes #5899.
This closes #5888.
This closes #5901.


Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/41277f6b
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/41277f6b
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/41277f6b

Branch: refs/heads/master
Commit: 41277f6b7447f3542e439b7ae82b99be862df7c2
Parents: 02f016e
Author: zentol <ches...@apache.org>
Authored: Wed Jun 27 13:01:06 2018 +0200
Committer: zentol <ches...@apache.org>
Committed: Tue Jul 3 11:44:16 2018 +0200

----------------------------------------------------------------------
 flink-end-to-end-tests/test-scripts/common.sh   | 20 +++++++++++++++++++-
 .../test_queryable_state_restart_tm.sh          | 20 ++++++++++++++------
 2 files changed, 33 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/flink/blob/41277f6b/flink-end-to-end-tests/test-scripts/common.sh
----------------------------------------------------------------------
diff --git a/flink-end-to-end-tests/test-scripts/common.sh b/flink-end-to-end-tests/test-scripts/common.sh
index 610be2d..4ed83b0 100644
--- a/flink-end-to-end-tests/test-scripts/common.sh
+++ b/flink-end-to-end-tests/test-scripts/common.sh
@@ -260,9 +260,27 @@ function stop_cluster {
   fi
 }
 
+function wait_for_job_state_transition {
+  local job=$1
+  local initial_state=$2
+  local next_state=$3
+    
+  echo "Waiting for job ($job) to switch from state ${initial_state} to state ${next_state} ..."
+
+  while : ; do
+    N=$(grep -o "($job) switched from state ${initial_state} to ${next_state}" $FLINK_DIR/log/*standalonesession*.log | tail -1)
+
+    if [[ -z $N ]]; then
+      sleep 1
+    else
+      break
+    fi
+  done
+}
+
 function wait_job_running {
   for i in {1..10}; do
-    JOB_LIST_RESULT=$("$FLINK_DIR"/bin/flink list | grep "$1")
+    JOB_LIST_RESULT=$("$FLINK_DIR"/bin/flink list -r | grep "$1")
 
     if [[ "$JOB_LIST_RESULT" == "" ]]; then
       echo "Job ($1) is not yet running."

http://git-wip-us.apache.org/repos/asf/flink/blob/41277f6b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
----------------------------------------------------------------------
diff --git a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
index f3bdcd3..d8d08df 100755
--- a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
+++ b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
@@ -55,6 +55,11 @@ function run_test() {
     clean_log_files
     clean_stdout_files
 
+    backup_config
+    # speeds up TM loss detection
+    set_conf "heartbeat.interval" "2000"
+    set_conf "heartbeat.timeout" "10000"
+
     link_queryable_state_lib
     start_cluster
 
@@ -85,20 +90,23 @@ function run_test() {
         exit 1
     fi
 
-    local current_num_checkpoints=current_num_checkpoints$(get_completed_number_of_checkpoints ${JOB_ID})
-
     kill_random_taskmanager
 
     latest_snapshot_count=$(cat $FLINK_DIR/log/*out* | grep "on snapshot" | tail -n 1 | awk '{print $4}')
     echo "Latest snapshot count was ${latest_snapshot_count}"
 
-    sleep 65 # this is a little longer than the heartbeat timeout so that the TM is gone
+    # wait until the TM loss was detected
+    wait_for_job_state_transition ${JOB_ID} "RESTARTING" "CREATED"
 
     start_and_wait_for_tm
 
+    wait_job_running ${JOB_ID}
+
+    local current_num_checkpoints="$(get_completed_number_of_checkpoints ${JOB_ID})"
     # wait for some more checkpoint to have happened
-    ((current_num_checkpoints+=2))
-    wait_for_number_of_checkpoints ${JOB_ID} ${current_num_checkpoints} 60
+    local expected_num_checkpoints=$((current_num_checkpoints + 5))
+
+    wait_for_number_of_checkpoints ${JOB_ID} ${expected_num_checkpoints} 60
 
     local num_entries_in_map_state_after=$(java -jar ${QUERYABLE_STATE_CLIENT_JAR} \
         --host ${SERVER} \
@@ -135,7 +143,7 @@ function wait_for_number_of_checkpoints {
     local timeout=$3
     local count=0
 
-    echo "Starting to wait for checkpoints"
+    echo "Starting to wait for completion of ${expected_num_checkpoints} checkpoints"
     while (($(get_completed_number_of_checkpoints ${job_id}) < ${expected_num_checkpoints})); do
 
         if [[ ${count} -gt ${timeout} ]]; then

Reply via email to