[FLINK-9674][tests] Replace hard-coded sleeps in QS E2E test

This closes #6216.
This closes #6025.
This closes #5297.
This closes #6211.
This closes #5899.
This closes #5888.
This closes #5901.

Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/41277f6b
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/41277f6b
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/41277f6b

Branch: refs/heads/master
Commit: 41277f6b7447f3542e439b7ae82b99be862df7c2
Parents: 02f016e
Author: zentol <ches...@apache.org>
Authored: Wed Jun 27 13:01:06 2018 +0200
Committer: zentol <ches...@apache.org>
Committed: Tue Jul 3 11:44:16 2018 +0200

----------------------------------------------------------------------
 flink-end-to-end-tests/test-scripts/common.sh   | 20 +++++++++++++++++++-
 .../test_queryable_state_restart_tm.sh          | 20 ++++++++++++++------
 2 files changed, 33 insertions(+), 7 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/flink/blob/41277f6b/flink-end-to-end-tests/test-scripts/common.sh
----------------------------------------------------------------------
diff --git a/flink-end-to-end-tests/test-scripts/common.sh b/flink-end-to-end-tests/test-scripts/common.sh
index 610be2d..4ed83b0 100644
--- a/flink-end-to-end-tests/test-scripts/common.sh
+++ b/flink-end-to-end-tests/test-scripts/common.sh
@@ -260,9 +260,27 @@ function stop_cluster {
   fi
 }
 
+function wait_for_job_state_transition {
+  local job=$1
+  local initial_state=$2
+  local next_state=$3
+
+  echo "Waiting for job ($job) to switch from state ${initial_state} to state ${next_state} ..."
+
+  while : ; do
+    N=$(grep -o "($job) switched from state ${initial_state} to ${next_state}" $FLINK_DIR/log/*standalonesession*.log | tail -1)
+
+    if [[ -z $N ]]; then
+      sleep 1
+    else
+      break
+    fi
+  done
+}
+
 function wait_job_running {
   for i in {1..10}; do
-    JOB_LIST_RESULT=$("$FLINK_DIR"/bin/flink list | grep "$1")
+    JOB_LIST_RESULT=$("$FLINK_DIR"/bin/flink list -r | grep "$1")
 
     if [[ "$JOB_LIST_RESULT" == "" ]]; then
       echo "Job ($1) is not yet running."

http://git-wip-us.apache.org/repos/asf/flink/blob/41277f6b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
----------------------------------------------------------------------
diff --git a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
index f3bdcd3..d8d08df 100755
--- a/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
+++ b/flink-end-to-end-tests/test-scripts/test_queryable_state_restart_tm.sh
@@ -55,6 +55,11 @@ function run_test() {
     clean_log_files
     clean_stdout_files
 
+    backup_config
+    # speeds up TM loss detection
+    set_conf "heartbeat.interval" "2000"
+    set_conf "heartbeat.timeout" "10000"
+
     link_queryable_state_lib
     start_cluster
 
@@ -85,20 +90,23 @@ function run_test() {
         exit 1
     fi
 
-    local current_num_checkpoints=current_num_checkpoints$(get_completed_number_of_checkpoints ${JOB_ID})
-
     kill_random_taskmanager
 
     latest_snapshot_count=$(cat $FLINK_DIR/log/*out* | grep "on snapshot" | tail -n 1 | awk '{print $4}')
     echo "Latest snapshot count was ${latest_snapshot_count}"
 
-    sleep 65 # this is a little longer than the heartbeat timeout so that the TM is gone
+    # wait until the TM loss was detected
+    wait_for_job_state_transition ${JOB_ID} "RESTARTING" "CREATED"
 
     start_and_wait_for_tm
 
+    wait_job_running ${JOB_ID}
+
+    local current_num_checkpoints="$(get_completed_number_of_checkpoints ${JOB_ID})"
+
     # wait for some more checkpoint to have happened
-    ((current_num_checkpoints+=2))
-    wait_for_number_of_checkpoints ${JOB_ID} ${current_num_checkpoints} 60
+    local expected_num_checkpoints=$((current_num_checkpoints + 5))
+
+    wait_for_number_of_checkpoints ${JOB_ID} ${expected_num_checkpoints} 60
 
     local num_entries_in_map_state_after=$(java -jar ${QUERYABLE_STATE_CLIENT_JAR} \
         --host ${SERVER} \
@@ -135,7 +143,7 @@ function wait_for_number_of_checkpoints {
     local timeout=$3
     local count=0
 
-    echo "Starting to wait for checkpoints"
+    echo "Starting to wait for completion of ${expected_num_checkpoints} checkpoints"
     while (($(get_completed_number_of_checkpoints ${job_id}) < ${expected_num_checkpoints})); do
 
         if [[ ${count} -gt ${timeout} ]]; then