[
https://issues.apache.org/jira/browse/FLINK-8974?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16555566#comment-16555566
]
ASF GitHub Bot commented on FLINK-8974:
---------------------------------------
Github user dawidwys commented on a diff in the pull request:
https://github.com/apache/flink/pull/6415#discussion_r205084257
--- Diff: flink-end-to-end-tests/test-scripts/test_ha_dataset.sh ---
@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+source "$(dirname "$0")"/common.sh
+
+TEST_PROGRAM_JAR=${END_TO_END_DIR}/flink-dataset-allround-test/target/DataSetAllroundTestProgram.jar
+
+JM_WATCHDOG_PID=0
+
+# flag indicating if we have already cleared up things after a test
+CLEARED=0
+
+# Kills the JM watchdog (if one was started) and records that the cleanup
+# has been done. Repeated calls are no-ops until CLEARED is reset to 0.
+# Globals: CLEARED (read, written), JM_WATCHDOG_PID (read)
+function stop_cluster_and_watchdog() {
+  # Cleanup was already performed for this test: nothing left to do.
+  if ! [ ${CLEARED} -eq 0 ]; then
+    return 0
+  fi
+
+  # A PID of 0 means that no watchdog was started.
+  [ ${JM_WATCHDOG_PID} -eq 0 ] || {
+    echo "Killing JM watchdog @ ${JM_WATCHDOG_PID}"
+    kill ${JM_WATCHDOG_PID} 2> /dev/null
+    # Reap the background job; errors are silenced like for the kill above.
+    wait ${JM_WATCHDOG_PID} 2> /dev/null
+  }
+
+  CLEARED=1
+}
+
+# Checks the Flink logs after an HA test run.
+# Globals:   FLINK_DIR (read)
+# Arguments: $1 - number of JM log files that are expected to contain a
+#                 'Recovered SubmittedJobGraph' entry
+# Outputs:   a FAILURE line on stdout for every check that does not hold
+# Exits:     with code 1 if at least one check failed, otherwise returns
+function verify_logs() {
+  local JM_FAILURES=$1
+  local EXIT_CODE=0
+  local ALERT_LINES
+  local RECOVERED_JMS
+
+  # verify that we have no alerts: every line in a *.out file is an alert
+  ALERT_LINES=$(cat "${FLINK_DIR}"/log/*.out | wc -l)
+  if [[ "${ALERT_LINES}" -ne 0 ]]; then
+    echo "FAILURE: Alerts found at the general purpose DataSet job."
+    EXIT_CODE=1
+  fi
+
+  # checks that all apart from the first JM recover the failed jobgraph.
+  # grep -l prints every matching log file exactly once, so the number of
+  # output lines is the number of JMs that recovered the job.
+  RECOVERED_JMS=$(grep -rl --include '*standalonesession*.log' \
+    'Recovered SubmittedJobGraph' "${FLINK_DIR}/log/" | wc -l)
+  if [[ "${RECOVERED_JMS}" -ne "${JM_FAILURES}" ]]; then
+    echo "FAILURE: A JM did not take over."
+    EXIT_CODE=1
+  fi
+
+  if [[ $EXIT_CODE != 0 ]]; then
+    echo "One or more tests FAILED."
+    exit $EXIT_CODE
+  fi
+}
+
+# Keeps the number of running JMs stable: once per second it counts the
+# StandaloneSessionClusterEntrypoint processes reported by jps and starts
+# one new JM on localhost for each missing one.
+# Loops forever; meant to be run in the background and killed by the caller.
+# Globals:   FLINK_DIR (read)
+# Arguments: $1 - number of JMs that should be running
+#            $2 - port handed to 'jobmanager.sh start'
+function jm_watchdog() {
+  local EXPECTED_JMS=$1
+  local IP_PORT=$2
+  local RUNNING_JMS
+  local MISSING_JMS
+  local c
+
+  while true; do
+    # grep -c prints the number of matching lines (0 if there are none)
+    RUNNING_JMS=$(jps | grep -c 'StandaloneSessionClusterEntrypoint')
+    MISSING_JMS=$((EXPECTED_JMS - RUNNING_JMS))
+    for (( c = 0; c < MISSING_JMS; c++ )); do
+      # IP_PORT stays unquoted so that an empty value is dropped instead
+      # of being passed on as an empty argument
+      "$FLINK_DIR"/bin/jobmanager.sh start "localhost" ${IP_PORT}
+    done
+    sleep 1
+  done
+}
+
+# Kills the first StandaloneSessionClusterEntrypoint process listed by jps.
+# Outputs: the PID of the killed JM, or a notice if no JM is running
+# Returns: 1 if no JM process was found, otherwise the status of the echo
+function kill_jm {
+  local JM_PIDS
+  local PID
+
+  # keep only the first space-separated field (the PID) of every JM line
+  JM_PIDS=($(jps | grep 'StandaloneSessionClusterEntrypoint' \
+    | cut -d " " -f 1))
+
+  # without this guard 'kill -9' would be called without any PID
+  if [ ${#JM_PIDS[@]} -eq 0 ]; then
+    echo "No running JM found to kill."
+    return 1
+  fi
+
+  PID=${JM_PIDS[0]}
+  # SIGKILL: the JM gets no chance to shut down cleanly
+  kill -9 "${PID}"
+
+  echo "Killed JM @ ${PID}"
+}
+
+function run_ha_test() {
+ local PARALLELISM=$1
+
+ local JM_KILLS=3
+
+ CLEARED=0
+ mkdir -p ${TEST_DATA_DIR}/control
+ touch ${TEST_DATA_DIR}/control/test.txt
+
+ # start the cluster on HA mode
+ start_ha_cluster
+
+ echo "Running on HA mode: parallelism=${PARALLELISM}."
+
+ # submit a job in detached mode and let it run
+ local JOB_ID=$($FLINK_DIR/bin/flink run -d -p ${PARALLELISM} \
+ $TEST_PROGRAM_JAR \
+ --loadFactor 4 \
+ --outputPath $TEST_DATA_DIR/out/dataset_allround \
+ --source ${TEST_DATA_DIR}/control/test.txt \
+ | grep "Job has been submitted with JobID" | sed 's/.* //g')
+
+ wait_job_running ${JOB_ID}
+
+ # start the watchdog that keeps the number of JMs stable
+ jm_watchdog 1 "8081" &
+ JM_WATCHDOG_PID=$!
+ echo "Running JM watchdog @ ${JM_WATCHDOG_PID}"
+
+ for (( c=0; c<${JM_KILLS}; c++ )); do
+ # kill the JM and wait for watchdog to
+ # create a new one which will take over
+ kill_jm
+ sleep 20
--- End diff --
Actually I would even remove the sleep completely.
> End-to-end test: Run general purpose DataSet job with failures in standalone
> mode
> ---------------------------------------------------------------------------------
>
> Key: FLINK-8974
> URL: https://issues.apache.org/jira/browse/FLINK-8974
> Project: Flink
> Issue Type: Sub-task
> Components: Tests
> Affects Versions: 1.5.0
> Reporter: Till Rohrmann
> Assignee: Tuo Wang
> Priority: Blocker
> Labels: pull-request-available
> Fix For: 1.6.0
>
>
> Similar to FLINK-8973, we should set up an end-to-end test where we run the
> general purpose DataSet job from FLINK-8972 in an HA standalone setting with
> failures and process kills.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)