Repository: impala
Updated Branches:
  refs/heads/2.x eda1f33cd -> 0ba181e5b
IMPALA-6394: Restart HDFS when blocks are under replicated

HDFS sometimes fails to fully replicate all the blocks in 30 seconds and
no progress is made. This patch tries to restart HDFS several times
before aborting the data loading.

Change-Id: Iefd4c2fc6c287f054e385de52bdc42b0bdbd7915
Reviewed-on: http://gerrit.cloudera.org:8080/9469
Reviewed-by: Alex Behm <alex.b...@cloudera.com>
Tested-by: Impala Public Jenkins

Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/10fced44
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/10fced44
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/10fced44

Branch: refs/heads/2.x
Commit: 10fced446c7b6f78b86ff8396186101e094a0ec6
Parents: eda1f33
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Fri Mar 2 14:13:49 2018 -0800
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Fri Mar 9 23:10:17 2018 +0000

----------------------------------------------------------------------
 testdata/bin/create-load-data.sh | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/impala/blob/10fced44/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 404bdfe..787baca 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -450,20 +450,26 @@ function copy-and-load-ext-data-source {
 }
 
 function wait-hdfs-replication {
-  FAIL_COUNT=0
-  while [[ "$FAIL_COUNT" -ne "6" ]] ; do
+  MAX_RETRIES=6
+  for ((RESTART_COUNT = 0; RESTART_COUNT <= MAX_RETRIES; ++RESTART_COUNT)); do
+    sleep "$((RESTART_COUNT * 10))"
     FSCK_OUTPUT="$(hdfs fsck /test-warehouse)"
     echo "$FSCK_OUTPUT"
     if grep "Under-replicated blocks:[[:space:]]*0" <<< "$FSCK_OUTPUT"; then
+      # All the blocks are fully-replicated. The data loading can continue.
       return
     fi
-    let FAIL_COUNT="$FAIL_COUNT"+1
-    sleep 5
+    if [[ "$RESTART_COUNT" -eq "$MAX_RETRIES" ]] ; then
+      echo "Some HDFS blocks are still under-replicated after restarting HDFS"\
+          "$MAX_RETRIES times."
+      echo "Some tests cannot pass without fully-replicated blocks (IMPALA-3887)."
+      echo "Failing the data loading."
+      exit 1
+    fi
+    echo "There are under-replicated blocks in HDFS. Attempting to restart HDFS to"\
+        "resolve this issue."
+    ${IMPALA_HOME}/testdata/bin/run-mini-dfs.sh
   done
-  echo "Some HDFS blocks are still under replicated after 30s."
-  echo "Some tests cannot pass without fully replicated blocks (IMPALA-3887)."
-  echo "Failing the data loading."
-  exit 1
 }
 
 # For kerberized clusters, use kerberos