Repository: impala
Updated Branches:
  refs/heads/2.x eda1f33cd -> 0ba181e5b


IMPALA-6394: Restart HDFS when blocks are under replicated

HDFS sometimes fails to fully replicate all the blocks in 30 seconds
and no progress is made. This patch tries to restart HDFS several times
before aborting the data loading.

Change-Id: Iefd4c2fc6c287f054e385de52bdc42b0bdbd7915
Reviewed-on: http://gerrit.cloudera.org:8080/9469
Reviewed-by: Alex Behm <alex.b...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/10fced44
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/10fced44
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/10fced44

Branch: refs/heads/2.x
Commit: 10fced446c7b6f78b86ff8396186101e094a0ec6
Parents: eda1f33
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Fri Mar 2 14:13:49 2018 -0800
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Fri Mar 9 23:10:17 2018 +0000

----------------------------------------------------------------------
 testdata/bin/create-load-data.sh | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/10fced44/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 404bdfe..787baca 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -450,20 +450,26 @@ function copy-and-load-ext-data-source {
 }
 
 function wait-hdfs-replication {
-  FAIL_COUNT=0
-  while [[ "$FAIL_COUNT" -ne "6" ]] ; do
+  MAX_RETRIES=6
+  for ((RESTART_COUNT = 0; RESTART_COUNT <= MAX_RETRIES; ++RESTART_COUNT)); do
+    sleep "$((RESTART_COUNT * 10))"
     FSCK_OUTPUT="$(hdfs fsck /test-warehouse)"
     echo "$FSCK_OUTPUT"
     if grep "Under-replicated blocks:[[:space:]]*0" <<< "$FSCK_OUTPUT"; then
+      # All the blocks are fully-replicated. The data loading can continue.
       return
     fi
-    let FAIL_COUNT="$FAIL_COUNT"+1
-    sleep 5
+    if [[ "$RESTART_COUNT" -eq "$MAX_RETRIES" ]] ; then
+      echo "Some HDFS blocks are still under-replicated after restarting HDFS"\
+          "$MAX_RETRIES times."
+      echo "Some tests cannot pass without fully-replicated blocks (IMPALA-3887)."
+      echo "Failing the data loading."
+      exit 1
+    fi
+    echo "There are under-replicated blocks in HDFS. Attempting to restart HDFS to"\
+        "resolve this issue."
+    ${IMPALA_HOME}/testdata/bin/run-mini-dfs.sh
   done
-  echo "Some HDFS blocks are still under replicated after 30s."
-  echo "Some tests cannot pass without fully replicated blocks (IMPALA-3887)."
-  echo "Failing the data loading."
-  exit 1
 }
 
 # For kerberized clusters, use kerberos

Reply via email to