IMPALA-6093: diagnostics for flaky TestHashJoinTimer

We don't know the root cause yet but try to improve things:
* Eliminate one possible cause of flakiness - unfinished fragments left
  from previous queries.
* Print a profile if an assertion fails so we can see why it failed.

Testing:
Ran core tests.

Change-Id: Ic332dddd96931db807abb960db43b99e5fd0f256
Reviewed-on: http://gerrit.cloudera.org:8080/8403
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5810c759
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5810c759
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5810c759

Branch: refs/heads/master
Commit: 5810c759ca03d3e3b59776f3fe3abe11234cefd2
Parents: 30629fd
Author: Tim Armstrong <[email protected]>
Authored: Thu Oct 26 16:44:02 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Sat Oct 28 01:55:44 2017 +0000

----------------------------------------------------------------------
 tests/query_test/test_hash_join_timer.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5810c759/tests/query_test/test_hash_join_timer.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_hash_join_timer.py 
b/tests/query_test/test_hash_join_timer.py
index f3d4d79..c111c03 100644
--- a/tests/query_test/test_hash_join_timer.py
+++ b/tests/query_test/test_hash_join_timer.py
@@ -18,12 +18,15 @@
 import pytest
 import re
 
+from tests.common.impala_cluster import ImpalaCluster
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.test_vector import ImpalaTestDimension
+from tests.verifiers.metric_verifier import MetricVerifier
+
 
 class TestHashJoinTimer(ImpalaTestSuite):
   """Tests that the local time in hash join is correct in the ExecSummary, 
average
-   reporting and invidiaul fragment reporting."""
+   reporting and individual fragment reporting."""
 
   # There are two cases that we are interested in verifying that the profile 
is returning
   # a correct timing:
@@ -60,7 +63,7 @@ class TestHashJoinTimer(ImpalaTestSuite):
               " where a.id>b.id and a.id=99",
               "NESTED LOOP JOIN"]
              ]
-  # IMPALA-2973: For non-code-coverage builds, 1000 milliseconds are 
sufficent, but more
+  # IMPALA-2973: For non-code-coverage builds, 1000 milliseconds are 
sufficient, but more
   # time is needed in code-coverage builds.
   HASH_JOIN_UPPER_BOUND_MS = 2000
   # IMPALA-2973: Temporary workaround: when timers are using Linux COARSE 
clockid_t, very
@@ -90,11 +93,17 @@ class TestHashJoinTimer(ImpalaTestSuite):
   def test_hash_join_timer(self, vector):
     # This test runs serially because it requires the query to come back within
     # some amount of time. Running this with other tests makes it hard to bound
-    # that time.
+    # that time. It also assumes that it will be able to get a thread token to
+    # execute the join build in parallel.
     test_case = vector.get_value('test cases')
     query = test_case[0]
     join_type = test_case[1]
 
+    # Ensure that the cluster is idle before starting the test query.
+    for impalad in ImpalaCluster().impalads:
+      verifier = MetricVerifier(impalad.service)
+      verifier.wait_for_metric("impala-server.num-fragments-in-flight", 0)
+
     # Execute async to get a handle. Wait until the query has completed.
     handle = self.execute_query_async(query, vector.get_value('exec_option'))
     self.impalad_test_service.wait_for_query_state(self.client, handle,
@@ -111,6 +120,7 @@ class TestHashJoinTimer(ImpalaTestSuite):
     profile = self.client.get_runtime_profile(handle)
     check_execsummary_count = 0
     check_fragment_count = 0
+    asyn_build = False
 
     for line in profile.split("\n"):
         # Matching for ExecSummary
@@ -137,9 +147,11 @@ class TestHashJoinTimer(ImpalaTestSuite):
         if ("Join Build-Side Prepared Asynchronously" in line):
             asyn_build = True;
 
-    assert (asyn_build), "Join is not prepared asynchronously"
-    assert (check_fragment_count > 1), "Unable to verify Fragment or Average 
Fragment"
-    assert (check_execsummary_count == 1), "Unable to verify ExecSummary" % 
profile
+    assert (asyn_build), "Join is not prepared asynchronously: 
{0}".format(profile)
+    assert (check_fragment_count > 1), \
+        "Unable to verify Fragment or Average Fragment: {0}".format(profile)
+    assert (check_execsummary_count == 1), \
+        "Unable to verify ExecSummary: {0}".format(profile)
 
   def __verify_join_time(self, duration, comment):
     duration_ms = self.__parse_duration_ms(duration)

Reply via email to