IMPALA-6093: diagnostics for flaky TestHashJoinTimer We don't know the root cause yet but try to improve things: * Eliminate one possible cause of flakiness - unfinished fragments left from previous queries. * Print a profile if an assertion fails so we can see why it failed.
Testing: Ran core tests. Change-Id: Ic332dddd96931db807abb960db43b99e5fd0f256 Reviewed-on: http://gerrit.cloudera.org:8080/8403 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5810c759 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5810c759 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5810c759 Branch: refs/heads/master Commit: 5810c759ca03d3e3b59776f3fe3abe11234cefd2 Parents: 30629fd Author: Tim Armstrong <[email protected]> Authored: Thu Oct 26 16:44:02 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Sat Oct 28 01:55:44 2017 +0000 ---------------------------------------------------------------------- tests/query_test/test_hash_join_timer.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5810c759/tests/query_test/test_hash_join_timer.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_hash_join_timer.py b/tests/query_test/test_hash_join_timer.py index f3d4d79..c111c03 100644 --- a/tests/query_test/test_hash_join_timer.py +++ b/tests/query_test/test_hash_join_timer.py @@ -18,12 +18,15 @@ import pytest import re +from tests.common.impala_cluster import ImpalaCluster from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.test_vector import ImpalaTestDimension +from tests.verifiers.metric_verifier import MetricVerifier + class TestHashJoinTimer(ImpalaTestSuite): """Tests that the local time in hash join is correct in the ExecSummary, average - reporting and invidiaul fragment reporting.""" + reporting and individual fragment reporting.""" # There are two cases that we are interested in verifying that the profile is returning # a correct timing: @@ -60,7 +63,7 @@ class TestHashJoinTimer(ImpalaTestSuite): " where a.id>b.id and a.id=99", "NESTED LOOP JOIN"] ] - # IMPALA-2973: For non-code-coverage builds, 1000 milliseconds are sufficent, but more + # IMPALA-2973: For non-code-coverage builds, 1000 milliseconds are sufficient, but more # time is needed in code-coverage builds. HASH_JOIN_UPPER_BOUND_MS = 2000 # IMPALA-2973: Temporary workaround: when timers are using Linux COARSE clockid_t, very @@ -90,11 +93,17 @@ class TestHashJoinTimer(ImpalaTestSuite): def test_hash_join_timer(self, vector): # This test runs serially because it requires the query to come back within # some amount of time. Running this with other tests makes it hard to bound - # that time. + # that time. It also assumes that it will be able to get a thread token to + # execute the join build in parallel. test_case = vector.get_value('test cases') query = test_case[0] join_type = test_case[1] + # Ensure that the cluster is idle before starting the test query. + for impalad in ImpalaCluster().impalads: + verifier = MetricVerifier(impalad.service) + verifier.wait_for_metric("impala-server.num-fragments-in-flight", 0) + # Execute async to get a handle. Wait until the query has completed. handle = self.execute_query_async(query, vector.get_value('exec_option')) self.impalad_test_service.wait_for_query_state(self.client, handle, @@ -111,6 +120,7 @@ class TestHashJoinTimer(ImpalaTestSuite): profile = self.client.get_runtime_profile(handle) check_execsummary_count = 0 check_fragment_count = 0 + asyn_build = False for line in profile.split("\n"): # Matching for ExecSummary @@ -137,9 +147,11 @@ class TestHashJoinTimer(ImpalaTestSuite): if ("Join Build-Side Prepared Asynchronously" in line): asyn_build = True; - assert (asyn_build), "Join is not prepared asynchronously" - assert (check_fragment_count > 1), "Unable to verify Fragment or Average Fragment" - assert (check_execsummary_count == 1), "Unable to verify ExecSummary" % profile + assert (asyn_build), "Join is not prepared asynchronously: {0}".format(profile) + assert (check_fragment_count > 1), \ + "Unable to verify Fragment or Average Fragment: {0}".format(profile) + assert (check_execsummary_count == 1), \ + "Unable to verify ExecSummary: {0}".format(profile) def __verify_join_time(self, duration, comment): duration_ms = self.__parse_duration_ms(duration)
