IMPALA-3912: test_random_rpc_timeout is flaky. The datastream sender's default timeout is 2 minutes, which can block some fragments from completing until the timeout expires and cause the metric "num-fragments-in-flight" to not be back to 0 after 60 seconds. Decrease the sender timeout to 30 seconds and add some logging.
Change-Id: I19f8b3fea66c5a0398e3476a46f060be9f951983 Reviewed-on: http://gerrit.cloudera.org:8080/4080 Reviewed-by: Juan Yu <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/9b3f43b9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/9b3f43b9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/9b3f43b9 Branch: refs/heads/master Commit: 9b3f43b9fd006fe5d72281b0862abe42e0a0a061 Parents: a42d18d Author: Juan Yu <[email protected]> Authored: Mon Aug 22 11:49:18 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Thu Sep 15 21:46:45 2016 +0000 ---------------------------------------------------------------------- be/src/service/fragment-mgr.cc | 2 ++ tests/custom_cluster/test_rpc_timeout.py | 11 +++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b3f43b9/be/src/service/fragment-mgr.cc ---------------------------------------------------------------------- diff --git a/be/src/service/fragment-mgr.cc b/be/src/service/fragment-mgr.cc index 5ae5845..a98bbf9 100644 --- a/be/src/service/fragment-mgr.cc +++ b/be/src/service/fragment-mgr.cc @@ -100,6 +100,8 @@ void FragmentMgr::FragmentThread(TUniqueId fragment_instance_id) { // the fragment exec state. ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS_IN_FLIGHT->Increment(-1L); + VLOG_QUERY << "PlanFragment completed. 
instance_id=" << fragment_instance_id; + #ifndef ADDRESS_SANITIZER // tcmalloc and address sanitizer can not be used together if (FLAGS_log_mem_usage_interval > 0) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/9b3f43b9/tests/custom_cluster/test_rpc_timeout.py ---------------------------------------------------------------------- diff --git a/tests/custom_cluster/test_rpc_timeout.py b/tests/custom_cluster/test_rpc_timeout.py index 76eb3d6..4ea5351 100644 --- a/tests/custom_cluster/test_rpc_timeout.py +++ b/tests/custom_cluster/test_rpc_timeout.py @@ -78,7 +78,8 @@ class TestRPCTimeout(CustomClusterTestSuite): @pytest.mark.execute_serially @CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000" - " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=1") + " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=1" + " --datastream_sender_timeout_ms=30000") def test_execplanfragment_timeout(self, vector): for i in range(3): ex= self.execute_query_expect_failure(self.client, self.TEST_QUERY) @@ -91,7 +92,8 @@ class TestRPCTimeout(CustomClusterTestSuite): @pytest.mark.execute_serially @CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000" - " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=2") + " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=2" + " --datastream_sender_timeout_ms=30000") def test_cancelplanfragment_timeout(self, vector): query = "select * from tpch.lineitem limit 5000" self.execute_query_then_cancel(query, vector) @@ -117,12 +119,13 @@ class TestRPCTimeout(CustomClusterTestSuite): @pytest.mark.execute_serially @CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000" " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=6" - " --status_report_interval=1 ") + " --status_report_interval=1") def test_reportexecstatus_timeout(self, vector): self.execute_query_verify_metrics(self.TEST_QUERY) @pytest.mark.execute_serially 
@CustomClusterTestSuite.with_args("--backend_client_rpc_timeout_ms=1000" - " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=7") + " --fault_injection_rpc_delay_ms=3000 --fault_injection_rpc_type=7" + " --datastream_sender_timeout_ms=30000") def test_random_rpc_timeout(self, vector): self.execute_query_verify_metrics(self.TEST_QUERY, 10)
