This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.1.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 2aeb6013fa44e53031d82b7e7ca59d771037d60a Author: stiga-huang <[email protected]> AuthorDate: Wed May 18 16:51:37 2022 +0800 IMPALA-7864: (Addendum) Deflake test_replan_limit by postponing catalog fetches TestLocalCatalogRetries.test_replan_limit runs REFRESH and SELECT queries concurrently on a table, and expects one of the queries to hit inconsistent metadata. This patch increases the chance of inconsistent metadata by injecting a latency (500ms) before each catalog fetch. So it's more likely that a request is fetching stale metadata. Also bump up the timeout of thread.join() so we can try out all the attempts. Test - Run test_replan_limit 1000 times without any error. - Run all tests of TestLocalCatalogRetries 100 times without any error. Change-Id: Ia5bdca7402039f1f24b7bf19595c2541fa32d0ad Reviewed-on: http://gerrit.cloudera.org:8080/18537 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Reviewed-on: http://gerrit.cloudera.org:8080/18951 Reviewed-by: Csaba Ringhofer <[email protected]> Tested-by: Quanlong Huang <[email protected]> --- be/src/exec/catalog-op-executor.cc | 5 +++++ tests/custom_cluster/test_local_catalog.py | 12 ++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/be/src/exec/catalog-op-executor.cc b/be/src/exec/catalog-op-executor.cc index 646e6aa52..c6c245428 100644 --- a/be/src/exec/catalog-op-executor.cc +++ b/be/src/exec/catalog-op-executor.cc @@ -55,6 +55,8 @@ DECLARE_int32(catalog_client_connection_num_retries); DECLARE_int32(catalog_client_rpc_timeout_ms); DECLARE_int32(catalog_client_rpc_retry_interval_ms); +DEFINE_int32_hidden(inject_latency_before_catalog_fetch_ms, 0, + "Latency (ms) to be injected before fetching catalog data from the catalogd"); DEFINE_int32_hidden(inject_latency_after_catalog_fetch_ms, 0, "Latency (ms) to be injected after fetching catalog data from the catalogd"); @@ -366,6 +368,9 @@ Status CatalogOpExecutor::GetPartialCatalogObject( 
DCHECK(FLAGS_use_local_catalog || TestInfo::is_test()); const TNetworkAddress& address = MakeNetworkAddress(FLAGS_catalog_service_host, FLAGS_catalog_service_port); + if (FLAGS_inject_latency_before_catalog_fetch_ms > 0) { + SleepForMs(FLAGS_inject_latency_before_catalog_fetch_ms); + } int attempt = 0; // Used for debug action only. CatalogServiceConnection::RpcStatus rpc_status = CatalogServiceConnection::DoRpcWithRetry(env_->catalogd_client_cache(), address, diff --git a/tests/custom_cluster/test_local_catalog.py b/tests/custom_cluster/test_local_catalog.py index 63b0cbb91..6e74a4de0 100644 --- a/tests/custom_cluster/test_local_catalog.py +++ b/tests/custom_cluster/test_local_catalog.py @@ -273,8 +273,9 @@ class TestLocalCatalogRetries(CustomClusterTestSuite): q = random.choice(queries) attempt += 1 try: + print 'Attempt', attempt, 'client', str(client) ret = self.execute_query_unchecked(client, q) - except Exception, e: + except Exception as e: if 'InconsistentMetadataFetchException' in str(e): with inconsistent_seen_lock: inconsistent_seen[0] += 1 @@ -287,7 +288,8 @@ class TestLocalCatalogRetries(CustomClusterTestSuite): t.start() for t in threads: # When there are failures, they're observed quickly. - t.join(30) + # 600s is enough for 200 attempts. + t.join(600) assert failed_queries.empty(),\ "Failed query count non zero: %s" % list(failed_queries.queue) @@ -318,7 +320,8 @@ class TestLocalCatalogRetries(CustomClusterTestSuite): @pytest.mark.execute_serially @CustomClusterTestSuite.with_args( - impalad_args="--use_local_catalog=true --local_catalog_max_fetch_retries=0", + impalad_args="--use_local_catalog=true --local_catalog_max_fetch_retries=0" + " --inject_latency_before_catalog_fetch_ms=500", catalogd_args="--catalog_topic_mode=minimal") def test_replan_limit(self): """ @@ -326,7 +329,8 @@ class TestLocalCatalogRetries(CustomClusterTestSuite): an inconsistent metadata exception when running concurrent reads/writes is seen. 
With the max retries set to 0, no retries are expected and with the concurrent read/write workload, an inconsistent metadata exception is - expected. + expected. Setting inject_latency_before_catalog_fetch_ms to 500 increases the + possibility of a stale request which throws the expected exception. """ queries = [ 'refresh functional.alltypes',
