This is an automated email from the ASF dual-hosted git repository.

wzhou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new bafd19030 IMPALA-13143: Fix flaky test_catalogd_failover_with_sync_ddl
bafd19030 is described below

commit bafd1903069163f38812d7fa42f9c4d2f7218fcf
Author: wzhou-code <[email protected]>
AuthorDate: Thu Jun 6 21:56:49 2024 -0700

    IMPALA-13143: Fix flaky test_catalogd_failover_with_sync_ddl
    
    The test_catalogd_failover_with_sync_ddl test, which was added to
    custom_cluster/test_catalogd_ha.py in IMPALA-13134, failed on s3.
    The test relies on specific timing with a sleep injected via a
    debug action so that the DDL query is still running when catalogd
    failover is triggered. The failures were caused by catalogd
    restarting slowly on s3, so the query finished before catalogd
    failover was triggered.
    
    This patch fixes the issue by increasing the sleep time for s3 builds
    and other slow builds.
    
    Testing:
     - Ran the test 100 times in a loop on s3.
    
    Change-Id: I15bb6aae23a2f544067f993533e322969372ebd5
    Reviewed-on: http://gerrit.cloudera.org:8080/21491
    Reviewed-by: Riza Suminto <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 tests/custom_cluster/test_catalogd_ha.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/custom_cluster/test_catalogd_ha.py 
b/tests/custom_cluster/test_catalogd_ha.py
index f8ebff6b9..10c1109a6 100644
--- a/tests/custom_cluster/test_catalogd_ha.py
+++ b/tests/custom_cluster/test_catalogd_ha.py
@@ -20,16 +20,24 @@ import json
 import logging
 import re
 import requests
+import time
 
 from beeswaxd.BeeswaxService import QueryState
+from builtins import round
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.environ import build_flavor_timeout
-from tests.util.filesystem_utils import get_fs_path
+from tests.util.filesystem_utils import IS_S3, get_fs_path
 from time import sleep
 
 LOG = logging.getLogger('catalogd_ha_test')
 DEFAULT_STATESTORE_SERVICE_PORT = 24000
 DEFAULT_CATALOG_SERVICE_PORT = 26000
+SLOW_BUILD_SYNC_DDL_DELAY_S = 20
+SYNC_DDL_DELAY_S = build_flavor_timeout(
+    10, slow_build_timeout=SLOW_BUILD_SYNC_DDL_DELAY_S)
+# s3 can behave as a slow build.
+if IS_S3:
+  SYNC_DDL_DELAY_S = SLOW_BUILD_SYNC_DDL_DELAY_S
 
 
 class TestCatalogdHA(CustomClusterTestSuite):
@@ -438,7 +446,8 @@ class TestCatalogdHA(CustomClusterTestSuite):
 
   @CustomClusterTestSuite.with_args(
     statestored_args="--use_subscriber_id_as_catalogd_priority=true",
-    
catalogd_args="--debug_actions='catalogd_wait_sync_ddl_version_delay:SLEEP@5000'",
+    
catalogd_args="--debug_actions='catalogd_wait_sync_ddl_version_delay:SLEEP@{0}'"
+                  .format(SYNC_DDL_DELAY_S * 1000),
     start_args="--enable_catalogd_ha")
   def test_catalogd_failover_with_sync_ddl(self, unique_database):
     """Tests for Catalog Service force fail-over when running DDL with SYNC_DDL
@@ -459,6 +468,7 @@ class TestCatalogdHA(CustomClusterTestSuite):
     handle = client.execute_async(ddl_query.format(database=unique_database))
 
     # Restart standby catalogd with force_catalogd_active as true.
+    start_s = time.time()
     catalogds[1].kill()
     catalogds[1].start(wait_until_ready=True,
                        additional_args="--force_catalogd_active=true")
@@ -467,9 +477,14 @@ class TestCatalogdHA(CustomClusterTestSuite):
     catalogd_service_1.wait_for_metric_value(
         "catalog-server.active-status", expected_value=False, timeout=15)
     assert(not 
catalogd_service_1.get_metric_value("catalog-server.active-status"))
+    elapsed_s = time.time() - start_s
+    assert elapsed_s < SYNC_DDL_DELAY_S, \
+        "Catalogd failover took %s seconds to complete" % (elapsed_s)
+    LOG.info("Catalogd failover took %s seconds to complete" % 
round(elapsed_s, 1))
 
     # Verify that the query is failed due to the Catalogd HA fail-over.
-    self.wait_for_state(handle, QueryState.EXCEPTION, 30, client=client)
+    self.wait_for_state(
+        handle, QueryState.EXCEPTION, SYNC_DDL_DELAY_S * 2 + 10, client=client)
     client.close()
 
   @CustomClusterTestSuite.with_args(

Reply via email to