This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 564f2ced7330e6ccb807b39c47a42ea691a1f733
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Tue Feb 27 11:29:55 2024 -0800

    IMPALA-12848: Fixed flaky test test_catalogd_ha_failover
    
    TestExtDataSources::test_catalogd_ha_failover failed to delete the data
    source object after the catalog service failed over to the standby
    catalogd. Log messages showed that the coordinator tried to submit the
    DDL request to the original active catalogd because it had not yet
    received the failover notification from statestored.
    
    To fix the flaky test, wait until the coordinator receives the failover
    notification from statestored before executing the DDL request to drop
    the data source.
    
    Testing:
     - Ran the test in a loop more than a hundred times without failure.
    
    Change-Id: Ia6225271357740c055c25fdd349f1dc9162c2f53
    Reviewed-on: http://gerrit.cloudera.org:8080/21078
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 tests/custom_cluster/test_ext_data_sources.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/custom_cluster/test_ext_data_sources.py b/tests/custom_cluster/test_ext_data_sources.py
index 765f1bb26..2c7d405ca 100644
--- a/tests/custom_cluster/test_ext_data_sources.py
+++ b/tests/custom_cluster/test_ext_data_sources.py
@@ -24,6 +24,7 @@ import subprocess
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.environ import build_flavor_timeout
 from tests.common.skip import SkipIfApacheHive
+from time import sleep
 
 
 class TestExtDataSources(CustomClusterTestSuite):
@@ -104,6 +105,7 @@ class TestExtDataSources(CustomClusterTestSuite):
   @CustomClusterTestSuite.with_args(
     statestored_args="--use_subscriber_id_as_catalogd_priority=true "
                      "--statestore_heartbeat_frequency_ms=1000",
+    catalogd_args="--catalogd_ha_reset_metadata_on_failover=false",
     start_args="--enable_catalogd_ha")
   def test_catalogd_ha_failover(self):
     """The test case for cluster started with catalogd HA enabled."""
@@ -136,6 +138,23 @@ class TestExtDataSources(CustomClusterTestSuite):
         "catalog-server.active-status", expected_value=True, timeout=30)
     assert(catalogd_service_2.get_metric_value("catalog-server.active-status"))
 
+    # Wait until coordinator receive failover notification.
+    coordinator_service = self.cluster.impalads[0].service
+    expected_catalog_service_port = 
catalogd_service_2.get_catalog_service_port()
+    received_failover_notification = False
+    retry_count = 30
+    while (retry_count > 0):
+      active_catalogd_address = \
+          
coordinator_service.get_metric_value("catalog.active-catalogd-address")
+      _, catalog_service_port = active_catalogd_address.split(":")
+      if (int(catalog_service_port) == expected_catalog_service_port):
+        received_failover_notification = True
+        break
+      retry_count -= 1
+      sleep(1)
+    assert received_failover_notification, \
+        "Coordinator did not receive notification of Catalog service failover."
+
     # Verify that the data source object is available in the catalogd of HA pair.
     result = self.execute_query(SHOW_DATA_SOURCE_QUERY)
     assert result.success, str(result)

Reply via email to