This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit d649959b1f3bf4e19da0dc5a46f5e6aa97ef6cbf
Author: hari krishna <[email protected]>
AuthorDate: Mon Apr 4 19:28:03 2022 +0530

    gprecoverseg rebalance fails with a timeout while promoting mirrors
    
    Currently gprecoverseg -r waits for a maximum of 30 seconds for preferred
    primaries to be promoted. This may not be enough and results in an exception.
    
    gprecoverseg rebalance workflow
    1. All the segments are up and running but the roles are switched
    2. User runs gprecoverseg -r
    3. gprecoverseg shuts down all the acting primaries (preferred mirrors)
    4. gprecoverseg triggers an FTS probe
    5. gprecoverseg sends a test query (CREATE TEMP TABLE) to start a
       distributed transaction to test if all the preferred primaries were
       promoted
    6. gprecoverseg waits for a max of 30 secs and checks if the CREATE TEMP
       TABLE query succeeded, else raises a timeout exception
    7. gprecoverseg starts the stopped preferred mirrors
    
    The problem is that in some cases, the preferred primary may not be
    promoted in 30 seconds, which results in an exception. For now we have
    decided to increase the timeout to 10 minutes, which should be enough for
    most use cases.
    Eventually we will revisit this logic and make it a bit more robust.
---
 gpMgmt/bin/gppylib/operations/rebalanceSegments.py             |  2 +-
 .../operations/test/unit/test_unit_segment_reconfigurer.py     | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gpMgmt/bin/gppylib/operations/rebalanceSegments.py b/gpMgmt/bin/gppylib/operations/rebalanceSegments.py
index b391049a57..a46f0d69d1 100644
--- a/gpMgmt/bin/gppylib/operations/rebalanceSegments.py
+++ b/gpMgmt/bin/gppylib/operations/rebalanceSegments.py
@@ -8,7 +8,7 @@ from gppylib import gplog
 
 from gppylib.operations.segment_reconfigurer import SegmentReconfigurer
 
-MIRROR_PROMOTION_TIMEOUT=120
+MIRROR_PROMOTION_TIMEOUT=600
 
 
 class ReconfigDetectionSQLQueryCommand(base.SQLCommand):
diff --git a/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py b/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py
index 05bfeb55e5..9d86071baf 100644
--- a/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py
+++ b/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py
@@ -21,7 +21,7 @@ class SegmentReconfiguerTestCase(GpTestCase):
     port = 15432
     user = 'postgres'
     passwd = 'passwd'
-    timeout = 120
+    timeout = 600
 
     def setUp(self):
         self.conn = Mock(name='conn')
@@ -63,21 +63,21 @@ class SegmentReconfiguerTestCase(GpTestCase):
         self.conn.close.assert_any_call()
 
     @patch('time.time')
-    def test_it_gives_up_after_30_seconds(self, now_mock):
+    def test_it_gives_up_after_600_seconds(self, now_mock):
         start_datetime = datetime.datetime(2018, 5, 9, 16, 0, 0)
         start_time = time.mktime(start_datetime.timetuple())
         now_mock.configure_mock(return_value=start_time)
 
-        def fail_for_half_a_minute():
+        def fail_for_five_minutes():
             new_time = start_time
             for i in range(2):
-                # leap forward 15 seconds
+                # leap forward 300 seconds
                 new_time += self.timeout / 2
                 now_mock.configure_mock(return_value=new_time)
                 yield pgdb.DatabaseError
 
 
-        self.connect.configure_mock(side_effect=fail_for_half_a_minute())
+        self.connect.configure_mock(side_effect=fail_for_five_minutes())
 
         reconfigurer = SegmentReconfigurer(logger=self.logger,
                 worker_pool=self.worker_pool, timeout=self.timeout)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to