This is an automated email from the ASF dual-hosted git repository. maxyang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit d649959b1f3bf4e19da0dc5a46f5e6aa97ef6cbf Author: hari krishna <[email protected]> AuthorDate: Mon Apr 4 19:28:03 2022 +0530 gprecoverseg rebalance is failed with timeout while promoting mirrors Currently gprecoverseg -r waits for a maximum of 30 seconds for preferred primaries to be promoted. This may not be enough and results in an exception. gprecoverseg rebalance workflow: 1. All the segments are up and running but the roles are switched. 2. User runs gprecoverseg -r. 3. gprecoverseg shuts down all the acting primaries (preferred mirrors). 4. gprecoverseg triggers fts probe. 5. gprecoverseg sends a test query (CREATE TEMP TABLE) to start a distributed transaction to test if all the preferred primaries were promoted. 6. gprecoverseg waits for a max of 30 seconds and checks if the CREATE TABLE query succeeded, else raises a timeout exception. 7. gprecoverseg starts the stopped preferred mirrors. The problem is that in some cases, the preferred primary may not be promoted in 30 seconds, which results in an exception. For now we have decided to increase the timeout to 10 minutes, which should be enough for most use cases. Eventually we will revisit this logic and make it a bit more robust. 
--- gpMgmt/bin/gppylib/operations/rebalanceSegments.py | 2 +- .../operations/test/unit/test_unit_segment_reconfigurer.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gpMgmt/bin/gppylib/operations/rebalanceSegments.py b/gpMgmt/bin/gppylib/operations/rebalanceSegments.py index b391049a57..a46f0d69d1 100644 --- a/gpMgmt/bin/gppylib/operations/rebalanceSegments.py +++ b/gpMgmt/bin/gppylib/operations/rebalanceSegments.py @@ -8,7 +8,7 @@ from gppylib import gplog from gppylib.operations.segment_reconfigurer import SegmentReconfigurer -MIRROR_PROMOTION_TIMEOUT=120 +MIRROR_PROMOTION_TIMEOUT=600 class ReconfigDetectionSQLQueryCommand(base.SQLCommand): diff --git a/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py b/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py index 05bfeb55e5..9d86071baf 100644 --- a/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py +++ b/gpMgmt/bin/gppylib/operations/test/unit/test_unit_segment_reconfigurer.py @@ -21,7 +21,7 @@ class SegmentReconfiguerTestCase(GpTestCase): port = 15432 user = 'postgres' passwd = 'passwd' - timeout = 120 + timeout = 600 def setUp(self): self.conn = Mock(name='conn') @@ -63,21 +63,21 @@ class SegmentReconfiguerTestCase(GpTestCase): self.conn.close.assert_any_call() @patch('time.time') - def test_it_gives_up_after_30_seconds(self, now_mock): + def test_it_gives_up_after_600_seconds(self, now_mock): start_datetime = datetime.datetime(2018, 5, 9, 16, 0, 0) start_time = time.mktime(start_datetime.timetuple()) now_mock.configure_mock(return_value=start_time) - def fail_for_half_a_minute(): + def fail_for_five_minutes(): new_time = start_time for i in range(2): - # leap forward 15 seconds + # leap forward 300 seconds new_time += self.timeout / 2 now_mock.configure_mock(return_value=new_time) yield pgdb.DatabaseError - self.connect.configure_mock(side_effect=fail_for_half_a_minute()) + 
self.connect.configure_mock(side_effect=fail_for_five_minutes()) reconfigurer = SegmentReconfigurer(logger=self.logger, worker_pool=self.worker_pool, timeout=self.timeout) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
