This is an automated email from the ASF dual-hosted git repository. maxyang pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 50b2124ae8ce60cf06ace69b2348c4c4e3b80ad0 Author: ravoorsh <[email protected]> AuthorDate: Wed May 11 15:48:12 2022 +0530 gpstate -e: Remove progress for killed recoverseg (#13412) * Problem Statement: When gprecoverseg is running gpstate -e command displays status of the recovered segments. if the user kills the gprecoverseg process then gpstate -e continues to report the status of the old gprecoverseg. Root Cause gpstate -e command checks if the recovery_progress_file exists or not before displaying the gprecoverseg progress. Since the recovery_progress_file is not removed when the gprecoverseg is forcefully killed, gpstate -e will continue to display the progress. Fix: To fix this issue, gpstate -e code is modified to verify if ~/workspace/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/gprecoverseg.lock exists before reporting the gprecoverseg progress. Since the user is aware that the lock file should be removed if gprecoverseg process is killed, the double check ensures that gpstate -e does not report the progress of the old killed gprecoverseg. --- gpMgmt/bin/gppylib/programs/clsSystemState.py | 10 +++---- gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature | 35 +++++++--------------- gpMgmt/test/behave/mgmt_utils/gpstate.feature | 4 +++ .../test/behave/mgmt_utils/steps/gpstate_utils.py | 7 +++++ gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py | 12 ++++++++ 5 files changed, 38 insertions(+), 30 deletions(-) diff --git a/gpMgmt/bin/gppylib/programs/clsSystemState.py b/gpMgmt/bin/gppylib/programs/clsSystemState.py index 05e5416254..d6979c0f32 100644 --- a/gpMgmt/bin/gppylib/programs/clsSystemState.py +++ b/gpMgmt/bin/gppylib/programs/clsSystemState.py @@ -12,7 +12,6 @@ import re import collections import pgdb from contextlib import closing - from gppylib import gparray, gplog from gppylib.commands import base, gp from gppylib.db import dbconn @@ -23,6 +22,7 @@ from gppylib.operations.buildMirrorSegments import get_recovery_progress_file, g from gppylib.system import configurationInterface as configInterface from gppylib.system.environment import GpCoordinatorEnvironment from gppylib.utils import TableLogger +from gppylib.commands.gp import get_coordinatordatadir logger = gplog.get_default_logger() @@ -683,11 +683,12 @@ class GpSystemStateProgram: self.__addClusterDownWarning(gpArray, data) recovery_progress_file = get_recovery_progress_file(gplog) - recovery_progress_segs = self._parse_recovery_progress_data(data, recovery_progress_file, gpArray) - if recovery_progress_segs: + segments_under_recovery = self._parse_recovery_progress_data(data, recovery_progress_file, gpArray) + gprecoverseg_lock_dir = os.path.join(get_coordinatordatadir() + '/gprecoverseg.lock') + if segments_under_recovery and os.path.exists(gprecoverseg_lock_dir): logger.info("----------------------------------------------------") logger.info("Segments in recovery") - logSegments(recovery_progress_segs, False, [VALUE_RECOVERY_TYPE, VALUE_RECOVERY_COMPLETED_BYTES, VALUE_RECOVERY_TOTAL_BYTES, + logSegments(segments_under_recovery, False, [VALUE_RECOVERY_TYPE, VALUE_RECOVERY_COMPLETED_BYTES, VALUE_RECOVERY_TOTAL_BYTES, VALUE_RECOVERY_PERCENTAGE]) exitCode = 1 @@ -993,7 +994,6 @@ class GpSystemStateProgram: return recovery_progress_segs - @staticmethod def _get_unsync_segs_add_wal_remaining_bytes(data, gpArray): """ diff --git a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature index b17c37e61f..9f89c1f302 100644 --- a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature +++ b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature @@ -429,6 +429,7 @@ Feature: gprecoverseg tests And user can start transactions And all files in gpAdminLogs directory are deleted on all hosts in the cluster And a sample recovery_progress.file is created from saved lines + Then a sample gprecoverseg.lock directory is created in coordinator_data_directory When the user runs "gpstate -e" Then gpstate should print "Segments in recovery" to stdout # And gpstate output contains "incremental,incremental,incremental" entries for mirrors of content 0,1,2 @@ -438,6 +439,7 @@ Feature: gprecoverseg tests # | \S+ | [0-9]+ | incremental | [0-9]+ | [0-9]+ | [0-9]+\% | # | \S+ | [0-9]+ | incremental | [0-9]+ | [0-9]+ | [0-9]+\% | And all files in gpAdminLogs directory are deleted on all hosts in the cluster + Then the gprecoverseg lock directory is removed And user immediately stops all primary processes for content 0,1,2 And user can start transactions @@ -453,9 +455,11 @@ Feature: gprecoverseg tests And user can start transactions And a sample recovery_progress.file is created from saved lines + Then a sample gprecoverseg.lock directory is created in coordinator_data_directory When the user runs "gpstate -e" Then gpstate should print "Segments in recovery" to stdout And all files in gpAdminLogs directory are deleted on all hosts in the cluster + Then the gprecoverseg lock directory is removed @demo_cluster @concourse_cluster @@ -536,38 +540,19 @@ Feature: gprecoverseg tests And the user suspend the walsender on the primary on content 0 When the user asynchronously runs "gprecoverseg -aF" and the process is saved Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format - When the user asynchronously sets up to end gprecoverseg process with SIGINT - And the user waits until saved async process is completed - Then recovery_progress.file should not exist in gpAdminLogs - Then the user reset the walsender on the primary on content 0 - And the gprecoverseg lock directory is removed - And the user waits until mirror on content 0,1,2 is up - And verify that lines from recovery_progress.file are present in segment progress files in gpAdminLogs - And the cluster is rebalanced - - @demo_cluster - @concourse_cluster - Scenario: SIGINT on gprecoverseg differential recovery should delete the progress file - Given the database is running - And all the segments are running - And the segments are synchronized - And all files in gpAdminLogs directory are deleted on all hosts in the cluster - And user immediately stops all primary processes for content 0,1,2 - And user can start transactions - When the user asynchronously runs "gprecoverseg -a --differential" and the process is saved - Then the user waits until recovery_progress.file is created in gpAdminLogs and verifies its format Then verify if the gprecoverseg.lock directory is present in coordinator_data_directory When the user asynchronously sets up to end gprecoverseg process with SIGINT And the user waits until saved async process is completed Then recovery_progress.file should not exist in gpAdminLogs + Then the user reset the walsender on the primary on content 0 Then the gprecoverseg lock directory is removed And the user waits until mirror on content 0,1,2 is up + And verify that lines from recovery_progress.file are present in segment progress files in gpAdminLogs And the cluster is rebalanced - @demo_cluster @concourse_cluster - Scenario: SIGKILL on gprecoverseg should not display progress in gpstate -e + Scenario: SIGHUP on gprecoverseg should not display progress in gpstate -e Given the database is running And all the segments are running And the segments are synchronized @@ -581,13 +566,13 @@ Feature: gprecoverseg tests Then verify if the gprecoverseg.lock directory is present in coordinator_data_directory When the user runs "gpstate -e" Then gpstate should print "Segments in recovery" to stdout - When the user asynchronously sets up to end gprecoverseg process with SIGKILL + When the user asynchronously sets up to end gprecoverseg process with SIGHUP And the user waits until saved async process is completed + Then the gprecoverseg lock directory is removed When the user runs "gpstate -e" Then gpstate should not print "Segments in recovery" to stdout Then the user reset the walsender on the primary on content 0 And the user waits until mirror on content 0,1,2 is up - And the gprecoverseg lock directory is removed And the cluster is rebalanced @demo_cluster @@ -1122,7 +1107,7 @@ Feature: gprecoverseg tests And the user asynchronously sets up to end gprecoverseg process when "Recovery type" is printed in the logs And the user runs "gprecoverseg -a" Then gprecoverseg should return a return code of -15 - And the gprecoverseg lock directory is removed + Then the gprecoverseg lock directory is removed When the user runs "gprecoverseg -a" Then gprecoverseg should return a return code of 0 diff --git a/gpMgmt/test/behave/mgmt_utils/gpstate.feature b/gpMgmt/test/behave/mgmt_utils/gpstate.feature index 3d83d5330a..5cca6ec192 100644 --- a/gpMgmt/test/behave/mgmt_utils/gpstate.feature +++ b/gpMgmt/test/behave/mgmt_utils/gpstate.feature @@ -160,6 +160,7 @@ Feature: gpstate tests Given a standard local demo cluster is running Given all files in gpAdminLogs directory are deleted And a sample recovery_progress.file is created with ongoing recoveries in gpAdminLogs + And a sample gprecoverseg.lock directory is created in coordinator_data_directory When the user runs "gpstate -e" Then gpstate should print "Segments in recovery" to stdout And gpstate output contains "full,incremental" entries for mirrors of content 0,1 @@ -168,11 +169,13 @@ Feature: gpstate tests | \S+ | [0-9]+ | full | 1164848 | 1371715 | 84% | | \S+ | [0-9]+ | incremental | 1 | 1371875 | 1% | And all files in gpAdminLogs directory are deleted + And the gprecoverseg lock directory is removed Scenario: gpstate -e does not show information about segments with completed recovery Given a standard local demo cluster is running Given all files in gpAdminLogs directory are deleted And a sample recovery_progress.file is created with completed recoveries in gpAdminLogs + And a sample gprecoverseg.lock directory is created in coordinator_data_directory When the user runs "gpstate -e" Then gpstate should print "Segments in recovery" to stdout And gpstate output contains "full" entries for mirrors of content 1 @@ -182,6 +185,7 @@ Feature: gpstate tests And gpstate should not print "incremental" to stdout And gpstate should not print "All segments are running normally" to stdout And all files in gpAdminLogs directory are deleted + Then the gprecoverseg lock directory is removed Scenario: gpstate -c logs cluster info for a mirrored cluster Given a standard local demo cluster is running diff --git a/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py index 3cd4d3dfb5..098245ea28 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py @@ -5,6 +5,7 @@ import re from gppylib.db import dbconn from gppylib.gparray import GpArray, ROLE_MIRROR from test.behave_utils.utils import check_stdout_msg, check_string_not_present_stdout +from gppylib.commands.gp import get_coordinatordatadir @then('a sample recovery_progress.file is created from saved lines') def impl(context): @@ -17,6 +18,12 @@ def impl(context): fp.write("full:5: 1164848/1371715 kB (84%), 0/1 tablespace (...t1/demoDataDir0/base/16384/40962)\n") fp.write("incremental:6: 1/1371875 kB (1%)") +@then('a sample gprecoverseg.lock directory is created in coordinator_data_directory') +@given('a sample gprecoverseg.lock directory is created in coordinator_data_directory') +def impl(context): + gprecoverseg_lock_dir = os.path.join(get_coordinatordatadir() + '/gprecoverseg.lock') + os.mkdir(gprecoverseg_lock_dir) + @given('a sample recovery_progress.file is created with completed recoveries in gpAdminLogs') def impl(context): with open('{}/gpAdminLogs/recovery_progress.file'.format(os.path.expanduser("~")), 'w+') as fp: diff --git a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py index aa8b054497..d96929ec07 100644 --- a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py +++ b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py @@ -481,6 +481,13 @@ def impl(context, logdir): if attempt == num_retries: raise Exception('Timed out after {} retries'.format(num_retries)) +@then( 'verify if the gprecoverseg.lock directory is present in coordinator_data_directory') +def impl(context): + gprecoverseg_lock_file = "%s/gprecoverseg.lock" % gp.get_coordinatordatadir() + if not os.path.exists(gprecoverseg_lock_file): + raise Exception('gprecoverseg.lock directory does not exist') + else: + return @then( 'verify if the gprecoverseg.lock directory is present in coordinator_data_directory') def impl(context): @@ -664,6 +671,11 @@ def impl(context, process_name, signal_name): command = "ps ux | grep bin/{0} | awk '{{print $2}}' | xargs kill -{1}".format(process_name, sig.value) run_async_command(context, command) +@when('the user asynchronously sets up to end {process_name} process with SIGHUP') +def impl(context, process_name): + command = "ps ux | grep bin/%s | awk '{print $2}' | xargs kill -9" % (process_name) + run_async_command(context, command) + @when('the user asynchronously sets up to end gpcreateseg process when it starts') def impl(context): # We keep trying to find the gpcreateseg process using ps,grep --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
