This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 50b2124ae8ce60cf06ace69b2348c4c4e3b80ad0
Author: ravoorsh <[email protected]>
AuthorDate: Wed May 11 15:48:12 2022 +0530

    gpstate -e: Remove progress for killed recoverseg (#13412)
    
    * Problem Statement:
    When gprecoverseg is running gpstate -e command displays status of the 
recovered segments. if the user kills the gprecoverseg process then gpstate -e 
continues to report the status of the old gprecoverseg.
    
    Root Cause
    gpstate -e command checks if the recovery_progress_file exists or not 
before displaying the gprecoverseg progress.
    Since the recovery_progress_file is not removed when the gprecoverseg is 
forcefully killed, gpstate -e will continue to display the progress.
    
    Fix:
    To fix this issue, gpstate -e code is modified to verify if 
~/workspace/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/gprecoverseg.lock 
exists before reporting the gprecoverseg progress. Since the user is aware that 
the lock file should be removed if gprecoverseg process is killed, the double 
check ensures that gpstate -e does not report the progress of the old killed 
gprecoverseg.
---
 gpMgmt/bin/gppylib/programs/clsSystemState.py      | 10 +++----
 gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature | 35 +++++++---------------
 gpMgmt/test/behave/mgmt_utils/gpstate.feature      |  4 +++
 .../test/behave/mgmt_utils/steps/gpstate_utils.py  |  7 +++++
 gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py  | 12 ++++++++
 5 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/gpMgmt/bin/gppylib/programs/clsSystemState.py 
b/gpMgmt/bin/gppylib/programs/clsSystemState.py
index 05e5416254..d6979c0f32 100644
--- a/gpMgmt/bin/gppylib/programs/clsSystemState.py
+++ b/gpMgmt/bin/gppylib/programs/clsSystemState.py
@@ -12,7 +12,6 @@ import re
 import collections
 import pgdb
 from contextlib import closing
-
 from gppylib import gparray, gplog
 from gppylib.commands import base, gp
 from gppylib.db import dbconn
@@ -23,6 +22,7 @@ from gppylib.operations.buildMirrorSegments import 
get_recovery_progress_file, g
 from gppylib.system import configurationInterface as configInterface
 from gppylib.system.environment import GpCoordinatorEnvironment
 from gppylib.utils import TableLogger
+from gppylib.commands.gp import get_coordinatordatadir
 
 logger = gplog.get_default_logger()
 
@@ -683,11 +683,12 @@ class GpSystemStateProgram:
         self.__addClusterDownWarning(gpArray, data)
 
         recovery_progress_file = get_recovery_progress_file(gplog)
-        recovery_progress_segs = self._parse_recovery_progress_data(data, 
recovery_progress_file, gpArray)
-        if recovery_progress_segs:
+        segments_under_recovery = self._parse_recovery_progress_data(data, 
recovery_progress_file, gpArray)
+        gprecoverseg_lock_dir = os.path.join(get_coordinatordatadir() + 
'/gprecoverseg.lock')
+        if segments_under_recovery and os.path.exists(gprecoverseg_lock_dir):
             logger.info("----------------------------------------------------")
             logger.info("Segments in recovery")
-            logSegments(recovery_progress_segs, False, [VALUE_RECOVERY_TYPE, 
VALUE_RECOVERY_COMPLETED_BYTES, VALUE_RECOVERY_TOTAL_BYTES,
+            logSegments(segments_under_recovery, False, [VALUE_RECOVERY_TYPE, 
VALUE_RECOVERY_COMPLETED_BYTES, VALUE_RECOVERY_TOTAL_BYTES,
                                                           
VALUE_RECOVERY_PERCENTAGE])
             exitCode = 1
 
@@ -993,7 +994,6 @@ class GpSystemStateProgram:
 
         return recovery_progress_segs
 
-
     @staticmethod
     def _get_unsync_segs_add_wal_remaining_bytes(data, gpArray):
         """
diff --git a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature 
b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature
index b17c37e61f..9f89c1f302 100644
--- a/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature
+++ b/gpMgmt/test/behave/mgmt_utils/gprecoverseg.feature
@@ -429,6 +429,7 @@ Feature: gprecoverseg tests
     And user can start transactions
     And all files in gpAdminLogs directory are deleted on all hosts in the 
cluster
     And a sample recovery_progress.file is created from saved lines
+    Then a sample gprecoverseg.lock directory is created in 
coordinator_data_directory
     When the user runs "gpstate -e"
     Then gpstate should print "Segments in recovery" to stdout
 #    And gpstate output contains "incremental,incremental,incremental" entries 
for mirrors of content 0,1,2
@@ -438,6 +439,7 @@ Feature: gprecoverseg tests
 #      | \S+     | [0-9]+ | incremental    | [0-9]+                 | [0-9]+   
          | [0-9]+\%             |
 #      | \S+     | [0-9]+ | incremental    | [0-9]+                 | [0-9]+   
          | [0-9]+\%             |
     And all files in gpAdminLogs directory are deleted on all hosts in the 
cluster
+    Then the gprecoverseg lock directory is removed
 
     And user immediately stops all primary processes for content 0,1,2
     And user can start transactions
@@ -453,9 +455,11 @@ Feature: gprecoverseg tests
     And user can start transactions
 
     And a sample recovery_progress.file is created from saved lines
+    Then a sample gprecoverseg.lock directory is created in 
coordinator_data_directory
     When the user runs "gpstate -e"
     Then gpstate should print "Segments in recovery" to stdout
     And all files in gpAdminLogs directory are deleted on all hosts in the 
cluster
+    Then the gprecoverseg lock directory is removed
 
   @demo_cluster
   @concourse_cluster
@@ -536,38 +540,19 @@ Feature: gprecoverseg tests
     And the user suspend the walsender on the primary on content 0
     When the user asynchronously runs "gprecoverseg -aF" and the process is 
saved
     Then the user waits until recovery_progress.file is created in gpAdminLogs 
and verifies its format
-    When the user asynchronously sets up to end gprecoverseg process with 
SIGINT
-    And the user waits until saved async process is completed
-    Then recovery_progress.file should not exist in gpAdminLogs
-    Then the user reset the walsender on the primary on content 0
-    And the gprecoverseg lock directory is removed
-    And the user waits until mirror on content 0,1,2 is up
-    And verify that lines from recovery_progress.file are present in segment 
progress files in gpAdminLogs
-    And the cluster is rebalanced
-
-  @demo_cluster
-  @concourse_cluster
-  Scenario:  SIGINT on gprecoverseg differential recovery should delete the 
progress file
-    Given the database is running
-    And all the segments are running
-    And the segments are synchronized
-    And all files in gpAdminLogs directory are deleted on all hosts in the 
cluster
-    And user immediately stops all primary processes for content 0,1,2
-    And user can start transactions
-    When the user asynchronously runs "gprecoverseg -a --differential" and the 
process is saved
-    Then the user waits until recovery_progress.file is created in gpAdminLogs 
and verifies its format
     Then verify if the gprecoverseg.lock directory is present in 
coordinator_data_directory
     When the user asynchronously sets up to end gprecoverseg process with 
SIGINT
     And the user waits until saved async process is completed
     Then recovery_progress.file should not exist in gpAdminLogs
+    Then the user reset the walsender on the primary on content 0
     Then the gprecoverseg lock directory is removed
     And the user waits until mirror on content 0,1,2 is up
+    And verify that lines from recovery_progress.file are present in segment 
progress files in gpAdminLogs
     And the cluster is rebalanced
 
-
   @demo_cluster
   @concourse_cluster
-  Scenario:  SIGKILL on gprecoverseg should not display progress in gpstate -e
+  Scenario:  SIGHUP on gprecoverseg should not display progress in gpstate -e
     Given the database is running
     And all the segments are running
     And the segments are synchronized
@@ -581,13 +566,13 @@ Feature: gprecoverseg tests
     Then verify if the gprecoverseg.lock directory is present in 
coordinator_data_directory
     When the user runs "gpstate -e"
     Then gpstate should print "Segments in recovery" to stdout
-    When the user asynchronously sets up to end gprecoverseg process with 
SIGKILL
+    When the user asynchronously sets up to end gprecoverseg process with 
SIGHUP
     And the user waits until saved async process is completed
+    Then the gprecoverseg lock directory is removed
     When the user runs "gpstate -e"
     Then gpstate should not print "Segments in recovery" to stdout
     Then the user reset the walsender on the primary on content 0
     And the user waits until mirror on content 0,1,2 is up
-    And the gprecoverseg lock directory is removed
     And the cluster is rebalanced
 
   @demo_cluster
@@ -1122,7 +1107,7 @@ Feature: gprecoverseg tests
           And the user asynchronously sets up to end gprecoverseg process when 
"Recovery type" is printed in the logs
           And the user runs "gprecoverseg -a"
          Then gprecoverseg should return a return code of -15
-          And the gprecoverseg lock directory is removed
+         Then the gprecoverseg lock directory is removed
 
          When the user runs "gprecoverseg -a"
          Then gprecoverseg should return a return code of 0
diff --git a/gpMgmt/test/behave/mgmt_utils/gpstate.feature 
b/gpMgmt/test/behave/mgmt_utils/gpstate.feature
index 3d83d5330a..5cca6ec192 100644
--- a/gpMgmt/test/behave/mgmt_utils/gpstate.feature
+++ b/gpMgmt/test/behave/mgmt_utils/gpstate.feature
@@ -160,6 +160,7 @@ Feature: gpstate tests
         Given a standard local demo cluster is running
         Given all files in gpAdminLogs directory are deleted
         And a sample recovery_progress.file is created with ongoing recoveries 
in gpAdminLogs
+        And a sample gprecoverseg.lock directory is created in 
coordinator_data_directory
         When the user runs "gpstate -e"
         Then gpstate should print "Segments in recovery" to stdout
         And gpstate output contains "full,incremental" entries for mirrors of 
content 0,1
@@ -168,11 +169,13 @@ Feature: gpstate tests
             | \S+     | [0-9]+ | full           | 1164848                | 
1371715            | 84%                  |
             | \S+     | [0-9]+ | incremental    | 1                      | 
1371875            | 1%                   |
         And all files in gpAdminLogs directory are deleted
+        And the gprecoverseg lock directory is removed
 
     Scenario: gpstate -e does not show information about segments with 
completed recovery
         Given a standard local demo cluster is running
         Given all files in gpAdminLogs directory are deleted
         And a sample recovery_progress.file is created with completed 
recoveries in gpAdminLogs
+        And a sample gprecoverseg.lock directory is created in 
coordinator_data_directory
         When the user runs "gpstate -e"
         Then gpstate should print "Segments in recovery" to stdout
         And gpstate output contains "full" entries for mirrors of content 1
@@ -182,6 +185,7 @@ Feature: gpstate tests
         And gpstate should not print "incremental" to stdout
         And gpstate should not print "All segments are running normally" to 
stdout
         And all files in gpAdminLogs directory are deleted
+        Then the gprecoverseg lock directory is removed
 
     Scenario: gpstate -c logs cluster info for a mirrored cluster
         Given a standard local demo cluster is running
diff --git a/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py 
b/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py
index 3cd4d3dfb5..098245ea28 100644
--- a/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py
+++ b/gpMgmt/test/behave/mgmt_utils/steps/gpstate_utils.py
@@ -5,6 +5,7 @@ import re
 from gppylib.db import dbconn
 from gppylib.gparray import GpArray, ROLE_MIRROR
 from test.behave_utils.utils import check_stdout_msg, 
check_string_not_present_stdout
+from gppylib.commands.gp import get_coordinatordatadir
 
 @then('a sample recovery_progress.file is created from saved lines')
 def impl(context):
@@ -17,6 +18,12 @@ def impl(context):
         fp.write("full:5: 1164848/1371715 kB (84%), 0/1 tablespace 
(...t1/demoDataDir0/base/16384/40962)\n")
         fp.write("incremental:6: 1/1371875 kB (1%)")
 
+@then('a sample gprecoverseg.lock directory is created in 
coordinator_data_directory')
+@given('a sample gprecoverseg.lock directory is created in 
coordinator_data_directory')
+def impl(context):
+    gprecoverseg_lock_dir = os.path.join(get_coordinatordatadir() + 
'/gprecoverseg.lock')
+    os.mkdir(gprecoverseg_lock_dir)
+
 @given('a sample recovery_progress.file is created with completed recoveries 
in gpAdminLogs')
 def impl(context):
     with 
open('{}/gpAdminLogs/recovery_progress.file'.format(os.path.expanduser("~")), 
'w+') as fp:
diff --git a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py 
b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py
index aa8b054497..d96929ec07 100644
--- a/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py
+++ b/gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py
@@ -481,6 +481,13 @@ def impl(context, logdir):
         if attempt == num_retries:
             raise Exception('Timed out after {} retries'.format(num_retries))
 
+@then( 'verify if the gprecoverseg.lock directory is present in 
coordinator_data_directory')
+def impl(context):
+    gprecoverseg_lock_file = "%s/gprecoverseg.lock" % 
gp.get_coordinatordatadir()
+    if not os.path.exists(gprecoverseg_lock_file):
+        raise Exception('gprecoverseg.lock directory does not exist')
+    else:
+        return
 
 @then( 'verify if the gprecoverseg.lock directory is present in 
coordinator_data_directory')
 def impl(context):
@@ -664,6 +671,11 @@ def impl(context, process_name, signal_name):
     command = "ps ux | grep bin/{0} | awk '{{print $2}}' | xargs kill 
-{1}".format(process_name, sig.value)
     run_async_command(context, command)
 
+@when('the user asynchronously sets up to end {process_name} process with 
SIGHUP')
+def impl(context, process_name):
+    command = "ps ux | grep bin/%s | awk '{print $2}' | xargs kill -9" % 
(process_name)
+    run_async_command(context, command)
+
 @when('the user asynchronously sets up to end gpcreateseg process when it 
starts')
 def impl(context):
     # We keep trying to find the gpcreateseg process using ps,grep


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to