This is an automated email from the ASF dual-hosted git repository.

avamingli pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit 6413927d1ae40db09825488ef11b747be4114692
Author: Huansong Fu <[email protected]>
AuthorDate: Thu Apr 20 12:06:52 2023 -0700

    Do not trigger fault in dtx recovery process except a few
    
    Reason for the change is due to a flaky regress test 'gpdispatch':
    ```
    --- 
/tmp/build/e18b2f02/gpdb_src/src/test/isolation2/expected/gpdispatch_1.out  
2023-04-20 17:04:33.775900075 +0000
    +++ /tmp/build/e18b2f02/gpdb_src/src/test/isolation2/results/gpdispatch.out 
    2023-04-20 17:04:33.791901566 +0000
    @@ -25,8 +25,10 @@
    
     -- session1 will be fatal.
     1: select count(*) > 0 from gp_dist_random('pg_class');
    -FATAL:  could not allocate resources for segworker communication
    -server closed the connection unexpectedly
    + ?column?
    +----------
    + t
    +(1 row)
    ```
    
    The reason is that the dtx recovery process got ahead and detonate it 
first...
    ```
    2023-04-20 17:03:26.380438 
UTC,,,p119165,th1614203008,,,,0,con2,,seg-1,,,,sx1,"LOG","00000","DTM 
Started",,,,,,,0,,"cdbdtxrecovery.c”,155
    ...
    2023-04-20 17:04:18.184150 
UTC,,,p119165,th1614203008,,,,0,con2,,seg-1,,,,sx1,"LOG","XX009","fault 
triggered, fault name:'make_dispatch_result_error' fault type:'skip' 
",,,,,,,0,,"faultinjector.c",475,
    2023-04-20 17:04:18.292955 
UTC,,,p119165,th1614203008,,,,0,con2,,seg-1,,,,sx1,"FATAL","XX000","could not 
allocate resources for segworker communication
    2023-04-20 17:04:18.293370 
UTC,,,p119165,th1614203008,,,,0,con132,,seg-1,,,,sx1,"LOG","00000","The 
previous session was reset because its gang was disconnected (session id = 2). 
The new session id = 132
    ```
    ...so the intended backend didn't:
    ```
    3-04-20 17:04:18.175160 
UTC,"gpadmin","isolation2test",p122404,th1614203008,"[local]",,2023-04-20 
17:04:18 UTC,0,con129,cmd2,seg-1,,,,sx1,"LOG","00000","injected fault 
'make_dispatch_result_error' type 'skip'",,,,,,"select 
gp_inject_fault('make_dispatch_result_error', 'skip', dbid) from 
gp_segment_configuration where role = 'p' and content = 
-1;",0,,"faultinjector.c",1077,
    ...
    2023-04-20 17:04:18.187420 
UTC,"gpadmin","isolation2test",p122404,th1614203008,"[local]",,2023-04-20 
17:04:18 UTC,0,con129,cmd3,seg-1,,,,sx1,"LOG","00000","statement: select 
count(*) > 0 from gp_dist_random('pg_class');",,,,,,,0,,"postgres.c",1729
    ```
    
    The fix is to skip all but a few injected faults in the dtx recovery
    process, like how we handle the autovacuum process. Refactor the code
    a little bit to make the skip conditions more clear. Also added notes/logs
    help dev who wants to test fault injection with background process in 
future.
---
 gpcontrib/gp_inject_fault/README                   |  4 ++
 gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql |  3 +-
 src/backend/cdb/cdbdtxrecovery.c                   |  5 ++
 src/backend/utils/misc/faultinjector.c             | 54 ++++++++++++++++------
 src/include/cdb/cdbtm.h                            |  8 ++--
 5 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/gpcontrib/gp_inject_fault/README b/gpcontrib/gp_inject_fault/README
index 14735753ac..89bf85c9da 100644
--- a/gpcontrib/gp_inject_fault/README
+++ b/gpcontrib/gp_inject_fault/README
@@ -166,3 +166,7 @@ the warning callback function needs to determine whether 
the FTS
 process is enabled on the Segment node, we need to synchronize the
 information on whether the FTS on the Master is enabled to the
 corresponding QE process.
+================================
+* We let some background process ignore all but a few faults. If one wants
+to test fault injection in background processese, add the exception in
+checkBgProcessSkipFault().
diff --git a/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql 
b/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql
index 775e7ec9db..4c64ee4b57 100644
--- a/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql
+++ b/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql
@@ -3,7 +3,8 @@
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION gp_fault_inject" to load this file. \quit
 
-CREATE FUNCTION gp_inject_fault(
+-- NOTE: we let some background process ignore all but a few faults (check 
checkBgProcessSkipFault()).
+CREATE FUNCTION @[email protected]_inject_fault(
   faultname text,
   type text,
   ddl text,
diff --git a/src/backend/cdb/cdbdtxrecovery.c b/src/backend/cdb/cdbdtxrecovery.c
index eb30598b31..7a6f6c5d01 100644
--- a/src/backend/cdb/cdbdtxrecovery.c
+++ b/src/backend/cdb/cdbdtxrecovery.c
@@ -67,6 +67,11 @@ static void doAbortInDoubt(char *gid);
 static bool doNotifyCommittedInDoubt(char *gid);
 static void AbortOrphanedPreparedTransactions(void);
 
+bool IsDtxRecoveryProcess()
+{
+       return getpid() == DtxRecoveryPID();
+}
+
 static bool
 doNotifyCommittedInDoubt(char *gid)
 {
diff --git a/src/backend/utils/misc/faultinjector.c 
b/src/backend/utils/misc/faultinjector.c
index 75bd138226..3aa202dc41 100644
--- a/src/backend/utils/misc/faultinjector.c
+++ b/src/backend/utils/misc/faultinjector.c
@@ -250,6 +250,42 @@ FaultInjector_ShmemInit(void)
        return;                                           
 }
 
+static bool
+checkBgProcessSkipFault(const char* faultName)
+{
+       if (IsAutoVacuumLauncherProcess())
+       {
+               /* autovacuum launcher process */
+               elog(LOG, "skipped fault '%s' in autovacuum launcher process", 
faultName);
+               return true;
+       }
+       else if (IsAutoVacuumWorkerProcess())
+       {
+               /* autovacuum worker process */
+                if (0 != strcmp("vacuum_update_dat_frozen_xid", faultName) &&
+                               0 != 
strcmp("auto_vac_worker_before_do_autovacuum", faultName) &&
+                               0 != 
strcmp("auto_vac_worker_after_report_activity", faultName) &&
+                               0 != strcmp("auto_vac_worker_abort", faultName) 
&&
+                               0 != strcmp("analyze_after_hold_lock", 
faultName) &&
+                               0 != strcmp("analyze_finished_one_relation", 
faultName))
+               {
+                       elog(LOG, "skipped fault '%s' in autovacuum worker 
process", faultName);
+                       return true;
+               }
+       }
+       else if (IsDtxRecoveryProcess())
+       {
+               /* dtx recovery process */
+                if (0 != strcmp("before_orphaned_check", faultName) &&
+                               0 != strcmp("after_orphaned_check", faultName))
+               {
+                       elog(LOG, "skipped fault '%s' in dtx recovery process", 
faultName);
+                       return true;
+               }
+       }
+       return false;
+}
+
 FaultInjectorType_e
 FaultInjector_InjectFaultIfSet_out_of_line(
                                                           const char*          
                 faultName,
@@ -276,20 +312,12 @@ FaultInjector_InjectFaultIfSet_out_of_line(
                elog(ERROR, "table name too long: '%s'", tableName);
 
        /*
-        * Auto-vacuum worker and launcher process, may run at unpredictable 
times
-        * while running tests. So, skip setting any faults for auto-vacuum
-        * launcher or worker. If anytime in future need to test these processes
-        * using fault injector framework, this restriction needs to be lifted 
and
-        * some other mechanism needs to be placed to avoid flaky failures.
+        * Some background processes may run at unpredictable times and hit 
faults that
+        * are desired for other backends. So, skip setting any faults for the 
processes
+        * we've found problems with, except for a few faults that we want to 
test for
+        * those processes. 
         */
-       if (IsAutoVacuumLauncherProcess() ||
-               (IsAutoVacuumWorkerProcess() &&
-                !(0 == strcmp("vacuum_update_dat_frozen_xid", faultName) ||
-                  0 == strcmp("auto_vac_worker_before_do_autovacuum", 
faultName) ||
-                  0 == strcmp("auto_vac_worker_after_report_activity", 
faultName) ||
-                  0 == strcmp("auto_vac_worker_abort", faultName) ||
-                  0 == strcmp("analyze_after_hold_lock", faultName) ||
-                  0 == strcmp("analyze_finished_one_relation", faultName))))
+       if (checkBgProcessSkipFault(faultName))
                return FaultInjectorTypeNotSpecified;
 
        /*
diff --git a/src/include/cdb/cdbtm.h b/src/include/cdb/cdbtm.h
index 5a8b3b5d73..e38edc5efe 100644
--- a/src/include/cdb/cdbtm.h
+++ b/src/include/cdb/cdbtm.h
@@ -296,9 +296,11 @@ extern slock_t *shmGxidGenLock;
 extern DistributedTransactionId *shmCommittedGxidArray;
 extern volatile int *shmNumCommittedGxacts;
 
-extern const char *DtxStateToString(DtxState state);
-extern const char *DtxProtocolCommandToString(DtxProtocolCommand command);
-extern const char *DtxContextToString(DtxContext context);
+extern bool IsDtxRecoveryProcess(void);
+
+extern char *DtxStateToString(DtxState state);
+extern char *DtxProtocolCommandToString(DtxProtocolCommand command);
+extern char *DtxContextToString(DtxContext context);
 extern void dtxDeformGid(const char    *gid,
                                                        
DistributedTransactionId                *distribXid);
 extern void dtxFormGid(char *gid, DistributedTransactionId gxid);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to