This is an automated email from the ASF dual-hosted git repository. avamingli pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 6413927d1ae40db09825488ef11b747be4114692 Author: Huansong Fu <[email protected]> AuthorDate: Thu Apr 20 12:06:52 2023 -0700 Do not trigger fault in dtx recovery process except a few Reason for the change is due to a flaky regress test 'gpdispatch': ``` --- /tmp/build/e18b2f02/gpdb_src/src/test/isolation2/expected/gpdispatch_1.out 2023-04-20 17:04:33.775900075 +0000 +++ /tmp/build/e18b2f02/gpdb_src/src/test/isolation2/results/gpdispatch.out 2023-04-20 17:04:33.791901566 +0000 @@ -25,8 +25,10 @@ -- session1 will be fatal. 1: select count(*) > 0 from gp_dist_random('pg_class'); -FATAL: could not allocate resources for segworker communication -server closed the connection unexpectedly + ?column? +---------- + t +(1 row) ``` The reason is that the dtx recovery process got ahead and detonate it first... ``` 2023-04-20 17:03:26.380438 UTC,,,p119165,th1614203008,,,,0,con2,,seg-1,,,,sx1,"LOG","00000","DTM Started",,,,,,,0,,"cdbdtxrecovery.c”,155 ... 2023-04-20 17:04:18.184150 UTC,,,p119165,th1614203008,,,,0,con2,,seg-1,,,,sx1,"LOG","XX009","fault triggered, fault name:'make_dispatch_result_error' fault type:'skip' ",,,,,,,0,,"faultinjector.c",475, 2023-04-20 17:04:18.292955 UTC,,,p119165,th1614203008,,,,0,con2,,seg-1,,,,sx1,"FATAL","XX000","could not allocate resources for segworker communication 2023-04-20 17:04:18.293370 UTC,,,p119165,th1614203008,,,,0,con132,,seg-1,,,,sx1,"LOG","00000","The previous session was reset because its gang was disconnected (session id = 2). The new session id = 132 ``` ...so the intended backend didn't: ``` 3-04-20 17:04:18.175160 UTC,"gpadmin","isolation2test",p122404,th1614203008,"[local]",,2023-04-20 17:04:18 UTC,0,con129,cmd2,seg-1,,,,sx1,"LOG","00000","injected fault 'make_dispatch_result_error' type 'skip'",,,,,,"select gp_inject_fault('make_dispatch_result_error', 'skip', dbid) from gp_segment_configuration where role = 'p' and content = -1;",0,,"faultinjector.c",1077, ... 2023-04-20 17:04:18.187420 UTC,"gpadmin","isolation2test",p122404,th1614203008,"[local]",,2023-04-20 17:04:18 UTC,0,con129,cmd3,seg-1,,,,sx1,"LOG","00000","statement: select count(*) > 0 from gp_dist_random('pg_class');",,,,,,,0,,"postgres.c",1729 ``` The fix is to skip all but a few injected faults in the dtx recovery process, like how we handle the autovacuum process. Refactor the code a little bit to make the skip conditions more clear. Also added notes/logs help dev who wants to test fault injection with background process in future. --- gpcontrib/gp_inject_fault/README | 4 ++ gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql | 3 +- src/backend/cdb/cdbdtxrecovery.c | 5 ++ src/backend/utils/misc/faultinjector.c | 54 ++++++++++++++++------ src/include/cdb/cdbtm.h | 8 ++-- 5 files changed, 57 insertions(+), 17 deletions(-) diff --git a/gpcontrib/gp_inject_fault/README b/gpcontrib/gp_inject_fault/README index 14735753ac..89bf85c9da 100644 --- a/gpcontrib/gp_inject_fault/README +++ b/gpcontrib/gp_inject_fault/README @@ -166,3 +166,7 @@ the warning callback function needs to determine whether the FTS process is enabled on the Segment node, we need to synchronize the information on whether the FTS on the Master is enabled to the corresponding QE process. +================================ +* We let some background process ignore all but a few faults. If one wants +to test fault injection in background processese, add the exception in +checkBgProcessSkipFault(). diff --git a/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql b/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql index 775e7ec9db..4c64ee4b57 100644 --- a/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql +++ b/gpcontrib/gp_inject_fault/gp_inject_fault--1.0.sql @@ -3,7 +3,8 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION gp_fault_inject" to load this file. \quit -CREATE FUNCTION gp_inject_fault( +-- NOTE: we let some background process ignore all but a few faults (check checkBgProcessSkipFault()). +CREATE FUNCTION @[email protected]_inject_fault( faultname text, type text, ddl text, diff --git a/src/backend/cdb/cdbdtxrecovery.c b/src/backend/cdb/cdbdtxrecovery.c index eb30598b31..7a6f6c5d01 100644 --- a/src/backend/cdb/cdbdtxrecovery.c +++ b/src/backend/cdb/cdbdtxrecovery.c @@ -67,6 +67,11 @@ static void doAbortInDoubt(char *gid); static bool doNotifyCommittedInDoubt(char *gid); static void AbortOrphanedPreparedTransactions(void); +bool IsDtxRecoveryProcess() +{ + return getpid() == DtxRecoveryPID(); +} + static bool doNotifyCommittedInDoubt(char *gid) { diff --git a/src/backend/utils/misc/faultinjector.c b/src/backend/utils/misc/faultinjector.c index 75bd138226..3aa202dc41 100644 --- a/src/backend/utils/misc/faultinjector.c +++ b/src/backend/utils/misc/faultinjector.c @@ -250,6 +250,42 @@ FaultInjector_ShmemInit(void) return; } +static bool +checkBgProcessSkipFault(const char* faultName) +{ + if (IsAutoVacuumLauncherProcess()) + { + /* autovacuum launcher process */ + elog(LOG, "skipped fault '%s' in autovacuum launcher process", faultName); + return true; + } + else if (IsAutoVacuumWorkerProcess()) + { + /* autovacuum worker process */ + if (0 != strcmp("vacuum_update_dat_frozen_xid", faultName) && + 0 != strcmp("auto_vac_worker_before_do_autovacuum", faultName) && + 0 != strcmp("auto_vac_worker_after_report_activity", faultName) && + 0 != strcmp("auto_vac_worker_abort", faultName) && + 0 != strcmp("analyze_after_hold_lock", faultName) && + 0 != strcmp("analyze_finished_one_relation", faultName)) + { + elog(LOG, "skipped fault '%s' in autovacuum worker process", faultName); + return true; + } + } + else if (IsDtxRecoveryProcess()) + { + /* dtx recovery process */ + if (0 != strcmp("before_orphaned_check", faultName) && + 0 != strcmp("after_orphaned_check", faultName)) + { + elog(LOG, "skipped fault '%s' in dtx recovery process", faultName); + return true; + } + } + return false; +} + FaultInjectorType_e FaultInjector_InjectFaultIfSet_out_of_line( const char* faultName, @@ -276,20 +312,12 @@ FaultInjector_InjectFaultIfSet_out_of_line( elog(ERROR, "table name too long: '%s'", tableName); /* - * Auto-vacuum worker and launcher process, may run at unpredictable times - * while running tests. So, skip setting any faults for auto-vacuum - * launcher or worker. If anytime in future need to test these processes - * using fault injector framework, this restriction needs to be lifted and - * some other mechanism needs to be placed to avoid flaky failures. + * Some background processes may run at unpredictable times and hit faults that + * are desired for other backends. So, skip setting any faults for the processes + * we've found problems with, except for a few faults that we want to test for + * those processes. */ - if (IsAutoVacuumLauncherProcess() || - (IsAutoVacuumWorkerProcess() && - !(0 == strcmp("vacuum_update_dat_frozen_xid", faultName) || - 0 == strcmp("auto_vac_worker_before_do_autovacuum", faultName) || - 0 == strcmp("auto_vac_worker_after_report_activity", faultName) || - 0 == strcmp("auto_vac_worker_abort", faultName) || - 0 == strcmp("analyze_after_hold_lock", faultName) || - 0 == strcmp("analyze_finished_one_relation", faultName)))) + if (checkBgProcessSkipFault(faultName)) return FaultInjectorTypeNotSpecified; /* diff --git a/src/include/cdb/cdbtm.h b/src/include/cdb/cdbtm.h index 5a8b3b5d73..e38edc5efe 100644 --- a/src/include/cdb/cdbtm.h +++ b/src/include/cdb/cdbtm.h @@ -296,9 +296,11 @@ extern slock_t *shmGxidGenLock; extern DistributedTransactionId *shmCommittedGxidArray; extern volatile int *shmNumCommittedGxacts; -extern const char *DtxStateToString(DtxState state); -extern const char *DtxProtocolCommandToString(DtxProtocolCommand command); -extern const char *DtxContextToString(DtxContext context); +extern bool IsDtxRecoveryProcess(void); + +extern char *DtxStateToString(DtxState state); +extern char *DtxProtocolCommandToString(DtxProtocolCommand command); +extern char *DtxContextToString(DtxContext context); extern void dtxDeformGid(const char *gid, DistributedTransactionId *distribXid); extern void dtxFormGid(char *gid, DistributedTransactionId gxid); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
