This is an automated email from the ASF dual-hosted git repository.

yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit a3f17fe8ab64faa21709b2b7d86aa5dc14ecb1c2
Author: Huansong Fu <fuhuans...@gmail.com>
AuthorDate: Fri Jun 30 14:53:53 2023 -0700

    Fix a flaky test die_commit_pending_replication
    
    The test was flaky with a diff like:
    
    --- 
/tmp/build/e18b2f02/gpdb_src/src/test/isolation2/expected/segwalrep/die_commit_pending_replication.out
      2023-06-30 20:21:28.988978262 +0000
    +++ 
/tmp/build/e18b2f02/gpdb_src/src/test/isolation2/results/segwalrep/die_commit_pending_replication.out
       2023-06-30 20:21:28.996979056 +0000
    @@ -60,8 +60,7 @@
     0U: select pg_terminate_backend(pid) from pg_stat_activity where 
wait_event='SyncRep' and sess_id in (select sess_id from store_session_id);
      pg_terminate_backend
     ------------
    - t
    -(1 row)
    +(0 rows)
    
     -- We expect two more occurrence: one for backend quitting and another for 
retry.
     select gp_wait_until_triggered_fault('sync_rep_query_die', 3, dbid) from 
gp_segment_configuration where role='p' and content = 0;
    @@ -72,9 +71,10 @@
    
     -- Verify that the sess_id changes due to retry.
     0U: select pid,sess_id,wait_event,query from pg_stat_activity where 
sess_id in (select sess_id from store_session_id);
    - pid | sess_id | wait_event | query
    ------+---------+------------+-------
    -(0 rows)
    + pid    | sess_id | wait_event | query
    
+--------+---------+------------+-----------------------------------------------------------
    + 245509 | 133     | SyncRep    | insert into 
die_commit_pending_replication values(2),(1);
    +(1 row)
    
    It is obvious that we checked pg_stat_activity for the SyncRep row too early:
    the first gp_inject_fault_infinite for the 'sync_rep_query_die' fault was hit prematurely.
    Looking at the logs we can see that:
    
    2023-06-30 20:20:23.443932 
UTC,"gpadmin","isolation2test",p245520,th254449792,"10.254.0.6","39884",2023-06-30
 20:20:23 UTC,420181416,con135,cmd5,seg0,slice1,,x420181416,sx1,"LOG","XX009",
    "fault triggered, fault name:'sync_rep_query_die' fault type:'skip' 
",,,,,,"select pg_catalog.gp_acquire_sample_rows(368819, 10000, 
'f');",0,,"faultinjector.c",503,
    2023-06-30 20:20:23.549586 
UTC,"gpadmin",,p245530,th254449792,"10.254.0.6","39898",2023-06-30 20:20:23 
UTC,0,,,seg0,,,,,"LOG","XX009",
    "fault triggered 1 times, fault name:'sync_rep_query_die' fault 
type:'wait_until_triggered' ",,,,,,,0,,"faultinjector.c",868,
    
    The fault was triggered by an unrelated query that appears to be part of autoanalyze.
    So running this test without autovacuum should fix the issue.
---
 src/test/isolation2/isolation2_schedule | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule
index ff9f861ff0..fbd44795db 100644
--- a/src/test/isolation2/isolation2_schedule
+++ b/src/test/isolation2/isolation2_schedule
@@ -215,13 +215,16 @@ test: pg_basebackup_with_tablespaces
 test: pg_basebackup_large_database_oid
 test: vacuum_progress_row
 test: vacuum_progress_column
+
+# this test contains some fault injection that might get mis-hit by autovacuum process,
+# so it needs to be run with autovacuum being disabled
+test: segwalrep/die_commit_pending_replication
+
 test: enable_autovacuum
 test: idle_gang_cleaner
 # test idle_in_transaction_session_timeout
 test: write_gang_idle_in_transaction_session_timeout
 
-test: segwalrep/die_commit_pending_replication
-
 # Tests for FTS
 test: fts_errors
 test: segwalrep/replication_keeps_crash


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cloudberry.apache.org
For additional commands, e-mail: commits-h...@cloudberry.apache.org

Reply via email to