This is an automated email from the ASF dual-hosted git repository. avamingli pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/cloudberry.git
commit 2b79a1b0883a646869dad1a02a1c34e45a7ffefa Author: Alexandra Wang <[email protected]> AuthorDate: Tue Apr 25 20:50:14 2023 -0700 Primary should stop waiting for replication when mirror is stopped Previously, commit 611b788d added throttling for SELECT and other read operations that call MarkBufferDirtyHint(). Since these operations generate a lot of Full Page Image (FPI) WALs, the throttling is needed to avoid large replication lag. The throttling function `wait_to_avoid_large_repl_lag()` calls `SyncRepWaitForLSN()`, in which the primary waits for the mirror to catch up to a given LSN. However, the throttling function (wait for replication) was added in a critical section during which PGXACT->delayChkpt is set, and when this variable is set, the checkpointer process will pause. This would cause a deadlock situation if the mirror is stopped. Previous deadlock scenario: - Backend process of a SELECT query sets PGXACT->delayChkpt and is throttled (waits for synchronous replication) - Checkpointer process is paused because PGXACT->delayChkpt is set - Mirror stopped before it caught up with the primary; FTS detected the mirror was down and sent the FTS_MSG_SYNCREP_OFF message to the primary. - Checkpointer process is stuck because PGXACT->delayChkpt is still set, therefore it cannot proceed to unset WalSndCtl->sync_standbys_defined - Backend process of the SELECT query is stuck waiting for replication because WalSndCtl->sync_standbys_defined is still true, therefore it cannot proceed to unset PGXACT->delayChkpt Expected scenario after fix: - Backend process of a SELECT query sets PGXACT->delayChkpt, marks the buffer dirty hint, and unsets PGXACT->delayChkpt, then is throttled (waits for synchronous replication) - Checkpointer process proceeds while the SELECT backend is waiting for synchronous replication - Mirror stopped - Checkpointer updates the shared memory variable WalSndCtl->sync_standbys_defined to false, so that the SELECT backend stops waiting for synchronous replication and proceeds. 
Reported-by: Ashwin Agrawal <aashwin@vmware> Co-authored-by: Marbin Tan <[email protected]> Reviewed-by: Ashwin Agrawal <aashwin@vmware> Reviewed-by: Soumyadeep Chakraborty <[email protected]> --- src/backend/access/transam/xlog.c | 7 + src/backend/access/transam/xloginsert.c | 2 - src/backend/storage/buffer/bufmgr.c | 8 + .../expected/segwalrep/select_throttle.out | 690 ++++++++++++++++++++- src/test/isolation2/expected/setup.out | 4 + .../isolation2/sql/segwalrep/select_throttle.sql | 86 ++- src/test/isolation2/sql/setup.sql | 20 + 7 files changed, 795 insertions(+), 22 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 15ea696d1c..24810a2de0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -13888,10 +13888,17 @@ initialize_wal_bytes_written(void) * transactions starving concurrent transactions from commiting due to sync * rep. This interface provides a way for primary to avoid racing forward with * WAL generation and move at sustained speed with network and mirrors. + * + * NB: This function should never be called from inside a critical section, + * meaning caller should never have MyPgXact->delayChkpt set to true. Otherwise, + * if mirror is down, we will end up in a deadlock situation between the primary + * and the checkpointer process, because if MyPgXact->delayChkpt is set, + * checkpointer cannot proceed to unset WalSndCtl->sync_standbys_defined. 
*/ void wait_to_avoid_large_repl_lag(void) { + Assert(!MyPgXact->delayChkpt); /* rep_lag_avoidance_threshold is defined in KB */ if (rep_lag_avoidance_threshold && wal_bytes_written > (rep_lag_avoidance_threshold * 1024)) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 98505ea675..73b93fda91 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -1020,8 +1020,6 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data, flags); recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT); - - wait_to_avoid_large_repl_lag(); } return recptr; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 577d120e21..4b1c43c450 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4243,7 +4243,15 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) UnlockBufHdr(bufHdr, buf_state); if (delayChkpt) + { MyProc->delayChkpt = false; + /* + * Wait for wal replication only after checkpoiter is no longer + * delayed by us. Otherwise, we might end up in a deadlock situation + * if mirror is marked down while we are waiting for wal replication + */ + wait_to_avoid_large_repl_lag(); + } if (dirtied) { diff --git a/src/test/isolation2/expected/segwalrep/select_throttle.out b/src/test/isolation2/expected/segwalrep/select_throttle.out index 7e5ad17201..1eea7e35d1 100644 --- a/src/test/isolation2/expected/segwalrep/select_throttle.out +++ b/src/test/isolation2/expected/segwalrep/select_throttle.out @@ -1,19 +1,15 @@ -- Test: SELECT or other read-only operations which set hint bits on pages, -- and in turn generate Full Page Images (FPI) WAL records should be throttled. -- We will only throttle when our transaction wal exceeds --- wait_for_replication_threshold --- For this test we will: +-- wait_for_replication_threshold. 
While the backend is throttled waiting for +-- synchronous replication, it should not block the CHECKPOINT process, and if +-- the mirror is taken down, synchronous replication should be turned off and +-- the backend should stop waiting for synchronous replication and proceed. + +-- Setup: -- 1. Set wait_for_replication_threshold to 1kB for quicker test -- 2. create two tables (one small and one large) -- 3. set gp_disable_tuple_hints=off so buffer will be immediately marked dirty on hint bit change --- 4. Suspend walsender --- 5. Perform a read-only operation (SELECT) which would now set the hint bits --- For the small table this operation should finish, --- but for large table the SELECT should be throttled --- since it would generate a lot of WAL greater than wait_for_replication_threshold --- 6. Confirm that the query is waiting on Syncrep --- 7. Reset the walsender and the transaction should complete --- -- set wait_for_replication_threshold to 1kB for quicker test ALTER SYSTEM SET wait_for_replication_threshold = 1; @@ -38,6 +34,15 @@ INSERT 100000 1U: SET gp_disable_tuple_hints=off; SET +-- Test 1: +-- 1. Suspend walsender +-- 2. Perform a read-only operation (SELECT) which would now set the hint bits +-- For the small table this operation should finish, +-- but for large table the SELECT should be throttled +-- since it would generate a lot of WAL greater than wait_for_replication_threshold +-- 3. Confirm that the query is waiting on Syncrep +-- 4. Reset the walsender and the transaction should complete + -- flush the data to disk checkpoint; CHECKPOINT @@ -87,9 +92,146 @@ SELECT wait_until_all_segments_synchronized(); -------------------------------------- OK (1 row) + +-- Test 2: +-- 1. Suspend walsender +-- 2. Perform a read-only operation (SELECT) which would now set the hint bits +-- For the large table the SELECT should be throttled +-- since it would generate a lot of WAL greater than wait_for_replication_threshold +-- 3. 
Confirm that the query is waiting on Syncrep +-- 4. Perform CHECKPOINT and confirm that it does not block +-- 5. Stop the mirror +-- 6. Reset the walsender and the transaction should complete without waiting for syncrep + +-- Setup: +-- set mirror down grace period to zero to instantly mark mirror down. +-- the 1Uq and 1U pair will force a wait on the config reload. +!\retcode gpconfig -c gp_fts_mark_mirror_down_grace_period -v 2; +-- start_ignore +20230428:13:32:51:1966993 gpconfig:station6:pivotal-[INFO]:-completed successfully with parameters '-c gp_fts_mark_mirror_down_grace_period -v 0' + +-- end_ignore +(exited with code 0) +!\retcode gpstop -u; +-- start_ignore +20230428:13:32:51:1967400 gpstop:station6:pivotal-[INFO]:-Starting gpstop with args: -u +20230428:13:32:51:1967400 gpstop:station6:pivotal-[INFO]:-Gathering information and validating the environment... +20230428:13:32:51:1967400 gpstop:station6:pivotal-[INFO]:-Obtaining Greenplum Coordinator catalog information +20230428:13:32:51:1967400 gpstop:station6:pivotal-[INFO]:-Obtaining Segment details from coordinator... +20230428:13:32:51:1967400 gpstop:station6:pivotal-[INFO]:-Greenplum Version: 'postgres (Greenplum Database) 7.0.0-beta.1+dev.82.g0281df93bb6 build dev' +20230428:13:32:51:1967400 gpstop:station6:pivotal-[INFO]:-Signalling all postmaster processes to reload + +-- end_ignore +(exited with code 0) +1Uq: ... <quitting> +1U: show gp_fts_mark_mirror_down_grace_period; + gp_fts_mark_mirror_down_grace_period +-------------------------------------- + 2s +(1 row) +-- Enable tuple hints so that buffer will be marked dirty upon a hint bit change +-- (so that we don't have to wait for the tuple to age. 
See logic in markDirty) +1U: SET gp_disable_tuple_hints=off; +SET +Truncate select_throttle; +TRUNCATE +INSERT INTO select_throttle SELECT generate_series (1, 100000); +INSERT 100000 +-- flush the data to disk +checkpoint; +CHECKPOINT +-- Suspend walsender +SELECT gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +checkpoint; +CHECKPOINT +-- SELECT will set the hint bit on (the buffer will be marked dirty) +-- This query should wait for Syncrep since its WAL size for hint bits is greater than wait_for_replication_threshold +1U&: SELECT count(*) FROM select_throttle; <waiting ...> + +-- check if the above query is waiting on SyncRep in pg_stat_activity +SELECT is_query_waiting_for_syncrep(50, 'SELECT count(*) FROM select_throttle;'); + is_query_waiting_for_syncrep +------------------------------ + t +(1 row) + +-- while SELECT is waiting for syncrep, it should not block a subsequent checkpoint +CHECKPOINT; +CHECKPOINT + +-- stop the mirror should turn off syncrep +SELECT pg_ctl(datadir, 'stop', 'immediate') FROM gp_segment_configuration WHERE content=1 AND role = 'm'; + pg_ctl +-------- + OK +(1 row) + +-- reset walsender and let it exit so that mirror stop can be detected +SELECT gp_inject_fault_infinite('wal_sender_loop', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- perform fts probe scan and verify that mirror is down +select wait_for_mirror_down(1::smallint, 30); + wait_for_mirror_down +---------------------- + t +(1 row) +select content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + content | role | preferred_role | mode | status +---------+------+----------------+------+-------- + 1 | p | p | n | u + 1 | m | m | n | d +(2 rows) + +-- after mirror is stopped, 
the SELECT query should proceed without waiting for syncrep +1U<: <... completed> + count +------- + 33327 +(1 row) + +-- Cleanup 1U: RESET gp_disable_tuple_hints; RESET +-- reset the mirror down grace period back to its default value. +-- the 1Uq and 1U pair will force a wait on the config reload. +!\retcode gpconfig -r gp_fts_mark_mirror_down_grace_period; +-- start_ignore +20230428:13:32:53:1967437 gpconfig:station6:pivotal-[INFO]:-completed successfully with parameters '-r gp_fts_mark_mirror_down_grace_period' + +-- end_ignore +(exited with code 0) +!\retcode gpstop -u; +-- start_ignore +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:-Starting gpstop with args: -u +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:-Gathering information and validating the environment... +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:-Obtaining Greenplum Coordinator catalog information +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:-Obtaining Segment details from coordinator... +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:-Greenplum Version: 'postgres (Greenplum Database) 7.0.0-beta.1+dev.82.g0281df93bb6 build dev' +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:-Signalling all postmaster processes to reload +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:--------------------------------------------- +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:-Some segment postmasters were not reloaded +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:--------------------------------------------- +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:- Host Datadir Port Status +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:- station6 /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 7006 d +20230428:13:32:53:1967844 gpstop:station6:pivotal-[INFO]:--------------------------------------------- + +-- end_ignore +(exited with code 1) 1Uq: ... 
<quitting> +1U: show gp_fts_mark_mirror_down_grace_period; + gp_fts_mark_mirror_down_grace_period +-------------------------------------- + 30s +(1 row) ALTER SYSTEM RESET wait_for_replication_threshold; ALTER @@ -98,3 +240,531 @@ SELECT pg_reload_conf(); ---------------- t (1 row) + +!\retcode gprecoverseg -av; +-- start_ignore +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[INFO]:-Starting gprecoverseg with args: -av +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool() initialized with 16 workers +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Obtaining coordinator's port from coordinator data directory +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Read from postgresql.conf port=7000 +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Read from postgresql.conf max_connections=150 +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: $GPHOME/bin/postgres --gp-version +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[INFO]:-local Greenplum Version: 'postgres (Greenplum Database) 7.0.0-beta.1+dev.82.g0281df93bb6 build dev' +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Connecting to db template1 on host localhost +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[INFO]:-coordinator Greenplum Version: 'PostgreSQL 12.12 (Greenplum Database 7.0.0-beta.1+dev.82.g0281df93bb6 build dev) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0, 64-bit compiled on Apr 27 2023 17:48:22 (with assert checking) Bhuvnesh C.' +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[INFO]:-Obtaining Segment details from coordinator... +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Connecting to db template1 on host localhost +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Validating configuration... 
+20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool() initialized with 1 workers +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: ssh station6 'echo station6' +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got cmd: ssh station6 'echo station6' +20230428:13:32:53:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: ssh station6 'echo station6' +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] finished cmd: check station6 is up cmdStr='ssh station6 'echo station6'' had result: cmd had rc=0 completed=True halted=False + stdout='station6 +' + stderr='' +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool haltWork() +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] haltWork +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got a halt cmd +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Connecting to db template1 on host localhost +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-No basebackup running +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Checking for running instances of pg_rewind with host station6 and port 7003 as source server +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Connecting to db template1 on host station6 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool() initialized with 1 workers +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got 
cmd: $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] finished cmd: run pg_controldata cmdStr='ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 station6 ". /usr/local/gpdb/greenplum_path.sh; $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1"' had result: cmd had rc=0 completed=True halted=False + stdout='pg_control version number: 12010700 +Catalog version number: 302304041 +Database system identifier: 7227176750494621446 +Database cluster state: in production +pg_control last modified: Fri 28 Apr 2023 01:32:52 PM PDT +Latest checkpoint location: 0/1D0EC0A8 +Latest checkpoint's REDO location: 0/1D0EC070 +Latest checkpoint's REDO WAL file: 000000010000000000000007 +Latest checkpoint's TimeLineID: 1 +Latest checkpoint's PrevTimeLineID: 1 +Latest checkpoint's full_page_writes: on +Latest checkpoint's NextXID: 0:1979 +Latest checkpoint's NextGxid: 1678 +Latest checkpoint's NextOID: 24623 +Latest checkpoint's NextRelfilenode: 32834 +Latest checkpoint's NextMultiXactId: 1 +Latest checkpoint's NextMultiOffset: 0 +Latest checkpoint's oldestXID: 531 +Latest checkpoint's oldestXID's DB: 1 +Latest checkpoint's oldestActiveXID: 1979 +Latest checkpoint's oldestMultiXid: 1 +Latest checkpoint's oldestMulti's DB: 1 +Latest checkpoint's oldestCommitTsXid:0 +Latest checkpoint's newestCommitTsXid:0 +Time of latest checkpoint: Fri 28 Apr 2023 01:32:52 PM PDT +Fake LSN counter for unlogged rels: 0/3E8 +Minimum recovery ending location: 0/0 +Min recovery ending loc's timeline: 0 +Backup start location: 0/0 +Backup end location: 0/0 +End-of-backup record required: no +wal_level setting: replica +wal_log_hints setting: off 
+max_connections setting: 750 +max_worker_processes setting: 12 +max_wal_senders setting: 10 +max_prepared_xacts setting: 250 +max_locks_per_xact setting: 128 +track_commit_timestamp setting: off +Maximum data alignment: 8 +Database block size: 32768 +Blocks per segment of large relation: 32768 +WAL block size: 32768 +Bytes per WAL segment: 67108864 +Maximum length of identifiers: 64 +Maximum columns in an index: 32 +Maximum size of a TOAST chunk: 8140 +Size of a large-object chunk: 8192 +Date/time type storage: 64-bit integers +Float4 argument passing: by value +Float8 argument passing: by value +Data page checksum version: 1 +Mock authentication nonce: e3a5af22bfbe2ac44c969f4724f7c7a50599226317759207240cd019584d55e6 +' + stderr='' +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got cmd: $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] finished cmd: run pg_controldata cmdStr='ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 station6 ". 
/usr/local/gpdb/greenplum_path.sh; $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1"' had result: cmd had rc=0 completed=True halted=False + stdout='pg_control version number: 12010700 +Catalog version number: 302304041 +Database system identifier: 7227176750494621446 +Database cluster state: in archive recovery +pg_control last modified: Fri 28 Apr 2023 01:32:42 PM PDT +Latest checkpoint location: 0/15B48EC0 +Latest checkpoint's REDO location: 0/15B48EC0 +Latest checkpoint's REDO WAL file: 000000010000000000000005 +Latest checkpoint's TimeLineID: 1 +Latest checkpoint's PrevTimeLineID: 1 +Latest checkpoint's full_page_writes: on +Latest checkpoint's NextXID: 0:1977 +Latest checkpoint's NextGxid: 968 +Latest checkpoint's NextOID: 24623 +Latest checkpoint's NextRelfilenode: 24642 +Latest checkpoint's NextMultiXactId: 1 +Latest checkpoint's NextMultiOffset: 0 +Latest checkpoint's oldestXID: 531 +Latest checkpoint's oldestXID's DB: 1 +Latest checkpoint's oldestActiveXID: 0 +Latest checkpoint's oldestMultiXid: 1 +Latest checkpoint's oldestMulti's DB: 1 +Latest checkpoint's oldestCommitTsXid:0 +Latest checkpoint's newestCommitTsXid:0 +Time of latest checkpoint: Fri 28 Apr 2023 01:07:42 PM PDT +Fake LSN counter for unlogged rels: 0/3E8 +Minimum recovery ending location: 0/1CEDCCC8 +Min recovery ending loc's timeline: 1 +Backup start location: 0/0 +Backup end location: 0/0 +End-of-backup record required: no +wal_level setting: replica +wal_log_hints setting: off +max_connections setting: 750 +max_worker_processes setting: 12 +max_wal_senders setting: 10 +max_prepared_xacts setting: 250 +max_locks_per_xact setting: 128 +track_commit_timestamp setting: off +Maximum data alignment: 8 +Database block size: 32768 +Blocks per segment of large relation: 32768 +WAL block size: 32768 +Bytes per WAL segment: 67108864 +Maximum length of identifiers: 64 +Maximum columns in an index: 32 +Maximum size of a TOAST chunk: 8140 +Size 
of a large-object chunk: 8192 +Date/time type storage: 64-bit integers +Float4 argument passing: by value +Float8 argument passing: by value +Data page checksum version: 1 +Mock authentication nonce: e3a5af22bfbe2ac44c969f4724f7c7a50599226317759207240cd019584d55e6 +' + stderr='' +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool haltWork() +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] haltWork +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got a halt cmd +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:-Successfully finished pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 for dbid 3: +stdout: pg_control version number: 12010700 +Catalog version number: 302304041 +Database system identifier: 7227176750494621446 +Database cluster state: in production +pg_control last modified: Fri 28 Apr 2023 01:32:52 PM PDT +Latest checkpoint location: 0/1D0EC0A8 +Latest checkpoint's REDO location: 0/1D0EC070 +Latest checkpoint's REDO WAL file: 000000010000000000000007 +Latest checkpoint's TimeLineID: 1 +Latest checkpoint's PrevTimeLineID: 1 +Latest checkpoint's full_page_writes: on +Latest checkpoint's NextXID: 0:1979 +Latest checkpoint's NextGxid: 1678 +Latest checkpoint's NextOID: 24623 +Latest checkpoint's NextRelfilenode: 32834 +Latest checkpoint's NextMultiXactId: 1 +Latest checkpoint's NextMultiOffset: 0 +Latest checkpoint's oldestXID: 531 +Latest checkpoint's oldestXID's DB: 1 +Latest checkpoint's oldestActiveXID: 1979 +Latest checkpoint's oldestMultiXid: 1 +Latest checkpoint's oldestMulti's DB: 1 +Latest checkpoint's oldestCommitTsXid:0 +Latest checkpoint's newestCommitTsXid:0 +Time of latest checkpoint: Fri 28 Apr 2023 01:32:52 PM PDT +Fake LSN counter for unlogged rels: 0/3E8 +Minimum recovery ending location: 0/0 +Min recovery ending loc's timeline: 0 +Backup start location: 0/0 +Backup end location: 0/0 +End-of-backup record 
required: no +wal_level setting: replica +wal_log_hints setting: off +max_connections setting: 750 +max_worker_processes setting: 12 +max_wal_senders setting: 10 +max_prepared_xacts setting: 250 +max_locks_per_xact setting: 128 +track_commit_timestamp setting: off +Maximum data alignment: 8 +Database block size: 32768 +Blocks per segment of large relation: 32768 +WAL block size: 32768 +Bytes per WAL segment: 67108864 +Maximum length of identifiers: 64 +Maximum columns in an index: 32 +Maximum size of a TOAST chunk: 8140 +Size of a large-object chunk: 8192 +Date/time type storage: 64-bit integers +Float4 argument passing: by value +Float8 argument passing: by value +Data page checksum version: 1 +Mock authentication nonce: e3a5af22bfbe2ac44c969f4724f7c7a50599226317759207240cd019584d55e6 + +stderr: +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:-Successfully finished pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 for dbid 6: +stdout: pg_control version number: 12010700 +Catalog version number: 302304041 +Database system identifier: 7227176750494621446 +Database cluster state: in archive recovery +pg_control last modified: Fri 28 Apr 2023 01:32:42 PM PDT +Latest checkpoint location: 0/15B48EC0 +Latest checkpoint's REDO location: 0/15B48EC0 +Latest checkpoint's REDO WAL file: 000000010000000000000005 +Latest checkpoint's TimeLineID: 1 +Latest checkpoint's PrevTimeLineID: 1 +Latest checkpoint's full_page_writes: on +Latest checkpoint's NextXID: 0:1977 +Latest checkpoint's NextGxid: 968 +Latest checkpoint's NextOID: 24623 +Latest checkpoint's NextRelfilenode: 24642 +Latest checkpoint's NextMultiXactId: 1 +Latest checkpoint's NextMultiOffset: 0 +Latest checkpoint's oldestXID: 531 +Latest checkpoint's oldestXID's DB: 1 +Latest checkpoint's oldestActiveXID: 0 +Latest checkpoint's oldestMultiXid: 1 +Latest checkpoint's oldestMulti's DB: 1 +Latest checkpoint's oldestCommitTsXid:0 +Latest checkpoint's 
newestCommitTsXid:0 +Time of latest checkpoint: Fri 28 Apr 2023 01:07:42 PM PDT +Fake LSN counter for unlogged rels: 0/3E8 +Minimum recovery ending location: 0/1CEDCCC8 +Min recovery ending loc's timeline: 1 +Backup start location: 0/0 +Backup end location: 0/0 +End-of-backup record required: no +wal_level setting: replica +wal_log_hints setting: off +max_connections setting: 750 +max_worker_processes setting: 12 +max_wal_senders setting: 10 +max_prepared_xacts setting: 250 +max_locks_per_xact setting: 128 +track_commit_timestamp setting: off +Maximum data alignment: 8 +Database block size: 32768 +Blocks per segment of large relation: 32768 +WAL block size: 32768 +Bytes per WAL segment: 67108864 +Maximum length of identifiers: 64 +Maximum columns in an index: 32 +Maximum size of a TOAST chunk: 8140 +Size of a large-object chunk: 8192 +Date/time type storage: 64-bit integers +Float4 argument passing: by value +Float8 argument passing: by value +Data page checksum version: 1 +Mock authentication nonce: e3a5af22bfbe2ac44c969f4724f7c7a50599226317759207240cd019584d55e6 + +stderr: +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: $GPHOME/bin/pg_controldata /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:-Heap checksum setting is consistent between coordinator and the segments that are candidates for recoverseg +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:-Greenplum instance recovery parameters +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:---------------------------------------------------------- +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:-Recovery type = Standard +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:---------------------------------------------------------- +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:-Recovery 1 of 1 +20230428:13:32:54:1967876 
gprecoverseg:station6:pivotal-[INFO]:---------------------------------------------------------- +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Synchronization mode = Incremental +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Failed instance host = station6 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Failed instance address = station6 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Failed instance directory = /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Failed instance port = 7006 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Recovery Source instance host = station6 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Recovery Source instance address = station6 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Recovery Source instance directory = /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Recovery Source instance port = 7003 +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:- Recovery Target = in-place +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:---------------------------------------------------------- +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[INFO]:-Starting to create new pg_hba.conf on primary segments +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool() initialized with 1 workers +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: $GPHOME/sbin/seg_update_pg_hba.py --data-dir /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 --entries ' +host replication pivotal samehost trust +host all pivotal 10.0.0.234/32 trust +host all pivotal 172.17.0.1/32 trust +host all pivotal 10.16.119.186/32 trust +host all pivotal 
2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host all pivotal 2601:647:4800:c740::b8fa/128 trust +host all pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host all pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust +host all pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host all pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host all pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host all pivotal 2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host all pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust +host replication pivotal 10.0.0.234/32 trust +host replication pivotal 172.17.0.1/32 trust +host replication pivotal 10.16.119.186/32 trust +host replication pivotal 2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host replication pivotal 2601:647:4800:c740::b8fa/128 trust +host replication pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host replication pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust +host replication pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host replication pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host replication pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host replication pivotal 2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host replication pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust' +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got cmd: $GPHOME/sbin/seg_update_pg_hba.py --data-dir /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 --entries ' +host replication pivotal samehost trust +host all pivotal 10.0.0.234/32 trust +host all pivotal 172.17.0.1/32 trust +host all pivotal 10.16.119.186/32 trust +host all pivotal 2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host all pivotal 2601:647:4800:c740::b8fa/128 trust +host all pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host all pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust 
+host all pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host all pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host all pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host all pivotal 2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host all pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust +host replication pivotal 10.0.0.234/32 trust +host replication pivotal 172.17.0.1/32 trust +host replication pivotal 10.16.119.186/32 trust +host replication pivotal 2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host replication pivotal 2601:647:4800:c740::b8fa/128 trust +host replication pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host replication pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust +host replication pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host replication pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host replication pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host replication pivotal 2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host replication pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust' +20230428:13:32:54:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: $GPHOME/sbin/seg_update_pg_hba.py --data-dir /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 --entries ' +host replication pivotal samehost trust +host all pivotal 10.0.0.234/32 trust +host all pivotal 172.17.0.1/32 trust +host all pivotal 10.16.119.186/32 trust +host all pivotal 2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host all pivotal 2601:647:4800:c740::b8fa/128 trust +host all pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host all pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust +host all pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host all pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host all pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host all pivotal 
2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host all pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust +host replication pivotal 10.0.0.234/32 trust +host replication pivotal 172.17.0.1/32 trust +host replication pivotal 10.16.119.186/32 trust +host replication pivotal 2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host replication pivotal 2601:647:4800:c740::b8fa/128 trust +host replication pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host replication pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust +host replication pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host replication pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host replication pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host replication pivotal 2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host replication pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust' +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] finished cmd: Update pg_hba.conf cmdStr='ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 station6 ". 
/usr/local/gpdb/greenplum_path.sh; $GPHOME/sbin/seg_update_pg_hba.py --data-dir /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 --entries ' +host replication pivotal samehost trust +host all pivotal 10.0.0.234/32 trust +host all pivotal 172.17.0.1/32 trust +host all pivotal 10.16.119.186/32 trust +host all pivotal 2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host all pivotal 2601:647:4800:c740::b8fa/128 trust +host all pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host all pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust +host all pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host all pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host all pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host all pivotal 2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host all pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust +host replication pivotal 10.0.0.234/32 trust +host replication pivotal 172.17.0.1/32 trust +host replication pivotal 10.16.119.186/32 trust +host replication pivotal 2601:647:4800:c740:49b3:5dc9:f206:1c27/128 trust +host replication pivotal 2601:647:4800:c740::b8fa/128 trust +host replication pivotal 2601:647:4800:c740:21bb:8d13:b3c2:16d6/128 trust +host replication pivotal 2601:647:4800:c740:f9d1:a4e5:bdbb:67cb/128 trust +host replication pivotal 2601:647:4800:c740:dc25:37de:75f8:efdf/128 trust +host replication pivotal 2601:647:4800:c740:3259:5e67:ae0d:643f/128 trust +host replication pivotal 2601:647:4800:c740:7cd8:f44e:3889:509e/128 trust +host replication pivotal 2601:647:4800:c740:40d4:7879:8a:dc04/128 trust +host replication pivotal 2601:647:4800:c740:c21a:83b2:8a4a:33bd/128 trust'"' had result: cmd had rc=0 completed=True halted=False + stdout='' + stderr='' +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool haltWork() +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] haltWork +20230428:13:32:55:1967876 
gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got a halt cmd +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[INFO]:-Successfully modified pg_hba.conf on primary segments to allow replication connections +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[INFO]:-1 segment(s) to recover +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[INFO]:-Ensuring 1 failed segment(s) are stopped +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: python -c 'import os; print(os.path.realpath("/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1"))' +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: head -1 /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/postmaster.pid +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Skipping to stop segment /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 on host station6 since pid could not be found +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[INFO]:-Setting up the required segments for recovery +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: $GPHOME/sbin/gpsegsetuprecovery.py -c '[{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, "progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}]' -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got cmd: $GPHOME/sbin/gpsegsetuprecovery.py -c '[{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, 
"progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}]' -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: $GPHOME/sbin/gpsegsetuprecovery.py -c '[{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, "progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}]' -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] finished cmd: Run validation checks and setup data directories for recovery cmdStr='ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 station6 ". /usr/local/gpdb/greenplum_path.sh; $GPHOME/sbin/gpsegsetuprecovery.py -c '[{\"target_datadir\": \"/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1\", \"target_port\": 7006, \"target_segment_dbid\": 6, \"source_hostname\": \"station6\ [...] 
+ stdout='20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[INFO]:-Starting recovery with args: -c [{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, "progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}] -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-WorkerPool() initialized with 1 workers +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-[worker0] got cmd: +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-Do CHECKPOINT on station6 (port: 7003) before running pg_rewind. +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-Connecting to db template1 on host station6 +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-Running Command: rm -f /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/postmaster.pid +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-[worker0] finished cmd: Setup for pg_rewind cmdStr='' had result: cmd had rc=0 completed=True halted=False + stdout='' + stderr='' +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-WorkerPool haltWork() +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-[worker0] haltWork +20230428:13:32:55:1968258 gpsegsetuprecovery.py:station6:pivotal-[DEBUG]:-[worker0] got a halt cmd +' + stderr='' +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[INFO]:-Updating configuration for mirrors +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Validating configuration changes... 
+20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Connecting to db template1 on host localhost +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-mirror_to_remove: 0 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-primary_to_remove: 0 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-primary_to_add: 0 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-mirror_to_add: 0 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-initial_segment_to_update: 0 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-mirror_to_remove_and_add: 0 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_to_update: 0 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 8 segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 1|-1|p|p|n|u|station6|station6|7000|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 8|-1|m|m|s|u|station6|station6|7001|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/standby +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 2|0|p|p|s|u|station6|station6|7002|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast1/demoDataDir0 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 5|0|m|m|s|u|station6|station6|7005|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror1/demoDataDir0 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 3|1|p|p|n|u|station6|station6|7003|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast2/demoDataDir1 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 
6|1|m|m|n|d|station6|station6|7006|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 4|2|p|p|s|u|station6|station6|7004|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast3/demoDataDir2 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-segment_unchanged: 7|2|m|m|s|u|station6|station6|7007|/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror3/demoDataDir2 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Committing configuration table changes +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Generating configuration backout scripts +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[INFO]:-Initiating segment recovery. Upon completion, will start the successfully recovered segments +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/log/gp_era - read_gp_era +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-parse: # Greenplum Database era. +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-parse: # Do not change the contents of this file. 
+20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-parse: era = 84f970d55cff544c_230428131001 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-match era: 84f970d55cff544c_230428131001 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[INFO]:-era is 84f970d55cff544c_230428131001 +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: $GPHOME/sbin/gpsegrecovery.py -c '[{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, "progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}]' -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite [...] +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker1] got cmd: $GPHOME/sbin/gpsegrecovery.py -c '[{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, "progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}]' -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite --era= [...] +20230428:13:32:55:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: $GPHOME/sbin/gpsegrecovery.py -c '[{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, "progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}]' -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite --era=84 [...] 
+20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker1] finished cmd: Recover segments cmdStr='ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 station6 ". /usr/local/gpdb/greenplum_path.sh; $GPHOME/sbin/gpsegrecovery.py -c '[{\"target_datadir\": \"/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1\", \"target_port\": 7006, \"target_segment_dbid\": 6, \"source_hostname\": \"station6\", \"source_port\": 7003, \"is_full_recovery\": fa [...] + stdout='20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[INFO]:-Starting recovery with args: -c [{"target_datadir": "/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1", "target_port": 7006, "target_segment_dbid": 6, "source_hostname": "station6", "source_port": 7003, "is_full_recovery": false, "progress_file": "/home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"}] -l /home/pivotal/gpAdminLogs -v -b 64 --force-overwrite --era=84f970d55 [...] +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-WorkerPool() initialized with 1 workers +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-[worker0] got cmd: +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[INFO]:-Running pg_rewind with progress output temporarily in /home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-Running Command: [ -f /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/standby.signal ] || PGOPTIONS="-c gp_role=utility" $GPHOME/bin/pg_rewind --write-recovery-conf --slot="internal_wal_replication_slot" --source-server="host=station6 port=7003 dbname=template1 application_name=__gprecoverseg_pg_rewind__" --target-pgdata=/home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/dem [...] 
+20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[INFO]:-Successfully ran pg_rewind for dbid: 6 +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[INFO]:-Updating /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/postgresql.conf +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-Running Command: perl -p -i.bak -e 's/^port[ ]*=[ ]*\d+/port=7006/' /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/postgresql.conf +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[INFO]:-SegmentStart pg_ctl cmd is env GPSESSID=0000000000 GPERA=84f970d55cff544c_230428131001 $GPHOME/bin/pg_ctl -D /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 -l /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/log/startup.log -w -t 600 -o " -c gp_role=utility " start +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[INFO]:-Starting new segment with dbid 6: cmdStr='env GPSESSID=0000000000 GPERA=84f970d55cff544c_230428131001 $GPHOME/bin/pg_ctl -D /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 -l /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/log/startup.log -w -t 600 -o " -c gp_role=utility " start 2>&1' +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-Running Command: env GPSESSID=0000000000 GPERA=84f970d55cff544c_230428131001 $GPHOME/bin/pg_ctl -D /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1 -l /home/pivotal/workspace/gpdb/gpAux/gpdemo/datadirs/dbfast_mirror2/demoDataDir1/log/startup.log -w -t 600 -o " -c gp_role=utility " start 2>&1 +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-[worker0] finished cmd: Run pg_rewind cmdStr='' had result: cmd had rc=0 completed=True halted=False + stdout='' + stderr='' +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-WorkerPool 
haltWork() +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-[worker0] haltWork +20230428:13:32:56:1968306 gpsegrecovery.py:station6:pivotal-[DEBUG]:-[worker0] got a halt cmd +' + stderr='' +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: set -o pipefail; touch -a /home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out; tail -1 /home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out | tr '\r' '\n' | tail -1 +station6 (dbid 6): skipping pg_rewind on mirror as standby.signal is present +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Adding cmd to work_queue: rm -f /home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker2] got cmd: rm -f /home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Running Command: rm -f /home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker2] finished cmd: remove file cmdStr='ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 station6 ". /usr/local/gpdb/greenplum_path.sh; rm -f /home/pivotal/gpAdminLogs/pg_rewind.20230428_133255.dbid6.out"' had result: cmd had rc=0 completed=True halted=False + stdout='' + stderr='' +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[INFO]:-Triggering FTS probe +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-Connecting to db template1 on host localhost +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[INFO]:-******************************** +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[INFO]:-Segments successfully recovered. 
+20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[INFO]:-******************************** +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[INFO]:-Recovered mirror segments need to sync WAL with primary segments. +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[INFO]:-Use 'gpstate -e' to check progress of WAL sync remaining bytes +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-WorkerPool haltWork() +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker1] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker2] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker3] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker4] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker5] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker6] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker7] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker6] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker3] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker8] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker10] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker8] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker5] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker7] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker9] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker4] got a halt cmd +20230428:13:32:56:1967876 
gprecoverseg:station6:pivotal-[DEBUG]:-[worker9] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker11] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker10] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker12] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker11] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker13] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker12] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker14] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker13] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker15] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker14] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker0] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker15] haltWork +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker1] got a halt cmd +20230428:13:32:56:1967876 gprecoverseg:station6:pivotal-[DEBUG]:-[worker2] got a halt cmd + +-- end_ignore +(exited with code 0) +SELECT wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) diff --git a/src/test/isolation2/expected/setup.out b/src/test/isolation2/expected/setup.out index 10f69cd1eb..7834efcea9 100644 --- a/src/test/isolation2/expected/setup.out +++ b/src/test/isolation2/expected/setup.out @@ -139,3 +139,7 @@ CREATE -- Helper function that ensures stats collector receives stat from the latest operation. 
create or replace function wait_until_dead_tup_change_to(relid oid, stat_val_expected bigint) returns text as $$ declare stat_val int; /* in func */ i int; /* in func */ begin i := 0; /* in func */ while i < 1200 loop select pg_stat_get_dead_tuples(relid) into stat_val; /* in func */ if stat_val = stat_val_expected then /* in func */ return 'OK'; /* in func */ end if; /* in func */ perform pg_sleep(0.1); /* in func */ perform pg_stat_clear_snapshot(); /* in func */ i := i + 1; /* in func [...] CREATE + +-- Helper function that ensures mirror of the specified contentid is down. +create or replace function wait_for_mirror_down(contentid smallint, timeout_sec integer) returns bool as $$ declare i int; /* in func */ begin /* in func */ i := 0; /* in func */ loop /* in func */ perform gp_request_fts_probe_scan(); /* in func */ if (select count(1) from gp_segment_configuration where role='m' and content=$1 and status='d') = 1 then /* in func */ return true; /* in func */ end if; /* in func */ if i >= 2 * $2 then /* in func */ return false; /* in func */ end if; /* in [...] +CREATE diff --git a/src/test/isolation2/sql/segwalrep/select_throttle.sql b/src/test/isolation2/sql/segwalrep/select_throttle.sql index 6c613bb241..5094e2b1ed 100644 --- a/src/test/isolation2/sql/segwalrep/select_throttle.sql +++ b/src/test/isolation2/sql/segwalrep/select_throttle.sql @@ -1,19 +1,15 @@ -- Test: SELECT or other read-only operations which set hint bits on pages, -- and in turn generate Full Page Images (FPI) WAL records should be throttled. -- We will only throttle when our transaction wal exceeds --- wait_for_replication_threshold --- For this test we will: +-- wait_for_replication_threshold. While the backend is throttled waiting for +-- synchronous replication, it should not block the CHECKPOINT process, and if +-- the mirror is taken down, synchronous replication should be turned off and +-- the backend should stop waiting for synchronous replication and proceed. + +-- Setup: -- 1. 
Set wait_for_replication_threshold to 1kB for quicker test -- 2. create two tables (one small and one large) -- 3. set gp_disable_tuple_hints=off so buffer will be immediately marked dirty on hint bit change --- 4. Suspend walsender --- 5. Perform a read-only operation (SELECT) which would now set the hint bits --- For the small table this operation should finish, --- but for large table the SELECT should be throttled --- since it would generate a lot of WAL greater than wait_for_replication_threshold --- 6. Confirm that the query is waiting on Syncrep --- 7. Reset the walsender and the transaction should complete --- -- set wait_for_replication_threshold to 1kB for quicker test ALTER SYSTEM SET wait_for_replication_threshold = 1; @@ -28,6 +24,15 @@ INSERT INTO select_throttle SELECT generate_series (1, 100000); -- (so that we don't have to wait for the tuple to age. See logic in markDirty) 1U: SET gp_disable_tuple_hints=off; +-- Test 1: +-- 1. Suspend walsender +-- 2. Perform a read-only operation (SELECT) which would now set the hint bits +-- For the small table this operation should finish, +-- but for large table the SELECT should be throttled +-- since it would generate a lot of WAL greater than wait_for_replication_threshold +-- 3. Confirm that the query is waiting on Syncrep +-- 4. Reset the walsender and the transaction should complete + -- flush the data to disk checkpoint; @@ -51,8 +56,69 @@ SELECT gp_inject_fault_infinite('wal_sender_loop', 'reset', dbid) FROM gp_segmen 1U<: SELECT wait_until_all_segments_synchronized(); + +-- Test 2: +-- 1. Suspend walsender +-- 2. Perform a read-only operation (SELECT) which would now set the hint bits +-- For the large table the SELECT should be throttled +-- since it would generate a lot of WAL greater than wait_for_replication_threshold +-- 3. Confirm that the query is waiting on Syncrep +-- 4. Perform CHECKPOINT and confirm that it does not block +-- 5. Stop the mirror +-- 6. 
Reset the walsender and the transaction should complete without waiting for syncrep + +-- Setup: +-- set mirror down grace period to zero to instantly mark mirror down. +-- the 1Uq and 1U pair will force a wait on the config reload. +!\retcode gpconfig -c gp_fts_mark_mirror_down_grace_period -v 2; +!\retcode gpstop -u; +1Uq: +1U: show gp_fts_mark_mirror_down_grace_period; +-- Enable tuple hints so that buffer will be marked dirty upon a hint bit change +-- (so that we don't have to wait for the tuple to age. See logic in markDirty) +1U: SET gp_disable_tuple_hints=off; +Truncate select_throttle; +INSERT INTO select_throttle SELECT generate_series (1, 100000); +-- flush the data to disk +checkpoint; +-- Suspend walsender +SELECT gp_inject_fault_infinite('wal_sender_loop', 'suspend', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; + +checkpoint; +-- SELECT will set the hint bit on (the buffer will be marked dirty) +-- This query should wait for Syncrep since its WAL size for hint bits is greater than wait_for_replication_threshold +1U&: SELECT count(*) FROM select_throttle; + +-- check if the above query is waiting on SyncRep in pg_stat_activity +SELECT is_query_waiting_for_syncrep(50, 'SELECT count(*) FROM select_throttle;'); + +-- while SELECT is waiting for syncrep, it should not block a subsequent checkpoint +CHECKPOINT; + +-- stop the mirror should turn off syncrep +SELECT pg_ctl(datadir, 'stop', 'immediate') FROM gp_segment_configuration WHERE content=1 AND role = 'm'; + +-- reset walsender and let it exit so that mirror stop can be detected +SELECT gp_inject_fault_infinite('wal_sender_loop', 'reset', dbid) FROM gp_segment_configuration WHERE role = 'p' and content = 1; + +-- perform fts probe scan and verify that mirror is down +select wait_for_mirror_down(1::smallint, 30); +select content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + +-- after mirror is stopped, the SELECT query should proceed 
without waiting for syncrep +1U<: + +-- Cleanup 1U: RESET gp_disable_tuple_hints; +-- reset the mirror down grace period back to its default value. +-- the 1Uq and 1U pair will force a wait on the config reload. +!\retcode gpconfig -r gp_fts_mark_mirror_down_grace_period; +!\retcode gpstop -u; 1Uq: +1U: show gp_fts_mark_mirror_down_grace_period; ALTER SYSTEM RESET wait_for_replication_threshold; SELECT pg_reload_conf(); + +!\retcode gprecoverseg -av; +SELECT wait_until_all_segments_synchronized(); diff --git a/src/test/isolation2/sql/setup.sql b/src/test/isolation2/sql/setup.sql index af10262d12..9d6c0cc245 100644 --- a/src/test/isolation2/sql/setup.sql +++ b/src/test/isolation2/sql/setup.sql @@ -419,3 +419,23 @@ begin return 'Fail'; /* in func */ end; /* in func */ $$ language plpgsql; + +-- Helper function that ensures mirror of the specified contentid is down. +create or replace function wait_for_mirror_down(contentid smallint, timeout_sec integer) returns bool as +$$ +declare i int; /* in func */ +begin /* in func */ + i := 0; /* in func */ + loop /* in func */ + perform gp_request_fts_probe_scan(); /* in func */ + if (select count(1) from gp_segment_configuration where role='m' and content=$1 and status='d') = 1 then /* in func */ + return true; /* in func */ + end if; /* in func */ + if i >= 2 * $2 then /* in func */ + return false; /* in func */ + end if; /* in func */ + perform pg_sleep(0.5); /* in func */ + i = i + 1; /* in func */ + end loop; /* in func */ +end; /* in func */ +$$ language plpgsql; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
