This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit b478dc20b18e9df80659434e57b69b89cbfef990
Author: Wang Weinan <[email protected]>
AuthorDate: Fri Jan 7 12:32:12 2022 +0800

    Fix checkpoint wal replay failed issue
    
    When the cluster is stopped in immediate mode, the instance does not
    have a chance to write a SHUTDOWN_CHECKPOINT WAL record. Hence, WAL
    replay is triggered in the startup process the next time the cluster
    starts. It collects the global transactions that have not finished
    and writes a SHUTDOWN_CHECKPOINT to guarantee cluster consistency.
    
    Previously, we assumed that RecordShort is always enough for our
    extended checkpoint WAL record, which carries global txn
    information. That is not always true: when bulk-importing data into
    an environment with at least ten import sessions working, RecordLong
    may be used.
    
    Every time a cluster instance starts, it validates the size of the
    last checkpoint WAL record and replays it. If the checkpoint WAL
    record used RecordLong, the size validation considers it an illegal
    xlog record, and the issue is triggered.
    
    Fix it.
---
 src/backend/access/transam/xlog.c                  |  24 +-
 .../isolation2/expected/checkpoint_dtx_info.out    | 270 +++++++++++++++++++++
 src/test/isolation2/sql/checkpoint_dtx_info.sql    | 134 ++++++++++
 3 files changed, 421 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/transam/xlog.c 
b/src/backend/access/transam/xlog.c
index 8900018dd4..20a8fbbe0b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8837,7 +8837,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, 
XLogRecPtr RecPtr,
        uint8           info;
        bool sizeOk;
        uint32 chkpt_len;
-       uint32 chkpt_tot_len;
+       uint32 chkpt_hdr_len_short;
+       uint32 chkpt_hdr_len_long;
+       bool length_match;
 
        if (!XRecOffIsValid(RecPtr))
        {
@@ -8919,16 +8921,24 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, 
XLogRecPtr RecPtr,
        /*
         * GPDB: Verify the Checkpoint record length. For an extended Checkpoint
         * record (when record total length is greater than regular checkpoint
-        * record total length), compare the difference between the regular
-        * checkpoint size and the extended variable size.
+        * record total length, e.g. in the case of containing DTX info), 
compare
+        * the difference between the regular checkpoint size and the extended
+        * variable size.
         */
        sizeOk = false;
        chkpt_len = XLogRecGetDataLen(xlogreader);
-       chkpt_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + 
sizeof(CheckPoint);
-       if ((chkpt_len == sizeof(CheckPoint) && record->xl_tot_len == 
chkpt_tot_len) ||
+       chkpt_hdr_len_short = SizeOfXLogRecord + 
SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint);
+       chkpt_hdr_len_long = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderLong 
+ sizeof(CheckPoint);
+
+       if (chkpt_len > 255) /* for XLR_BLOCK_ID_DATA_LONG */
+               length_match = ((chkpt_len - sizeof(CheckPoint)) == 
(record->xl_tot_len - chkpt_hdr_len_long));
+       else /* for XLR_BLOCK_ID_DATA_SHORT */
+               length_match = ((chkpt_len - sizeof(CheckPoint)) == 
(record->xl_tot_len - chkpt_hdr_len_short));
+
+       if ((chkpt_len == sizeof(CheckPoint) && record->xl_tot_len == 
chkpt_hdr_len_short) ||
                ((chkpt_len > sizeof(CheckPoint) &&
-                 record->xl_tot_len > chkpt_tot_len &&
-                 ((chkpt_len - sizeof(CheckPoint)) == (record->xl_tot_len - 
chkpt_tot_len)))))
+                 record->xl_tot_len > chkpt_hdr_len_short &&
+                 length_match)))
                sizeOk = true;
 
        if (!sizeOk)
diff --git a/src/test/isolation2/expected/checkpoint_dtx_info.out 
b/src/test/isolation2/expected/checkpoint_dtx_info.out
index baf785a827..e17e870663 100644
--- a/src/test/isolation2/expected/checkpoint_dtx_info.out
+++ b/src/test/isolation2/expected/checkpoint_dtx_info.out
@@ -93,3 +93,273 @@ server closed the connection unexpectedly
 -------
  0     
 (1 row)
+
+-- Validate CHECKPOINT XLOG record length, verifying issue
+-- https://github.com/greenplum-db/gpdb/issues/12977.
+-- The extended CHECKPOINT WAL record contains global transaction
+-- information, it could exceed the previous expected length in
+-- SizeOfXLogRecordDataHeaderShort, result in crash recovery
+-- failure on coordinator. The solution is adding the expected length
+-- in SizeOfXLogRecordDataHeaderLong also, to fixup the missing condition.
+create table ckpt_xlog_len_tbl(a int, b int);
+CREATE
+
+-- Need to start at least 18 concurrent sessions to create a long header
+-- CHECKPOINT WAL record, which size is not less than 256.
+2q: ... <quitting>
+33q: ... <quitting>
+
+10: select gp_inject_fault_infinite('start_insertedDistributedCommitted', 
'suspend', 1);
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+
+10: begin;
+BEGIN
+10: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+10&: commit;  <waiting ...>
+
+11: begin;
+BEGIN
+11: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+11&: commit;  <waiting ...>
+
+12: begin;
+BEGIN
+12: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+12&: commit;  <waiting ...>
+
+13: begin;
+BEGIN
+13: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+13&: commit;  <waiting ...>
+
+14: begin;
+BEGIN
+14: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+14&: commit;  <waiting ...>
+
+15: begin;
+BEGIN
+15: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+15&: commit;  <waiting ...>
+
+16: begin;
+BEGIN
+16: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+16&: commit;  <waiting ...>
+
+17: begin;
+BEGIN
+17: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+17&: commit;  <waiting ...>
+
+18: begin;
+BEGIN
+18: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+18&: commit;  <waiting ...>
+
+19: begin;
+BEGIN
+19: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+19&: commit;  <waiting ...>
+
+20: begin;
+BEGIN
+20: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+20&: commit;  <waiting ...>
+
+21: begin;
+BEGIN
+21: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+21&: commit;  <waiting ...>
+
+22: begin;
+BEGIN
+22: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+22&: commit;  <waiting ...>
+
+23: begin;
+BEGIN
+23: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+23&: commit;  <waiting ...>
+
+24: begin;
+BEGIN
+24: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+24&: commit;  <waiting ...>
+
+25: begin;
+BEGIN
+25: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+25&: commit;  <waiting ...>
+
+26: begin;
+BEGIN
+26: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+26&: commit;  <waiting ...>
+
+27: begin;
+BEGIN
+27: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+27&: commit;  <waiting ...>
+
+28: begin;
+BEGIN
+28: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+INSERT 10
+28&: commit;  <waiting ...>
+
+-- wait to make sure the commit is taking place and blocked at 
start_insertedDistributedCommitted
+2: select gp_wait_until_triggered_fault('start_insertedDistributedCommitted', 
1, 1);
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+2: select gp_inject_fault_infinite('before_wait_VirtualXIDsDelayingChkpt', 
'skip', 1);
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+33&: checkpoint;  <waiting ...>
+2: select gp_inject_fault_infinite('keep_log_seg', 'panic', 1);
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+-- wait to make sure we don't resume commit processing before this
+-- step in checkpoint
+2: select 
gp_wait_until_triggered_fault('before_wait_VirtualXIDsDelayingChkpt', 1, 1);
+ gp_wait_until_triggered_fault 
+-------------------------------
+ Success:                      
+(1 row)
+-- reason for this inifinite wait is just to avoid test flake. Without
+-- this joining step "1<" may see "COMMIT" sometimes or "server closed
+-- the connection unexpectedly" otherwise. With this its always
+-- "server closed the connection unexpectedly".
+2: select gp_inject_fault_infinite('after_xlog_xact_distributed_commit', 
'infinite_loop', 1);
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+2: select gp_inject_fault_infinite('start_insertedDistributedCommitted', 
'resume', 1);
+ gp_inject_fault_infinite 
+--------------------------
+ Success:                 
+(1 row)
+
+10<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+33<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+11<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+12<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+13<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+14<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+15<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+16<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+17<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+18<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+19<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+20<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+21<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+22<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+23<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+24<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+25<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+26<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+27<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+28<:  <... completed>
+server closed the connection unexpectedly
+       This probably means the server terminated abnormally
+       before or while processing the request.
+
+3q: ... <quitting>
+3: select 1;
+ ?column? 
+----------
+ 1        
+(1 row)
+3: select count(*) from ckpt_xlog_len_tbl;
+ count 
+-------
+ 190   
+(1 row)
+
+3: drop table ckpt_xlog_len_tbl;
+DROP
diff --git a/src/test/isolation2/sql/checkpoint_dtx_info.sql 
b/src/test/isolation2/sql/checkpoint_dtx_info.sql
index 029ee5a48f..85a7d9c79a 100644
--- a/src/test/isolation2/sql/checkpoint_dtx_info.sql
+++ b/src/test/isolation2/sql/checkpoint_dtx_info.sql
@@ -49,3 +49,137 @@
 -- wait until coordinator is up for querying.
 3: select 1;
 3: select count(1) from twopcbug;
+
+-- Validate CHECKPOINT XLOG record length, verifying issue
+-- https://github.com/greenplum-db/gpdb/issues/12977.
+-- The extended CHECKPOINT WAL record contains global transaction
+-- information, it could exceed the previous expected length in
+-- SizeOfXLogRecordDataHeaderShort, result in crash recovery
+-- failure on coordinator. The solution is adding the expected length
+-- in SizeOfXLogRecordDataHeaderLong also, to fixup the missing condition.
+create table ckpt_xlog_len_tbl(a int, b int);
+
+-- Need to start at least 18 concurrent sessions to create a long header
+-- CHECKPOINT WAL record, which size is not less than 256.
+2q:
+33q:
+
+10: select gp_inject_fault_infinite('start_insertedDistributedCommitted', 
'suspend', 1);
+
+10: begin;
+10: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+10&: commit;
+
+11: begin;
+11: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+11&: commit;
+
+12: begin;
+12: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+12&: commit;
+
+13: begin;
+13: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+13&: commit;
+
+14: begin;
+14: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+14&: commit;
+
+15: begin;
+15: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+15&: commit;
+
+16: begin;
+16: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+16&: commit;
+
+17: begin;
+17: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+17&: commit;
+
+18: begin;
+18: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+18&: commit;
+
+19: begin;
+19: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+19&: commit;
+
+20: begin;
+20: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+20&: commit;
+
+21: begin;
+21: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+21&: commit;
+
+22: begin;
+22: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+22&: commit;
+
+23: begin;
+23: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+23&: commit;
+
+24: begin;
+24: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+24&: commit;
+
+25: begin;
+25: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+25&: commit;
+
+26: begin;
+26: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+26&: commit;
+
+27: begin;
+27: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+27&: commit;
+
+28: begin;
+28: insert into ckpt_xlog_len_tbl select i,i from generate_series(1,10)i;
+28&: commit;
+
+-- wait to make sure the commit is taking place and blocked at 
start_insertedDistributedCommitted
+2: select gp_wait_until_triggered_fault('start_insertedDistributedCommitted', 
1, 1);
+2: select gp_inject_fault_infinite('before_wait_VirtualXIDsDelayingChkpt', 
'skip', 1);
+33&: checkpoint;
+2: select gp_inject_fault_infinite('keep_log_seg', 'panic', 1);
+-- wait to make sure we don't resume commit processing before this
+-- step in checkpoint
+2: select 
gp_wait_until_triggered_fault('before_wait_VirtualXIDsDelayingChkpt', 1, 1);
+-- reason for this inifinite wait is just to avoid test flake. Without
+-- this joining step "1<" may see "COMMIT" sometimes or "server closed
+-- the connection unexpectedly" otherwise. With this its always
+-- "server closed the connection unexpectedly".
+2: select gp_inject_fault_infinite('after_xlog_xact_distributed_commit', 
'infinite_loop', 1);
+2: select gp_inject_fault_infinite('start_insertedDistributedCommitted', 
'resume', 1);
+
+10<:
+33<:
+11<:
+12<:
+13<:
+14<:
+15<:
+16<:
+17<:
+18<:
+19<:
+20<:
+21<:
+22<:
+23<:
+24<:
+25<:
+26<:
+27<:
+28<:
+
+3q:
+3: select 1;
+3: select count(*) from ckpt_xlog_len_tbl;
+
+3: drop table ckpt_xlog_len_tbl;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to