On 07.10.2020 11:18, Michael Paquier wrote:
On Fri, Sep 25, 2020 at 08:53:27AM +0200, Michael Banck wrote:
Oh right, I've fixed up the commit message in the attached V4.
Not much a fan of what's proposed here, for a couple of reasons:
- If the page is not new, we should check if the header is sane or
not.
- It may be better to actually count checksum failures if these are
repeatable in a given block, and report them.
- The code would be more simple with a "continue" added for a block
detected as new or with a LSN newer than the backup start.
- The new error messages are confusing, and I think that these are
hard to translate in a meaningful way.
I am interested in moving this patch forward, so here is the updated v5.
This version uses PageIsVerified. All error messages are left unchanged.
So I think that we should try to use PageIsVerified() directly in the
code path of basebackup.c, and this requires a couple of changes to
make the routine more extensible:
- Addition of a dboid argument for pgstat_report_checksum_failure(),
whose call needs to be changed to use the *_in_db() flavor.

In the current patch, PageIsVerifed called from pgbasebackup simply doesn't report failures to pgstat. Because in basebackup.c we already report checksum failures separately. Once per file.
        pgstat_report_checksum_failures_in_db(dboid, checksum_failures);

Should we change this too? I don't see any difference.

- Addition of an option to decide if a log should be generated or
not.
- Addition of an option to control if a checksum failure should be
reported to pgstat or not, even if this is actually linked to the
previous point, I have seen cases where being able to control both
separately would be helpful, particularly the log part.

For the last two ones, I would use an extensible argument based on
bits32 with a set of flags that the caller can set at will.
Done.

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

commit 553d62dbcdec1a00139f3c5ab6a325ed857b6c9d
Author: anastasia <a.lubennik...@postgrespro.ru>
Date:   Fri Oct 16 17:36:02 2020 +0300

    Fix checksum verification in base backups for zero page headers
    
    We currently do not checksum a page if it is considered new by PageIsNew().
    However, this means that if the whole page header is zero, PageIsNew() will
    consider that page new due to the pd_upper field being zero and no subsequent
    checksum verification is done.
    
    To fix, use PageIsVerified() in pg_basebackup code.
    
    Reported-By: Andres Freund
    Author: Michael Banck
    Reviewed-By: Asif Rehman, Anastasia Lubennikova
    Discussion: https://postgr.es/m/20190319200050.ncuxejradurja...@alap3.anarazel.de

diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index dbbd3aa31f..b4e050c9b8 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -443,7 +443,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 
 		smgrread(src, forkNum, blkno, buf.data);
 
-		if (!PageIsVerified(page, blkno))
+		if (!PageIsVerified(page, blkno, PIV_LOG_WARNING | PIV_REPORT_STAT))
 			ereport(ERROR,
 					(errcode(ERRCODE_DATA_CORRUPTED),
 					 errmsg("invalid page in block %u of relation %s",
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index b89df01fa7..aa8d52c1aa 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1682,11 +1682,14 @@ sendFile(const char *readfilename, const char *tarfilename,
 				 * this case. We also skip completely new pages, since they
 				 * don't have a checksum yet.
 				 */
-				if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+				if (PageGetLSN(page) < startptr)
 				{
-					checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
-					phdr = (PageHeader) page;
-					if (phdr->pd_checksum != checksum)
+					int piv_flags = (checksum_failures < 5) ? (PIV_LOG_WARNING) : 0;
+					/*
+					 * Do not report checksum failures to pgstats from
+					 *  PageIsVerified, since we will do it later.
+					 */
+					if (!PageIsVerified(page, blkno, piv_flags))
 					{
 						/*
 						 * Retry the block on the first failure.  It's
@@ -1732,13 +1735,6 @@ sendFile(const char *readfilename, const char *tarfilename,
 
 						checksum_failures++;
 
-						if (checksum_failures <= 5)
-							ereport(WARNING,
-									(errmsg("checksum verification failed in "
-											"file \"%s\", block %d: calculated "
-											"%X but expected %X",
-											readfilename, blkno, checksum,
-											phdr->pd_checksum)));
 						if (checksum_failures == 5)
 							ereport(WARNING,
 									(errmsg("further checksum verification "
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e549fa1d30..971c96c7de 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -917,7 +917,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			}
 
 			/* check for garbage data */
-			if (!PageIsVerified((Page) bufBlock, blockNum))
+			if (!PageIsVerified((Page) bufBlock, blockNum,  PIV_LOG_WARNING | PIV_REPORT_STAT))
 			{
 				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
 				{
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 4bc2bf955d..d52e8710ea 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -79,7 +79,7 @@ PageInit(Page page, Size pageSize, Size specialSize)
  * will clean up such a page and make it usable.
  */
 bool
-PageIsVerified(Page page, BlockNumber blkno)
+PageIsVerified(Page page, BlockNumber blkno, int flags)
 {
 	PageHeader	p = (PageHeader) page;
 	size_t	   *pagebytes;
@@ -140,12 +140,14 @@ PageIsVerified(Page page, BlockNumber blkno)
 	 */
 	if (checksum_failure)
 	{
-		ereport(WARNING,
-				(errcode(ERRCODE_DATA_CORRUPTED),
-				 errmsg("page verification failed, calculated checksum %u but expected %u",
-						checksum, p->pd_checksum)));
+		if((flags & PIV_LOG_WARNING) != 0)
+			ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("page verification failed, calculated checksum %u but expected %u",
+							checksum, p->pd_checksum)));
 
-		pgstat_report_checksum_failure();
+		if((flags & PIV_REPORT_STAT) != 0)
+			pgstat_report_checksum_failure();
 
 		if (header_sane && ignore_checksum_failure)
 			return true;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index f674a7c94e..093ec204fd 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 109;
+use Test::More tests => 112;
 
 program_help_ok('pg_basebackup');
 program_version_ok('pg_basebackup');
@@ -524,10 +524,27 @@ $node->command_checks_all(
 	[ 'pg_basebackup', '-D', "$tempdir/backup_corrupt" ],
 	1,
 	[qr{^$}],
-	[qr/^WARNING.*checksum verification failed/s],
+	[qr/^WARNING.*page verification failed/s],
 	'pg_basebackup reports checksum mismatch');
 rmtree("$tempdir/backup_corrupt");
 
+# zero out the pageheader completely
+open $file, '+<', "$pgdata/$file_corrupt1";
+seek($file, $pageheader_size, 0);
+system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
+my $zero_data = "\0"x$pageheader_size;
+syswrite($file, $zero_data);
+close $file;
+system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
+
+$node->command_checks_all(
+	[ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1a" ],
+	1,
+	[qr{^$}],
+	[qr/^WARNING.*page verification failed/s],
+	"pg_basebackup reports checksum mismatch for zeroed pageheader");
+rmtree("$tempdir/backup_corrupt1a");
+
 # induce further corruption in 5 more blocks
 system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
 open $file, '+<', "$pgdata/$file_corrupt1";
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 51b8f994ac..622e9aa4db 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -404,9 +404,14 @@ do { \
  *		extern declarations
  * ----------------------------------------------------------------
  */
+/*flags for PageAddItemExtended */
 #define PAI_OVERWRITE			(1 << 0)
 #define PAI_IS_HEAP				(1 << 1)
 
+/* flags for PageIsVerified */
+#define PIV_LOG_WARNING			(1 << 0)
+#define PIV_REPORT_STAT			(1 << 1)
+
 #define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \
 	PageAddItemExtended(page, item, size, offsetNumber, \
 						((overwrite) ? PAI_OVERWRITE : 0) | \
@@ -423,7 +428,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
 				 "BLCKSZ has to be a multiple of sizeof(size_t)");
 
 extern void PageInit(Page page, Size pageSize, Size specialSize);
-extern bool PageIsVerified(Page page, BlockNumber blkno);
+extern bool PageIsVerified(Page page, BlockNumber blkno, int flags);
 extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
 										OffsetNumber offsetNumber, int flags);
 extern Page PageGetTempPage(Page page);

Reply via email to