On 07.10.2020 11:18, Michael Paquier wrote:
On Fri, Sep 25, 2020 at 08:53:27AM +0200, Michael Banck wrote:
Oh right, I've fixed up the commit message in the attached V4.
Not much a fan of what's proposed here, for a couple of reasons:
- If the page is not new, we should check if the header is sane or
not.
- It may be better to actually count checksum failures if these are
repeatable in a given block, and report them.
- The code would be more simple with a "continue" added for a block
detected as new or with a LSN newer than the backup start.
- The new error messages are confusing, and I think that these are
hard to translate in a meaningful way.
I am interested in moving this patch forward, so here is the updated v5.
This version uses PageIsVerified. All error messages are left unchanged.
So I think that we should try to use PageIsVerified() directly in the
code path of basebackup.c, and this requires a couple of changes to
make the routine more extensible:
- Addition of a dboid argument for pgstat_report_checksum_failure(),
whose call needs to be changed to use the *_in_db() flavor.
In the current patch, PageIsVerifed called from pgbasebackup simply
doesn't report failures to pgstat.
Because in basebackup.c we already report checksum failures separately.
Once per file.
pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
Should we change this too? I don't see any difference.
- Addition of an option to decide if a log should be generated or
not.
- Addition of an option to control if a checksum failure should be
reported to pgstat or not, even if this is actually linked to the
previous point, I have seen cases where being able to control both
separately would be helpful, particularly the log part.
For the last two ones, I would use an extensible argument based on
bits32 with a set of flags that the caller can set at will.
Done.
--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
commit 553d62dbcdec1a00139f3c5ab6a325ed857b6c9d
Author: anastasia <a.lubennik...@postgrespro.ru>
Date: Fri Oct 16 17:36:02 2020 +0300
Fix checksum verification in base backups for zero page headers
We currently do not checksum a page if it is considered new by PageIsNew().
However, this means that if the whole page header is zero, PageIsNew() will
consider that page new due to the pd_upper field being zero and no subsequent
checksum verification is done.
To fix, use PageIsVerified() in pg_basebackup code.
Reported-By: Andres Freund
Author: Michael Banck
Reviewed-By: Asif Rehman, Anastasia Lubennikova
Discussion: https://postgr.es/m/20190319200050.ncuxejradurja...@alap3.anarazel.de
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index dbbd3aa31f..b4e050c9b8 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -443,7 +443,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
smgrread(src, forkNum, blkno, buf.data);
- if (!PageIsVerified(page, blkno))
+ if (!PageIsVerified(page, blkno, PIV_LOG_WARNING | PIV_REPORT_STAT))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page in block %u of relation %s",
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index b89df01fa7..aa8d52c1aa 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1682,11 +1682,14 @@ sendFile(const char *readfilename, const char *tarfilename,
* this case. We also skip completely new pages, since they
* don't have a checksum yet.
*/
- if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+ if (PageGetLSN(page) < startptr)
{
- checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
- phdr = (PageHeader) page;
- if (phdr->pd_checksum != checksum)
+ int piv_flags = (checksum_failures < 5) ? (PIV_LOG_WARNING) : 0;
+ /*
+ * Do not report checksum failures to pgstats from
+ * PageIsVerified, since we will do it later.
+ */
+ if (!PageIsVerified(page, blkno, piv_flags))
{
/*
* Retry the block on the first failure. It's
@@ -1732,13 +1735,6 @@ sendFile(const char *readfilename, const char *tarfilename,
checksum_failures++;
- if (checksum_failures <= 5)
- ereport(WARNING,
- (errmsg("checksum verification failed in "
- "file \"%s\", block %d: calculated "
- "%X but expected %X",
- readfilename, blkno, checksum,
- phdr->pd_checksum)));
if (checksum_failures == 5)
ereport(WARNING,
(errmsg("further checksum verification "
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e549fa1d30..971c96c7de 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -917,7 +917,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
/* check for garbage data */
- if (!PageIsVerified((Page) bufBlock, blockNum))
+ if (!PageIsVerified((Page) bufBlock, blockNum, PIV_LOG_WARNING | PIV_REPORT_STAT))
{
if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
{
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 4bc2bf955d..d52e8710ea 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -79,7 +79,7 @@ PageInit(Page page, Size pageSize, Size specialSize)
* will clean up such a page and make it usable.
*/
bool
-PageIsVerified(Page page, BlockNumber blkno)
+PageIsVerified(Page page, BlockNumber blkno, int flags)
{
PageHeader p = (PageHeader) page;
size_t *pagebytes;
@@ -140,12 +140,14 @@ PageIsVerified(Page page, BlockNumber blkno)
*/
if (checksum_failure)
{
- ereport(WARNING,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("page verification failed, calculated checksum %u but expected %u",
- checksum, p->pd_checksum)));
+ if((flags & PIV_LOG_WARNING) != 0)
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("page verification failed, calculated checksum %u but expected %u",
+ checksum, p->pd_checksum)));
- pgstat_report_checksum_failure();
+ if((flags & PIV_REPORT_STAT) != 0)
+ pgstat_report_checksum_failure();
if (header_sane && ignore_checksum_failure)
return true;
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index f674a7c94e..093ec204fd 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
use File::Path qw(rmtree);
use PostgresNode;
use TestLib;
-use Test::More tests => 109;
+use Test::More tests => 112;
program_help_ok('pg_basebackup');
program_version_ok('pg_basebackup');
@@ -524,10 +524,27 @@ $node->command_checks_all(
[ 'pg_basebackup', '-D', "$tempdir/backup_corrupt" ],
1,
[qr{^$}],
- [qr/^WARNING.*checksum verification failed/s],
+ [qr/^WARNING.*page verification failed/s],
'pg_basebackup reports checksum mismatch');
rmtree("$tempdir/backup_corrupt");
+# zero out the pageheader completely
+open $file, '+<', "$pgdata/$file_corrupt1";
+seek($file, $pageheader_size, 0);
+system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
+my $zero_data = "\0"x$pageheader_size;
+syswrite($file, $zero_data);
+close $file;
+system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
+
+$node->command_checks_all(
+ [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1a" ],
+ 1,
+ [qr{^$}],
+ [qr/^WARNING.*page verification failed/s],
+ "pg_basebackup reports checksum mismatch for zeroed pageheader");
+rmtree("$tempdir/backup_corrupt1a");
+
# induce further corruption in 5 more blocks
system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
open $file, '+<', "$pgdata/$file_corrupt1";
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 51b8f994ac..622e9aa4db 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -404,9 +404,14 @@ do { \
* extern declarations
* ----------------------------------------------------------------
*/
+/*flags for PageAddItemExtended */
#define PAI_OVERWRITE (1 << 0)
#define PAI_IS_HEAP (1 << 1)
+/* flags for PageIsVerified */
+#define PIV_LOG_WARNING (1 << 0)
+#define PIV_REPORT_STAT (1 << 1)
+
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \
PageAddItemExtended(page, item, size, offsetNumber, \
((overwrite) ? PAI_OVERWRITE : 0) | \
@@ -423,7 +428,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
-extern bool PageIsVerified(Page page, BlockNumber blkno);
+extern bool PageIsVerified(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
extern Page PageGetTempPage(Page page);