On Fri, Oct 23, 2020 at 08:00:08AM +0900, Michael Paquier wrote:
> Yeah, we could try to make the logic a bit more complicated like
> that.  However, for any code path relying on a page read without any
> locking insurance, we cannot really have a lot of trust in any of the
> fields assigned to the page as this could just be random corruption
> garbage, and the only thing I am ready to trust here is a checksum
> mismatch check, because that's the only field on the page that's
> linked to its full contents on the 8k page.  This also keeps the code
> simpler.
A small update here.  I have extracted the refactored part for
PageIsVerified() and committed it, as that's independently useful.
This makes the patch proposed here simpler on HEAD, leading to the
attached.
--
Michael
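For readers skimming the diff, here is a minimal standalone sketch of
the retry scheme the patch implements.  The helpers page_is_verified()
and reread_block() are hypothetical stand-ins for
PageIsVerifiedExtended() and basebackup_read_file(); the real code
additionally interleaves this with tar-block bookkeeping and batched
pgstat reporting:

    /*
     * Minimal sketch, not the patch itself: page_is_verified() and
     * reread_block() are hypothetical stand-ins for
     * PageIsVerifiedExtended() and basebackup_read_file().
     */
    #include <stdbool.h>
    #include <unistd.h>

    #define PAGE_RETRY_THRESHOLD 5

    extern bool page_is_verified(char *page, unsigned blkno);
    extern void reread_block(char *page, unsigned blkno);

    /*
     * Return true once the page verifies, false if it still fails
     * after PAGE_RETRY_THRESHOLD rereads and should be counted as a
     * checksum failure.
     */
    static bool
    verify_page_with_retries(char *page, unsigned blkno)
    {
        int     attempts = 0;

        while (!page_is_verified(page, blkno))
        {
            if (attempts++ >= PAGE_RETRY_THRESHOLD)
                return false;   /* persistent failure, report it */

            /*
             * The failure may be a false positive caused by a
             * concurrent partial write of the 8k page.  Give that
             * write 100ms to complete, then reread the block and
             * check it again.  (The real code also treats hitting
             * EOF on the reread as a concurrent truncation rather
             * than corruption; usleep() stands in for pg_usleep().)
             */
            usleep(100000);
            reread_block(page, blkno);
        }
        return true;
    }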
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index b89df01fa7..37b60437a6 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1569,21 +1569,24 @@ sendFile(const char *readfilename, const char *tarfilename,
{
int fd;
BlockNumber blkno = 0;
- bool block_retry = false;
+ int block_attempts = 0;
char buf[TAR_SEND_SIZE];
- uint16 checksum;
int checksum_failures = 0;
off_t cnt;
int i;
pgoff_t len = 0;
char *page;
size_t pad;
- PageHeader phdr;
int segmentno = 0;
char *segmentpath;
bool verify_checksum = false;
pg_checksum_context checksum_ctx;
+ /* Maximum number of checksum failures reported for this file */
+#define CHECKSUM_REPORT_THRESHOLD 5
+ /* Maximum number of retries done for a single page */
+#define PAGE_RETRY_THRESHOLD 5
+
pg_checksum_init(&checksum_ctx, manifest->checksum_type);
fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY);
@@ -1661,9 +1664,7 @@ sendFile(const char *readfilename, const char *tarfilename,
if (verify_checksum && (cnt % BLCKSZ != 0))
{
ereport(WARNING,
- (errmsg("could not verify checksum in file \"%s\", block "
- "%d: read buffer size %d and page size %d "
- "differ",
+ (errmsg("could not verify checksum in file \"%s\", block %u: read buffer size %d and page size %d differ",
readfilename, blkno, (int) cnt, BLCKSZ)));
verify_checksum = false;
}
@@ -1675,78 +1676,75 @@ sendFile(const char *readfilename, const char *tarfilename,
page = buf + BLCKSZ * i;
/*
- * Only check pages which have not been modified since the
- * start of the base backup. Otherwise, they might have been
- * written only halfway and the checksum would not be valid.
- * However, replaying WAL would reinstate the correct page in
- * this case. We also skip completely new pages, since they
- * don't have a checksum yet.
+ * Check on-disk pages with the same set of verification
+ * conditions used before loading pages into shared buffers.
+ * Note that PageIsVerifiedExtended() considers pages filled
+ * only with zeros as valid, since they don't have a checksum
+ * yet. Failures are not reported to pgstat yet, as these are
+ * reported in a single batch once a file is completed. It
+ * is possible that a page is written out halfway while this
+ * check runs, causing a false positive. If that happens, the
+ * page is reread and rechecked several times, and a failure
+ * is reported only if the last attempt also fails.
*/
- if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+ if (PageIsVerifiedExtended(page, blkno + segmentno * RELSEG_SIZE, 0))
{
- checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
- phdr = (PageHeader) page;
- if (phdr->pd_checksum != checksum)
+ /* success, move on to the next block */
+ blkno++;
+ block_attempts = 0;
+ continue;
+ }
+
+ /* Verification of the page failed, so retry it */
+ if (block_attempts < PAGE_RETRY_THRESHOLD)
+ {
+ int reread_cnt;
+
+ /* Reread the failed block */
+ reread_cnt =
+ basebackup_read_file(fd, buf + BLCKSZ * i,
+ BLCKSZ, len + BLCKSZ * i,
+ readfilename,
+ false);
+ if (reread_cnt == 0)
{
/*
- * Retry the block on the first failure. It's
- * possible that we read the first 4K page of the
- * block just before postgres updated the entire block
- * so it ends up looking torn to us. We only need to
- * retry once because the LSN should be updated to
- * something we can ignore on the next pass. If the
- * error happens again then it is a true validation
- * failure.
+ * If we hit end-of-file, a concurrent truncation must
+ * have occurred, so break out of this loop just as if
+ * the initial fread() returned 0. We'll drop through
+ * to the same code that handles that case. (We must
+ * fix up cnt first, though.)
*/
- if (block_retry == false)
- {
- int reread_cnt;
-
- /* Reread the failed block */
- reread_cnt =
- basebackup_read_file(fd, buf + BLCKSZ * i,
- BLCKSZ, len + BLCKSZ * i,
- readfilename,
- false);
- if (reread_cnt == 0)
- {
- /*
- * If we hit end-of-file, a concurrent
- * truncation must have occurred, so break out
- * of this loop just as if the initial fread()
- * returned 0. We'll drop through to the same
- * code that handles that case. (We must fix
- * up cnt first, though.)
- */
- cnt = BLCKSZ * i;
- break;
- }
-
- /* Set flag so we know a retry was attempted */
- block_retry = true;
-
- /* Reset loop to validate the block again */
- i--;
- continue;
- }
-
- checksum_failures++;
-
- if (checksum_failures <= 5)
- ereport(WARNING,
- (errmsg("checksum verification failed in "
- "file \"%s\", block %d: calculated "
- "%X but expected %X",
- readfilename, blkno, checksum,
- phdr->pd_checksum)));
- if (checksum_failures == 5)
- ereport(WARNING,
- (errmsg("further checksum verification "
- "failures in file \"%s\" will not "
- "be reported", readfilename)));
+ cnt = BLCKSZ * i;
+ break;
}
+
+ /* Increment the counter so we know a retry was attempted */
+ block_attempts++;
+
+ /*
+ * Wait for 100 ms to give any concurrent page write that may
+ * have caused this failure some time to complete.
+ */
+ pg_usleep(100000L);
+
+ /* Reset loop to validate the block again */
+ i--;
+ continue;
}
- block_retry = false;
+
+ checksum_failures++;
+ if (checksum_failures <= CHECKSUM_REPORT_THRESHOLD)
+ ereport(WARNING,
+ (errmsg("checksum verification failed in block %u of file \"%s\"",
+ blkno, readfilename)));
+ if (checksum_failures == CHECKSUM_REPORT_THRESHOLD)
+ ereport(WARNING,
+ (errmsg("further checksum verification failures in file \"%s\" will not be reported",
+ readfilename)));
+
+ /* move to next block */
+ block_attempts = 0;
blkno++;
}
}
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index f674a7c94e..a46a9ece42 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
use File::Path qw(rmtree);
use PostgresNode;
use TestLib;
-use Test::More tests => 109;
+use Test::More tests => 112;
program_help_ok('pg_basebackup');
program_version_ok('pg_basebackup');
@@ -528,6 +528,24 @@ $node->command_checks_all(
'pg_basebackup reports checksum mismatch');
rmtree("$tempdir/backup_corrupt");
+# Create a new type of corruption and zero out one page header
+# completely.
+system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
+open $file, '+<', "$pgdata/$file_corrupt1";
+seek($file, 0, 0);
+my $zero_data = "\0" x $pageheader_size;
+syswrite($file, $zero_data);
+close $file;
+system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
+
+$node->command_checks_all(
+ [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1a" ],
+ 1,
+ [qr{^$}],
+ [qr/^WARNING.*checksum verification failed/s],
+ "pg_basebackup reports checksum mismatch for zeroed page headers");
+rmtree("$tempdir/backup_corrupt1a");
+
# induce further corruption in 5 more blocks
system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
open $file, '+<', "$pgdata/$file_corrupt1";
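As a side note on what the new zeroed-header test exercises: the old
sendFile() logic skipped any page for which PageIsNew() returned true,
and PageIsNew() only inspects pd_upper, so a page whose header was
zeroed while its body still held data was silently passed over.
PageIsVerifiedExtended() accepts such a "new" page only if it is zero
in its entirety.  A rough illustration of the two predicates (the
header layout is simplified, not the backend's actual PageHeaderData):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define BLCKSZ 8192

    /* Illustrative, simplified stand-in for PageHeaderData. */
    typedef struct
    {
        uint64_t    pd_lsn;
        uint16_t    pd_checksum;
        uint16_t    pd_flags;
        uint16_t    pd_lower;
        uint16_t    pd_upper;   /* 0 on a never-initialized page */
        /* ... rest of the header and the page body follow ... */
    } SketchPageHeader;

    /* What PageIsNew() tests: a single header field. */
    static bool
    sketch_page_is_new(const char *page)
    {
        return ((const SketchPageHeader *) page)->pd_upper == 0;
    }

    /* What the all-zeroes acceptance test checks: every page byte. */
    static bool
    sketch_page_is_all_zeroes(const char *page)
    {
        static const char zeroes[BLCKSZ];

        return memcmp(page, zeroes, BLCKSZ) == 0;
    }

A page with a zeroed header but non-zero contents satisfies the first
predicate but not the second, so the test above expects a "checksum
verification failed" warning rather than a silent skip.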