On Mon, Jul 09, 2018 at 02:03:09PM +0900, Michael Paquier wrote:
> I think that we really need to harden things, by making
> ReadTwoPhaseFile() fail hard is it finds something unexpected, which is
> in this case anything except trying to open a file which fails on
> ENOENT, and that this stuff should be back-patched.

Rebased as attached because of the conflicts from 811b6e3.
--
Michael
From 8a6e43eb3faaf707e77f0fb8d0838efb3692832e Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Wed, 18 Jul 2018 17:15:32 +0900
Subject: [PATCH] Fail hard when facing corrupted two-phase state files at
 recovery

When a corrupted file is found by WAL replay, be it for crash recovery
or archive recovery, then the file is simply skipped and a WARNING is
logged to the user.  Facing an on-disk WAL file which is corrupted is
as likely to happen as its pair recorded in dedicated WAL records, but
WAL records are already able to fail hard if there is a CRC mismatch at
record level.

The situation got better since 978b2f6 (9.6) which has added two-phase
state information directly in WAL instead of using on-disk files, so the
risk is limited to two-phase transactions which live across at least one
checkpoint.

Reported-by: Michael Paquier
Author: Michael Paquier
Discussion: https://postgr.es/m/XXX
---
 src/backend/access/transam/twophase.c | 136 +++++++++++---------------
 1 file changed, 56 insertions(+), 80 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 306861bb79..35fcd06b8f 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1206,10 +1206,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
  * Read and validate the state file for xid.
  *
  * If it looks OK (has a valid magic number and CRC), return the palloc'd
- * contents of the file.  Otherwise return NULL.
+ * contents of the file.  If missing_ok is true, do not issue an ERROR and
+ * return NULL.
  */
 static char *
-ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
+ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
 {
 	char		path[MAXPGPATH];
 	char	   *buf;
@@ -1226,11 +1227,12 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
 	if (fd < 0)
 	{
-		if (give_warnings)
-			ereport(WARNING,
-					(errcode_for_file_access(),
-					 errmsg("could not open file \"%s\": %m", path)));
-		return NULL;
+		if (missing_ok && errno == ENOENT)
+			return NULL;
+
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
 	}
 
 	/*
@@ -1240,35 +1242,25 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	 * even on a valid file.
 	 */
 	if (fstat(fd, &stat))
-	{
-		int			save_errno = errno;
-
-		CloseTransientFile(fd);
-		if (give_warnings)
-		{
-			errno = save_errno;
-			ereport(WARNING,
-					(errcode_for_file_access(),
-					 errmsg("could not stat file \"%s\": %m", path)));
-		}
-		return NULL;
-	}
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat file \"%s\": %m", path)));
 
 	if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
 						MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
 						sizeof(pg_crc32c)) ||
 		stat.st_size > MaxAllocSize)
-	{
-		CloseTransientFile(fd);
-		return NULL;
-	}
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("incorrect size of two-phase state file \"%s\": %zu bytes",
+						path, stat.st_size)));
 
 	crc_offset = stat.st_size - sizeof(pg_crc32c);
 	if (crc_offset != MAXALIGN(crc_offset))
-	{
-		CloseTransientFile(fd);
-		return NULL;
-	}
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("incorrect alignment of CRC offset for two-phase state file \"%s\"",
+						path)));
 
 	/*
 	 * OK, slurp in the file.
@@ -1279,37 +1271,31 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	r = read(fd, buf, stat.st_size);
 	if (r != stat.st_size)
 	{
-		int			save_errno = errno;
-
-		pgstat_report_wait_end();
-		CloseTransientFile(fd);
-		if (give_warnings)
-		{
-			if (r < 0)
-			{
-				errno = save_errno;
-				ereport(WARNING,
-						(errcode_for_file_access(),
-						 errmsg("could not read file \"%s\": %m", path)));
-			}
-			else
-				ereport(WARNING,
-						(errmsg("could not read file \"%s\": read %d of %zu",
-								path, r, (Size) stat.st_size)));
-		}
-		pfree(buf);
-		return NULL;
+		if (r < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read file \"%s\": %m", path)));
+		else
+			ereport(ERROR,
+					(errmsg("could not read file \"%s\": read %d of %zu",
+							path, r, (Size) stat.st_size)));
 	}
 
 	pgstat_report_wait_end();
 	CloseTransientFile(fd);
 
 	hdr = (TwoPhaseFileHeader *) buf;
-	if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size)
-	{
-		pfree(buf);
-		return NULL;
-	}
+	if (hdr->magic != TWOPHASE_MAGIC)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid magic number stored in two-phase state file \"%s\"",
+						path)));
+
+	if (hdr->total_len != stat.st_size)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid size stored in two-phase state file \"%s\"",
+						path)));
 
 	INIT_CRC32C(calc_crc);
 	COMP_CRC32C(calc_crc, buf, crc_offset);
@@ -1318,10 +1304,10 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	file_crc = *((pg_crc32c *) (buf + crc_offset));
 
 	if (!EQ_CRC32C(calc_crc, file_crc))
-	{
-		pfree(buf);
-		return NULL;
-	}
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("calculated CRC checksum does not match value stored in two^phase state file \"%s\"",
+						path)));
 
 	return buf;
 }
@@ -1430,7 +1416,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid)
 		return false;			/* nothing to do */
 
 	/* Read and validate file */
-	buf = ReadTwoPhaseFile(xid, false);
+	buf = ReadTwoPhaseFile(xid, true);
 	if (buf == NULL)
 		return false;
 
@@ -1478,7 +1464,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 	 * to disk if for some reason they have lived for a long time.
 	 */
 	if (gxact->ondisk)
-		buf = ReadTwoPhaseFile(xid, true);
+		buf = ReadTwoPhaseFile(xid, false);
 	else
 		XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
 
@@ -1872,6 +1858,10 @@ restoreTwoPhaseData(void)
  * write a WAL entry, and so there might be no evidence in WAL of those
  * subxact XIDs.
  *
+ * On corrupted two-phase files, fail immediately.  Keeping around broken
+ * entries and let replay continue causes harm on the system, and a new
+ * backup should be rolled in.
+ *
  * Our other responsibility is to determine and return the oldest valid XID
  * among the prepared xacts (if none, return ShmemVariableCache->nextXid).
  * This is needed to synchronize pg_subtrans startup properly.
@@ -2162,15 +2152,7 @@ ProcessTwoPhaseBuffer(TransactionId xid,
 	if (fromdisk)
 	{
 		/* Read and validate file */
-		buf = ReadTwoPhaseFile(xid, true);
-		if (buf == NULL)
-		{
-			ereport(WARNING,
-					(errmsg("removing corrupt two-phase state file for transaction %u",
-							xid)));
-			RemoveTwoPhaseFile(xid, true);
-			return NULL;
-		}
+		buf = ReadTwoPhaseFile(xid, false);
 	}
 	else
 	{
@@ -2183,21 +2165,15 @@ ProcessTwoPhaseBuffer(TransactionId xid,
 	if (!TransactionIdEquals(hdr->xid, xid))
 	{
 		if (fromdisk)
-		{
-			ereport(WARNING,
-					(errmsg("removing corrupt two-phase state file for transaction %u",
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("corrupted two-phase state file for \"%u\"",
 							xid)));
-			RemoveTwoPhaseFile(xid, true);
-		}
 		else
-		{
-			ereport(WARNING,
-					(errmsg("removing corrupt two-phase state from memory for transaction %u",
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("corrupted two-phase state in memory for \"%u\"",
 							xid)));
-			PrepareRedoRemove(xid, true);
-		}
-		pfree(buf);
-		return NULL;
 	}
 
 	/*
-- 
2.18.0

Attachment: signature.asc
Description: PGP signature

Reply via email to