Re: Incorrect error handling for two-phase state files resulting in data loss

2018-09-07 Thread Michael Paquier
On Fri, Aug 17, 2018 at 07:56:00AM +0900, Michael Paquier wrote:
> As this is a data corruption issue, are there any objections if I patch
> and back-patch?  I also would like to get this stuff in first as I have
> other refactoring work which would shave some more code.

I looked at this patch again after letting it aside for a couple of
weeks and pushed it.  This is a potential, very rare data loss bug, and
nobody complained about it so I have not done any back-patch.  Please
let me know if you would like to see something happen in stable branches
as well.
--
Michael


signature.asc
Description: PGP signature


Re: Incorrect error handling for two-phase state files resulting in data loss

2018-08-16 Thread Michael Paquier
On Wed, Jul 18, 2018 at 05:18:18PM +0900, Michael Paquier wrote:
> On Mon, Jul 09, 2018 at 02:03:09PM +0900, Michael Paquier wrote:
>> I think that we really need to harden things, by making
>> ReadTwoPhaseFile() fail hard is it finds something unexpected, which is
>> in this case anything except trying to open a file which fails on
>> ENOENT, and that this stuff should be back-patched.
> 
> Rebased as attached because of the conflicts from 811b6e3.

So...  I have been doing a self-review of this patch after letting it
aside for a couple of weeks, and I did not spot any fundamental issue
wit hit.  I have spotted two minor issues:

-{
-CloseTransientFile(fd);
-return NULL;
-}
+ereport(ERROR,
+(errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("incorrect size of two-phase state file \"%s\": %zu 
bytes",
+path, stat.st_size)));
This needs to use errmsg_plural.

+ereport(ERROR,
+(errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted two-phase state file for \"%u\"",
This should use "for transaction %u".

As this is a data corruption issue, are there any objections if I patch
and back-patch?  I also would like to get this stuff in first as I have
other refactoring work which would shave some more code.
--
Michael


signature.asc
Description: PGP signature


Re: Incorrect error handling for two-phase state files resulting in data loss

2018-07-18 Thread Michael Paquier
On Mon, Jul 09, 2018 at 02:03:09PM +0900, Michael Paquier wrote:
> I think that we really need to harden things, by making
> ReadTwoPhaseFile() fail hard is it finds something unexpected, which is
> in this case anything except trying to open a file which fails on
> ENOENT, and that this stuff should be back-patched.

Rebased as attached because of the conflicts from 811b6e3.
--
Michael
From 8a6e43eb3faaf707e77f0fb8d0838efb3692832e Mon Sep 17 00:00:00 2001
From: Michael Paquier 
Date: Wed, 18 Jul 2018 17:15:32 +0900
Subject: [PATCH] Fail hard when facing corrupted two-phase state files at
 recovery

When a corrupted file is found by WAL replay, be it for crash recovery
or archive recovery, then the file is simply skipped and a WARNING is
logged to the user.  Facing an on-disk WAL file which is corrupted is
as likely to happen as its pair recorded in dedicated WAL records, but
WAL records are already able to fail hard if there is a CRC mismatch at
record level.

The situation got better since 978b2f6 (9.6) which has added two-phase
state information directly in WAL instead of using on-disk files, so the
risk is limited to two-phase transactions which live across at least one
checkpoint.

Reported-by: Michael Paquier
Author: Michael Paquier
Discussion: https://postgr.es/m/XXX
---
 src/backend/access/transam/twophase.c | 136 +++---
 1 file changed, 56 insertions(+), 80 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 306861bb79..35fcd06b8f 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1206,10 +1206,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
  * Read and validate the state file for xid.
  *
  * If it looks OK (has a valid magic number and CRC), return the palloc'd
- * contents of the file.  Otherwise return NULL.
+ * contents of the file.  If missing_ok is true, do not issue an ERROR and
+ * return NULL.
  */
 static char *
-ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
+ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
 {
 	char		path[MAXPGPATH];
 	char	   *buf;
@@ -1226,11 +1227,12 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
 	if (fd < 0)
 	{
-		if (give_warnings)
-			ereport(WARNING,
-	(errcode_for_file_access(),
-	 errmsg("could not open file \"%s\": %m", path)));
-		return NULL;
+		if (missing_ok && errno == ENOENT)
+			return NULL;
+
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
 	}
 
 	/*
@@ -1240,35 +1242,25 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	 * even on a valid file.
 	 */
 	if (fstat(fd, ))
-	{
-		int			save_errno = errno;
-
-		CloseTransientFile(fd);
-		if (give_warnings)
-		{
-			errno = save_errno;
-			ereport(WARNING,
-	(errcode_for_file_access(),
-	 errmsg("could not stat file \"%s\": %m", path)));
-		}
-		return NULL;
-	}
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", path)));
 
 	if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
 		MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
 		sizeof(pg_crc32c)) ||
 		stat.st_size > MaxAllocSize)
-	{
-		CloseTransientFile(fd);
-		return NULL;
-	}
+		ereport(ERROR,
+(errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("incorrect size of two-phase state file \"%s\": %zu bytes",
+		path, stat.st_size)));
 
 	crc_offset = stat.st_size - sizeof(pg_crc32c);
 	if (crc_offset != MAXALIGN(crc_offset))
-	{
-		CloseTransientFile(fd);
-		return NULL;
-	}
+		ereport(ERROR,
+(errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("incorrect alignment of CRC offset for two-phase state file \"%s\"",
+		path)));
 
 	/*
 	 * OK, slurp in the file.
@@ -1279,37 +1271,31 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	r = read(fd, buf, stat.st_size);
 	if (r != stat.st_size)
 	{
-		int			save_errno = errno;
-
-		pgstat_report_wait_end();
-		CloseTransientFile(fd);
-		if (give_warnings)
-		{
-			if (r < 0)
-			{
-errno = save_errno;
-ereport(WARNING,
-		(errcode_for_file_access(),
-		 errmsg("could not read file \"%s\": %m", path)));
-			}
-			else
-ereport(WARNING,
-		(errmsg("could not read file \"%s\": read %d of %zu",
-path, r, (Size) stat.st_size)));
-		}
-		pfree(buf);
-		return NULL;
+		if (r < 0)
+			ereport(ERROR,
+	(errcode_for_file_access(),
+	 errmsg("could not read file \"%s\": %m", path)));
+		else
+			ereport(ERROR,
+	(errmsg("could not read file \"%s\": read %d of %zu",
+			path, r, (Size) stat.st_size)));
 	}
 
 	pgstat_report_wait_end();
 	CloseTransientFile(fd);
 
 	hdr = (TwoPhaseFileHeader *) buf;
-	if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size)
-	{
-		pfree(buf);
-		return NULL;
-	}
+	if (hdr->magic != TWOPHASE_MAGIC)
+		ereport(ERROR,
+(errcode(ERRCODE_DATA_CORRUPTED),
+ 

Incorrect error handling for two-phase state files resulting in data loss

2018-07-08 Thread Michael Paquier
Hi all,

This is a follow-up of the following thread where I have touched the
topic of corrupted 2PC files being completely ignored by recovery:
https://www.postgresql.org/message-id/20180709012955.GD1467%40paquier.xyz
I have posted a patch on this thread, but after more reviews I have
noticed that it does not close all the holes.

What happens is that when a two-phase state file cannot be read by
recovery when the file is on disk, which is what ReadTwoPhaseFile does,
then its data is simply skipped.  It is perfectly possible that an
on-disk 2PC is missing at crash recovery if it has been already
processed, so if trying to open the file and seeing an ENOENT, then the
file can be properly skipped.  However, if you look at this API, it
skips *all* errors instead of the ones that matter.  For example, if a
file needs to be processed and is cannot be opened because of permission
issues, then it is simply skipped.  If its size, its CRC or its magic
number is incorrect, the same thing happens.  Skipping unconditionally
files this way can result in data loss.

I think that we really need to harden things, by making
ReadTwoPhaseFile() fail hard is it finds something unexpected, which is
in this case anything except trying to open a file which fails on
ENOENT, and that this stuff should be back-patched.

Thoughts?
--
Michael
From c53e3f6b604ceffcf75484975331e1b141bb6139 Mon Sep 17 00:00:00 2001
From: Michael Paquier 
Date: Mon, 9 Jul 2018 13:59:46 +0900
Subject: [PATCH] Fail hard when facing corrupted two-phase state files at
 recovery

When a corrupted file is found by WAL replay, be it for crash recovery
or archive recovery, then the file is simply skipped and a WARNING is
logged to the user.  Facing an on-disk WAL file which is corrupted is
as likely to happen as its pair recorded in dedicated WAL records, but
WAL records are already able to fail hard if there is a CRC mismatch at
record level.

The situation got better since 978b2f6 (9.6) which has added two-phase
state information directly in WAL instead of using on-disk files, so the
risk is limited to two-phase transactions which live across at least one
checkpoint.

Reported-by: Michael Paquier
Author: Michael Paquier
Discussion: https://postgr.es/m/XXX
---
 src/backend/access/transam/twophase.c | 131 +++---
 1 file changed, 53 insertions(+), 78 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index e8d4e37fe3..3a048f9a4b 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1206,10 +1206,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
  * Read and validate the state file for xid.
  *
  * If it looks OK (has a valid magic number and CRC), return the palloc'd
- * contents of the file.  Otherwise return NULL.
+ * contents of the file.  If missing_ok is true, do not issue an ERROR and
+ * return NULL.
  */
 static char *
-ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
+ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
 {
 	char		path[MAXPGPATH];
 	char	   *buf;
@@ -1225,12 +1226,12 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
 	if (fd < 0)
 	{
-		if (give_warnings)
-			ereport(WARNING,
-	(errcode_for_file_access(),
-	 errmsg("could not open two-phase state file \"%s\": %m",
-			path)));
-		return NULL;
+		if (missing_ok && errno == ENOENT)
+			return NULL;
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not open two-phase state file \"%s\": %m",
+		path)));
 	}
 
 	/*
@@ -1240,36 +1241,26 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
 	 * even on a valid file.
 	 */
 	if (fstat(fd, ))
-	{
-		int			save_errno = errno;
-
-		CloseTransientFile(fd);
-		if (give_warnings)
-		{
-			errno = save_errno;
-			ereport(WARNING,
-	(errcode_for_file_access(),
-	 errmsg("could not stat two-phase state file \"%s\": %m",
-			path)));
-		}
-		return NULL;
-	}
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not stat two-phase state file \"%s\": %m",
+		path)));
 
 	if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
 		MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
 		sizeof(pg_crc32c)) ||
 		stat.st_size > MaxAllocSize)
-	{
-		CloseTransientFile(fd);
-		return NULL;
-	}
+		ereport(ERROR,
+(errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("incorrect size of two-phase state file \"%s\": %zu bytes",
+		path, stat.st_size)));
 
 	crc_offset = stat.st_size - sizeof(pg_crc32c);
 	if (crc_offset != MAXALIGN(crc_offset))
-	{
-		CloseTransientFile(fd);
-		return NULL;
-	}
+		ereport(ERROR,
+(errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("incorrect alignment of CRC offset for two-phase state file \"%s\"",
+		path)));
 
 	/*
 	 * OK, slurp in the file.
@@ -1278,32 +1269,26 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)