On 2016-04-14 11:09:29 +0530, Abhijit Menon-Sen wrote:
> At 2016-04-12 09:00:57 -0400, robertmh...@gmail.com wrote:
> >
> > On Mon, Apr 11, 2016 at 1:17 PM, Andres Freund <and...@anarazel.de> wrote:
> > >
> > > 3) Actually handle the case of the last open segment not being
> > >    RELSEG_SIZE properly in _mdfd_getseg() - mdnblocks() does so.
> > 
> > #3 seems like it's probably about 15 years overdue, so let's do that
> > anyway.
> 
> I don't have anything useful to contribute, I'm just trying to
> understand this fix.
> 
> _mdfd_getseg() looks like this:
> 
>       targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
>       for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
>       {
>               Assert(nextsegno == v->mdfd_segno + 1);
> 
>               if (v->mdfd_chain == NULL)
>               {
>                       /*
>                        * Normally we will create new segments only if 
> authorized by the
>                        * caller (i.e., we are doing mdextend()). […]
>                        */
>                       if (behavior == EXTENSION_CREATE || InRecovery)
>                       {
>                               if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
>                                     /* mdextend */
>                               v->mdfd_chain = _mdfd_openseg(reln, forknum, 
> +nextsegno, O_CREAT);
>                       }
>                       else
>                       {
>                               /* We won't create segment if not existent */
>                               v->mdfd_chain = _mdfd_openseg(reln, forknum, 
> nextsegno, 0);
>                       }
>                       if (v->mdfd_chain == NULL)
>                             /* return NULL or ERROR */
>               }
>               v = v->mdfd_chain;
>       }
> 
> Do I understand correctly that the case of the "last open segment"
> (i.e., the one for which mdfd_chain == NULL) not being RELSEG_SIZE
> (i.e., _mdnblocks(reln, forknum, v) < RELSEG_SIZE) should not call
> _mfdf_openseg on nextsegno if behaviour is not EXTENSION_CREATE or
> InRecovery?
>
> And that "We won't create segment if not existent" should happen, but
> doesn't only because the next segment file wasn't removed earlier, so
> we have to add an extra check for that case?
> 
> In other words, is something like the following what's needed here, or
> is there more to it?

It's a bit more complicated than that, but that's the principle. Thanks
for the patch, and sorry for my late response. See my attached version
for a more fleshed out version of this.

Looking at the code around this I found:
* _mdfd_getseg(), in contrast to mdnblocks() doesn't check segment size,
  neither whether to continue with a too small, nor to error out with a
  too big segment
* For, imo pretty bad, reasons in mdsync() we currently rely on
  EXTENSION_RETURN_NULL to extend the file in recovery, and to set errno
  to ENOENT. Brrr.
* we currently FATAL if a segment is too big - I've copied that
  behaviour, but why not just ERROR out?

The attached patch basically adds the segment size checks to
_mdfd_getseg(), and doesn't perform extension, even in recovery, if
EXTENSION_REALLY_RETURN_NULL is passed.

This fixes the issue for me, both in the originally reported variant,
and in some more rudely setup version (i.e. rm'ing files).


I'm seing some weird behaviour (even with writeback flushing disabled)
with smgr invals and recovery/standbys, but I don't want to dive into it
before 
http://www.postgresql.org/message-id/CAD21AoDpZ6Xjg=gFrGPnSn4oTRRcwK1EBrWCq9OqOHuAcMMC=w...@mail.gmail.com
is addressed, it's too likely to be related.

Greetings,

Andres Freund
>From ea6bd2b96e9d8c499b7c926f5f4b05b39dfbc540 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Fri, 22 Apr 2016 09:45:26 -0700
Subject: [PATCH] Don't open formally non-existant segments in _mdfd_getseg().

Discussion: CAA-aLv6Dp_ZsV-44QA-2zgkqWKQq=GedBX2dRSrWpxqovXK=p...@mail.gmail.com
Fixes: 428b1d6b29ca599c5700d4bc4f4ce4c5880369bf
---
 src/backend/storage/smgr/md.c | 86 ++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 26 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 578276d..89a20d1 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -165,9 +165,14 @@ static CycleCtr mdckpt_cycle_ctr = 0;
 
 typedef enum					/* behavior for mdopen & _mdfd_getseg */
 {
-	EXTENSION_FAIL,				/* ereport if segment not present */
-	EXTENSION_RETURN_NULL,		/* return NULL if not present */
-	EXTENSION_CREATE			/* create new segments as needed */
+	/* ereport if segment not present, create in recovery */
+	EXTENSION_FAIL,
+	/* return NULL if not present, create in recovery */
+	EXTENSION_RETURN_NULL,
+	/* return NULL if not present */
+	EXTENSION_REALLY_RETURN_NULL,
+	/* create new segments as needed */
+	EXTENSION_CREATE
 } ExtensionBehavior;
 
 /* local routines */
@@ -591,7 +596,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 		if (fd < 0)
 		{
-			if (behavior == EXTENSION_RETURN_NULL &&
+			if ((behavior == EXTENSION_RETURN_NULL ||
+				 behavior == EXTENSION_REALLY_RETURN_NULL) &&
 				FILE_POSSIBLY_DELETED(errno))
 			{
 				pfree(path);
@@ -685,7 +691,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
 					segnum_end;
 
 		v = _mdfd_getseg(reln, forknum, blocknum, false,
-						 EXTENSION_RETURN_NULL);
+						 EXTENSION_REALLY_RETURN_NULL);
 
 		/*
 		 * We might be flushing buffers of already removed relations, that's
@@ -1774,7 +1780,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 	BlockNumber nextsegno;
 
 	if (!v)
-		return NULL;			/* only possible if EXTENSION_RETURN_NULL */
+		return NULL;			/* if EXTENSION_(REALLY_)RETURN_NULL */
 
 	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
 	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
@@ -1783,23 +1789,30 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 
 		if (v->mdfd_chain == NULL)
 		{
-			/*
-			 * Normally we will create new segments only if authorized by the
-			 * caller (i.e., we are doing mdextend()).  But when doing WAL
-			 * recovery, create segments anyway; this allows cases such as
-			 * replaying WAL data that has a write into a high-numbered
-			 * segment of a relation that was later deleted.  We want to go
-			 * ahead and create the segments so we can finish out the replay.
-			 *
-			 * We have to maintain the invariant that segments before the last
-			 * active segment are of size RELSEG_SIZE; therefore, pad them out
-			 * with zeroes if needed.  (This only matters if caller is
-			 * extending the relation discontiguously, but that can happen in
-			 * hash indexes.)
-			 */
-			if (behavior == EXTENSION_CREATE || InRecovery)
+			BlockNumber nblocks = _mdnblocks(reln, forknum, v);
+
+			if (nblocks > ((BlockNumber) RELSEG_SIZE))
+				elog(FATAL, "segment too big");
+
+			if (behavior == EXTENSION_CREATE ||
+				(InRecovery && behavior != EXTENSION_REALLY_RETURN_NULL))
 			{
-				if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
+				/*
+				 * Normally we will create new segments only if authorized by
+				 * the caller (i.e., we are doing mdextend()).  But when doing
+				 * WAL recovery, create segments anyway; this allows cases
+				 * such as replaying WAL data that has a write into a
+				 * high-numbered segment of a relation that was later deleted.
+				 * We want to go ahead and create the segments so we can
+				 * finish out the replay.
+				 *
+				 * We have to maintain the invariant that segments before the
+				 * last active segment are of size RELSEG_SIZE; therefore, if
+				 * extending, pad them out with zeroes if needed.  (This only
+				 * matters if caller is extending the relation
+				 * discontiguously, but that can happen in hash indexes.)
+				 */
+				if (nblocks < ((BlockNumber) RELSEG_SIZE))
 				{
 					char	   *zerobuf = palloc0(BLCKSZ);
 
@@ -1810,14 +1823,35 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 				}
 				v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
 			}
-			else
+			else if (nblocks < ((BlockNumber) RELSEG_SIZE))
 			{
-				/* We won't create segment if not existent */
-				v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
+				/*
+				 * When not extending, only open the next segment if the
+				 * current one is exactly RELSEG_SIZE.
+				 */
+				if (behavior == EXTENSION_RETURN_NULL ||
+					behavior == EXTENSION_REALLY_RETURN_NULL)
+				{
+					errno = ENOENT;		/* some callers check errno, yuck */
+					return NULL;
+				}
+				else
+				{
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
+									_mdfd_segpath(reln, forknum, nextsegno),
+									blkno, nblocks)));
+				}
 			}
+
+			/* We won't create segment if not existent */
+			v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
+
 			if (v->mdfd_chain == NULL)
 			{
-				if (behavior == EXTENSION_RETURN_NULL &&
+				if ((behavior == EXTENSION_RETURN_NULL ||
+					 behavior == EXTENSION_REALLY_RETURN_NULL) &&
 					FILE_POSSIBLY_DELETED(errno))
 					return NULL;
 				ereport(ERROR,
-- 
2.7.0.229.g701fa7f

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to