On 2016-06-30 18:14:15 -0700, Peter Geoghegan wrote:
> On Tue, Dec 15, 2015 at 10:04 AM, Andres Freund <and...@anarazel.de> wrote:
> > Took a while. But here we go. The attached version is a significantly
> > revised version of my earlier patch. Notably I've pretty much entirely
> > revised the code in _mdfd_getseg() to be more similar to the state in
> > master. Also some comment policing.
> 
> Are you planning to pick this up again, Andres?

Rebased version attached. A review would be welcome. Plan to push this
forward otherwise in the not too far away future.

Andres
>From 62b0d36864b23b91961bb800c1b0814f20d00a8e Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 18 Aug 2016 16:04:33 -0700
Subject: [PATCH 1/2] Improve scalability of md.c for large relations.

Previously several routines in md.c were O(#segments) - a problem these
days, where it's not uncommon to have tables in the multi TB range.

Replace the linked list of segments hanging of SMgrRelationData, with an
array of opened segments. That allows O(1) access to individual
segments, if they've previously been opened.
---
 src/backend/storage/smgr/md.c   | 408 ++++++++++++++++++++++------------------
 src/backend/storage/smgr/smgr.c |   4 +-
 src/include/storage/smgr.h      |   8 +-
 3 files changed, 234 insertions(+), 186 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index f329d15..0992a7e 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -92,27 +92,23 @@
  *	out to an unlinked old copy of a segment file that will eventually
  *	disappear.
  *
- *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
- *	cache is, therefore, just the head of a list of MdfdVec objects, one
- *	per segment.  But note the md_fd pointer can be NULL, indicating
- *	relation not open.
+ *	File descriptors are stored in md_seg_fds arrays inside SMgrRelation. The
+ *	length of these arrays is stored in md_num_open_segs.  Note that
+ *	md_num_open_segs having a specific value does not necessarily mean the
+ *	relation doesn't have additional segments; we may just not have opened the
+ *	next segment yet.  (We could not have "all segments are in the array" as
+ *	an invariant anyway, since another backend could extend the relation while
+ *	we aren't looking.)  We do not have entries for inactive segments,
+ *	however; as soon as we find a partial segment, we assume that any
+ *	subsequent segments are inactive.
  *
- *	Also note that mdfd_chain == NULL does not necessarily mean the relation
- *	doesn't have another segment after this one; we may just not have
- *	opened the next segment yet.  (We could not have "all segments are
- *	in the chain" as an invariant anyway, since another backend could
- *	extend the relation when we weren't looking.)  We do not make chain
- *	entries for inactive segments, however; as soon as we find a partial
- *	segment, we assume that any subsequent segments are inactive.
- *
- *	All MdfdVec objects are palloc'd in the MdCxt memory context.
+ *	The entire MdfdVec array is palloc'd in the MdCxt memory context.
  */
 
 typedef struct _MdfdVec
 {
 	File		mdfd_vfd;		/* fd number in fd.c's pool */
 	BlockNumber mdfd_segno;		/* segment number, from 0 */
-	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
@@ -189,7 +185,9 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
 					   MdfdVec *seg);
 static void register_unlink(RelFileNodeBackend rnode);
-static MdfdVec *_fdvec_alloc(void);
+static void _fdvec_resize(SMgrRelation reln,
+			  ForkNumber forknum,
+			  int nseg);
 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
 			  BlockNumber segno);
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
@@ -298,13 +296,14 @@ mdexists(SMgrRelation reln, ForkNumber forkNum)
 void
 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 {
+	MdfdVec    *mdfd;
 	char	   *path;
 	File		fd;
 
-	if (isRedo && reln->md_fd[forkNum] != NULL)
+	if (isRedo && reln->md_num_open_segs[forkNum] > 0)
 		return;					/* created and opened already... */
 
-	Assert(reln->md_fd[forkNum] == NULL);
+	Assert(reln->md_num_open_segs[forkNum] == 0);
 
 	path = relpath(reln->smgr_rnode, forkNum);
 
@@ -334,11 +333,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 
 	pfree(path);
 
-	reln->md_fd[forkNum] = _fdvec_alloc();
-
-	reln->md_fd[forkNum]->mdfd_vfd = fd;
-	reln->md_fd[forkNum]->mdfd_segno = 0;
-	reln->md_fd[forkNum]->mdfd_chain = NULL;
+	_fdvec_resize(reln, forkNum, 1);
+	mdfd = &reln->md_seg_fds[forkNum][0];
+	mdfd->mdfd_vfd = fd;
+	mdfd->mdfd_segno = 0;
 }
 
 /*
@@ -583,8 +581,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 	File		fd;
 
 	/* No work if already open */
-	if (reln->md_fd[forknum])
-		return reln->md_fd[forknum];
+	if (reln->md_num_open_segs[forknum] > 0)
+		return &reln->md_seg_fds[forknum][0];
 
 	path = relpath(reln->smgr_rnode, forknum);
 
@@ -616,11 +614,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 
 	pfree(path);
 
-	reln->md_fd[forknum] = mdfd = _fdvec_alloc();
-
+	_fdvec_resize(reln, forknum, 1);
+	mdfd = &reln->md_seg_fds[forknum][0];
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
-	mdfd->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 
 	return mdfd;
@@ -632,25 +630,29 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 void
 mdclose(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v = reln->md_fd[forknum];
+	int			nopensegs = reln->md_num_open_segs[forknum];
 
 	/* No work if already closed */
-	if (v == NULL)
+	if (nopensegs == 0)
 		return;
 
-	reln->md_fd[forknum] = NULL;	/* prevent dangling pointer after error */
-
-	while (v != NULL)
+	/* close segments starting from the end */
+	while (nopensegs > 0)
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];
 
 		/* if not closed already */
 		if (v->mdfd_vfd >= 0)
+		{
 			FileClose(v->mdfd_vfd);
-		/* Now free vector */
-		v = v->mdfd_chain;
-		pfree(ov);
+			v->mdfd_vfd = -1;
+		}
+
+		nopensegs--;
 	}
+
+	/* resize just once, avoids pointless reallocations */
+	_fdvec_resize(reln, forknum, 0);
 }
 
 /*
@@ -866,9 +868,9 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  *	mdnblocks() -- Get the number of blocks stored in a relation.
  *
  *		Important side effect: all active segments of the relation are opened
- *		and added to the mdfd_chain list.  If this routine has not been
+ *		and added to the mdfd_seg_fds array.  If this routine has not been
  *		called, then only segments up to the last one actually touched
- *		are present in the chain.
+ *		are present in the array.
  */
 BlockNumber
 mdnblocks(SMgrRelation reln, ForkNumber forknum)
@@ -877,24 +879,24 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 	BlockNumber nblocks;
 	BlockNumber segno = 0;
 
+	/* mdopen has opened the first segment */
+	Assert(reln->md_num_open_segs[forknum] > 0);
+
 	/*
-	 * Skip through any segments that aren't the last one, to avoid redundant
-	 * seeks on them.  We have previously verified that these segments are
-	 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
+	 * Start from the last open segments, to avoid redundant seeks.  We have
+	 * previously verified that these segments are exactly RELSEG_SIZE long,
+	 * and it's useless to recheck that each time.
 	 *
 	 * NOTE: this assumption could only be wrong if another backend has
 	 * truncated the relation.  We rely on higher code levels to handle that
 	 * scenario by closing and re-opening the md fd, which is handled via
 	 * relcache flush.  (Since the checkpointer doesn't participate in
-	 * relcache flush, it could have segment chain entries for inactive
-	 * segments; that's OK because the checkpointer never needs to compute
-	 * relation size.)
+	 * relcache flush, it could have segment entries for inactive segments;
+	 * that's OK because the checkpointer never needs to compute relation
+	 * size.)
 	 */
-	while (v->mdfd_chain != NULL)
-	{
-		segno++;
-		v = v->mdfd_chain;
-	}
+	segno = reln->md_num_open_segs[forknum] - 1;
+	v = &reln->md_seg_fds[forknum][segno];
 
 	for (;;)
 	{
@@ -909,21 +911,16 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 		 */
 		segno++;
 
-		if (v->mdfd_chain == NULL)
-		{
-			/*
-			 * We used to pass O_CREAT here, but that's has the disadvantage
-			 * that it might create a segment which has vanished through some
-			 * operating system misadventure.  In such a case, creating the
-			 * segment here undermines _mdfd_getseg's attempts to notice and
-			 * report an error upon access to a missing segment.
-			 */
-			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, 0);
-			if (v->mdfd_chain == NULL)
-				return segno * ((BlockNumber) RELSEG_SIZE);
-		}
-
-		v = v->mdfd_chain;
+		/*
+		 * We used to pass O_CREAT here, but that's has the disadvantage that
+		 * it might create a segment which has vanished through some operating
+		 * system misadventure.  In such a case, creating the segment here
+		 * undermines _mdfd_getseg's attempts to notice and report an error
+		 * upon access to a missing segment.
+		 */
+		v = _mdfd_openseg(reln, forknum, segno, 0);
+		if (v == NULL)
+			return segno * ((BlockNumber) RELSEG_SIZE);
 	}
 }
 
@@ -933,9 +930,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 void
 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
-	MdfdVec    *v;
 	BlockNumber curnblk;
 	BlockNumber priorblocks;
+	int			curopensegs;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -955,19 +952,24 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	if (nblocks == curnblk)
 		return;					/* no work */
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
-
-	priorblocks = 0;
-	while (v != NULL)
+	/*
+	 * Truncate segments, starting at the last one. Starting at the end makes
+	 * managing the memory for the fd array easier, should there be errors.
+	 */
+	curopensegs = reln->md_num_open_segs[forknum];
+	while (curopensegs > 0)
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v;
+
+		priorblocks = (curopensegs - 1) * RELSEG_SIZE;
+
+		v = &reln->md_seg_fds[forknum][curopensegs - 1];
 
 		if (priorblocks > nblocks)
 		{
 			/*
-			 * This segment is no longer active (and has already been unlinked
-			 * from the mdfd_chain). We truncate the file, but do not delete
-			 * it, for reasons explained in the header comments.
+			 * This segment is no longer active. We truncate the file, but do
+			 * not delete it, for reasons explained in the header comments.
 			 */
 			if (FileTruncate(v->mdfd_vfd, 0) < 0)
 				ereport(ERROR,
@@ -977,20 +979,20 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
-												 * segment */
-			pfree(ov);
+
+			/* we never drop the 1st segment */
+			Assert(v != &reln->md_seg_fds[forknum][0]);
+
+			_fdvec_resize(reln, forknum, curopensegs - 1);
 		}
 		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 		{
 			/*
 			 * This is the last segment we want to keep. Truncate the file to
-			 * the right length, and clear chain link that points to any
-			 * remaining segments (which we shall zap). NOTE: if nblocks is
-			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
-			 * segment to 0 length but keep it. This adheres to the invariant
-			 * given in the header comments.
+			 * the right length. NOTE: if nblocks is exactly a multiple K of
+			 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
+			 * keep it. This adheres to the invariant given in the header
+			 * comments.
 			 */
 			BlockNumber lastsegblocks = nblocks - priorblocks;
 
@@ -1002,18 +1004,16 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 						   nblocks)));
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			ov->mdfd_chain = NULL;
 		}
 		else
 		{
 			/*
-			 * We still need this segment and 0 or more blocks beyond it, so
-			 * nothing to do here.
+			 * We still need this segment, so nothing to do for this and any
+			 * earlier segment.
 			 */
-			v = v->mdfd_chain;
+			break;
 		}
-		priorblocks += RELSEG_SIZE;
+		curopensegs--;
 	}
 }
 
@@ -1026,7 +1026,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 void
 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v;
+	int			segno;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1034,16 +1034,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 	 */
 	mdnblocks(reln, forknum);
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
+	segno = reln->md_num_open_segs[forknum];
 
-	while (v != NULL)
+	while (segno > 0)
 	{
+		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
+
 		if (FileSync(v->mdfd_vfd) < 0)
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
 							FilePathName(v->mdfd_vfd))));
-		v = v->mdfd_chain;
+		segno--;
 	}
 }
 
@@ -1706,12 +1708,34 @@ ForgetDatabaseFsyncRequests(Oid dbid)
 
 
 /*
- *	_fdvec_alloc() -- Make a MdfdVec object.
+ *	_fdvec_resize() -- Resize the fork's open segments array
  */
-static MdfdVec *
-_fdvec_alloc(void)
+static void
+_fdvec_resize(SMgrRelation reln,
+			  ForkNumber forknum,
+			  int nseg)
 {
-	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
+	if (nseg == 0)
+	{
+		if (reln->md_num_open_segs[forknum] > 0)
+		{
+			pfree(reln->md_seg_fds[forknum]);
+			reln->md_seg_fds[forknum] = NULL;
+		}
+	}
+	else if (reln->md_num_open_segs[forknum] == 0)
+	{
+		reln->md_seg_fds[forknum] =
+			MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
+	}
+	else
+	{
+		reln->md_seg_fds[forknum] =
+			repalloc(reln->md_seg_fds[forknum],
+					 sizeof(MdfdVec) * nseg);
+	}
+
+	reln->md_num_open_segs[forknum] = nseg;
 }
 
 /*
@@ -1759,13 +1783,14 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	if (fd < 0)
 		return NULL;
 
-	/* allocate an mdfdvec entry for it */
-	v = _fdvec_alloc();
+	if (segno <= reln->md_num_open_segs[forknum])
+		_fdvec_resize(reln, forknum, segno + 1);
 
 	/* fill the entry */
+	v = &reln->md_seg_fds[forknum][segno];
 	v->mdfd_vfd = fd;
 	v->mdfd_segno = segno;
-	v->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 
 	/* all done */
@@ -1784,7 +1809,7 @@ static MdfdVec *
 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 bool skipFsync, int behavior)
 {
-	MdfdVec    *v = mdopen(reln, forknum, behavior);
+	MdfdVec    *v;
 	BlockNumber targetseg;
 	BlockNumber nextsegno;
 
@@ -1792,98 +1817,117 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 	Assert(behavior &
 		   (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));
 
-	if (!v)
-		return NULL;			/* if behavior & EXTENSION_RETURN_NULL */
-
 	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
-	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
+
+	/* if an existing and opened segment, we're done */
+	if (targetseg < reln->md_num_open_segs[forknum])
 	{
+		v = &reln->md_seg_fds[forknum][targetseg];
+		return v;
+	}
+
+	/*
+	 * The target segment is not yet open. Iterate over all the segments
+	 * between the last opened and the target segment. This way missing
+	 * segments either raise an error, or get created (according to
+	 * 'behavior'). Start with either the last opened, or the first segment if
+	 * none was opened before.
+	 */
+	if (reln->md_num_open_segs[forknum] > 0)
+		v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
+	else
+	{
+		v = mdopen(reln, forknum, behavior);
+		if (!v)
+			return NULL;			/* if behavior & EXTENSION_RETURN_NULL */
+	}
+
+	for (nextsegno = reln->md_num_open_segs[forknum];
+		 nextsegno <= targetseg; nextsegno++)
+	{
+		BlockNumber nblocks = _mdnblocks(reln, forknum, v);
+		int			flags = 0;
+
 		Assert(nextsegno == v->mdfd_segno + 1);
 
-		if (v->mdfd_chain == NULL)
+		if (nblocks > ((BlockNumber) RELSEG_SIZE))
+			elog(FATAL, "segment too big");
+
+		if ((behavior & EXTENSION_CREATE) ||
+			(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
 		{
-			BlockNumber nblocks = _mdnblocks(reln, forknum, v);
-			int			flags = 0;
-
-			if (nblocks > ((BlockNumber) RELSEG_SIZE))
-				elog(FATAL, "segment too big");
-
-			if ((behavior & EXTENSION_CREATE) ||
-				(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
+			/*
+			 * Normally we will create new segments only if authorized by
+			 * the caller (i.e., we are doing mdextend()).  But when doing
+			 * WAL recovery, create segments anyway; this allows cases
+			 * such as replaying WAL data that has a write into a
+			 * high-numbered segment of a relation that was later deleted.
+			 * We want to go ahead and create the segments so we can
+			 * finish out the replay.  However if the caller has specified
+			 * EXTENSION_REALLY_RETURN_NULL, then extension is not desired
+			 * even in recovery; we won't reach this point in that case.
+			 *
+			 * We have to maintain the invariant that segments before the
+			 * last active segment are of size RELSEG_SIZE; therefore, if
+			 * extending, pad them out with zeroes if needed.  (This only
+			 * matters if in recovery, or if the caller is extending the
+			 * relation discontiguously, but that can happen in hash
+			 * indexes.)
+			 */
+			if (nblocks < ((BlockNumber) RELSEG_SIZE))
 			{
-				/*
-				 * Normally we will create new segments only if authorized by
-				 * the caller (i.e., we are doing mdextend()).  But when doing
-				 * WAL recovery, create segments anyway; this allows cases
-				 * such as replaying WAL data that has a write into a
-				 * high-numbered segment of a relation that was later deleted.
-				 * We want to go ahead and create the segments so we can
-				 * finish out the replay.  However if the caller has specified
-				 * EXTENSION_REALLY_RETURN_NULL, then extension is not desired
-				 * even in recovery; we won't reach this point in that case.
-				 *
-				 * We have to maintain the invariant that segments before the
-				 * last active segment are of size RELSEG_SIZE; therefore, if
-				 * extending, pad them out with zeroes if needed.  (This only
-				 * matters if in recovery, or if the caller is extending the
-				 * relation discontiguously, but that can happen in hash
-				 * indexes.)
-				 */
-				if (nblocks < ((BlockNumber) RELSEG_SIZE))
-				{
-					char	   *zerobuf = palloc0(BLCKSZ);
+				char	   *zerobuf = palloc0(BLCKSZ);
 
-					mdextend(reln, forknum,
-							 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
-							 zerobuf, skipFsync);
-					pfree(zerobuf);
-				}
-				flags = O_CREAT;
-			}
-			else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
-					 nblocks < ((BlockNumber) RELSEG_SIZE))
-			{
-				/*
-				 * When not extending (or explicitly including truncated
-				 * segments), only open the next segment if the current one is
-				 * exactly RELSEG_SIZE.  If not (this branch), either return
-				 * NULL or fail.
-				 */
-				if (behavior & EXTENSION_RETURN_NULL)
-				{
-					/*
-					 * Some callers discern between reasons for _mdfd_getseg()
-					 * returning NULL based on errno. As there's no failing
-					 * syscall involved in this case, explicitly set errno to
-					 * ENOENT, as that seems the closest interpretation.
-					 */
-					errno = ENOENT;
-					return NULL;
-				}
-
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
-								_mdfd_segpath(reln, forknum, nextsegno),
-								blkno, nblocks)));
-			}
-
-			v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, flags);
-
-			if (v->mdfd_chain == NULL)
-			{
-				if ((behavior & EXTENSION_RETURN_NULL) &&
-					FILE_POSSIBLY_DELETED(errno))
-					return NULL;
-				ereport(ERROR,
-						(errcode_for_file_access(),
-				   errmsg("could not open file \"%s\" (target block %u): %m",
-						  _mdfd_segpath(reln, forknum, nextsegno),
-						  blkno)));
+				mdextend(reln, forknum,
+						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+						 zerobuf, skipFsync);
+				pfree(zerobuf);
 			}
+			flags = O_CREAT;
+		}
+		else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
+				 nblocks < ((BlockNumber) RELSEG_SIZE))
+		{
+			/*
+			 * When not extending (or explicitly including truncated
+			 * segments), only open the next segment if the current one is
+			 * exactly RELSEG_SIZE.  If not (this branch), either return
+			 * NULL or fail.
+			 */
+			if (behavior & EXTENSION_RETURN_NULL)
+			{
+				/*
+				 * Some callers discern between reasons for _mdfd_getseg()
+				 * returning NULL based on errno. As there's no failing
+				 * syscall involved in this case, explicitly set errno to
+				 * ENOENT, as that seems the closest interpretation.
+				 */
+				errno = ENOENT;
+				return NULL;
+			}
+
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
+							_mdfd_segpath(reln, forknum, nextsegno),
+							blkno, nblocks)));
+		}
+
+		v = _mdfd_openseg(reln, forknum, nextsegno, flags);
+
+		if (v == NULL)
+		{
+			if ((behavior & EXTENSION_RETURN_NULL) &&
+				FILE_POSSIBLY_DELETED(errno))
+				return NULL;
+			ereport(ERROR,
+					(errcode_for_file_access(),
+			   errmsg("could not open file \"%s\" (target block %u): %m",
+					  _mdfd_segpath(reln, forknum, nextsegno),
+					  blkno)));
 		}
-		v = v->mdfd_chain;
 	}
+
 	return v;
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 94aa952..93f0080 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -174,7 +174,7 @@ smgropen(RelFileNode rnode, BackendId backend)
 
 		/* mark it not open */
 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-			reln->md_fd[forknum] = NULL;
+			reln->md_num_open_segs[forknum] = 0;
 
 		/* it has no owner yet */
 		add_to_unowned_list(reln);
@@ -379,7 +379,7 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 	 * Exit quickly in WAL replay mode if we've already opened the file. If
 	 * it's open, it surely must exist.
 	 */
-	if (isRedo && reln->md_fd[forknum] != NULL)
+	if (isRedo && reln->md_num_open_segs[forknum] > 0)
 		return;
 
 	/*
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a8e7877..3430d86 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -64,8 +64,12 @@ typedef struct SMgrRelationData
 	 */
 	int			smgr_which;		/* storage manager selector */
 
-	/* for md.c; NULL for forks that are not open */
-	struct _MdfdVec *md_fd[MAX_FORKNUM + 1];
+	/*
+	 * for md.c; per-fork arrays of the number of open segments
+	 * (md_num_open_segs) and the segments themselves (md_seg_fds).
+	 */
+	int			md_num_open_segs[MAX_FORKNUM + 1];
+	struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
 
 	/* if unowned, list link in list of all unowned SMgrRelations */
 	struct SMgrRelationData *next_unowned_reln;
-- 
2.8.1

>From d9321d75a48ac8b817fa469f064c76ce54b84919 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Tue, 15 Dec 2015 19:03:10 +0100
Subject: [PATCH 2/2] Faster PageIsVerified() for the all zeroes case.

That's primarily useful for testing the handling of very large
relations, using sparse files.
---
 src/backend/storage/page/bufpage.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index f2a07f2..1b70bfb 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -81,7 +81,7 @@ bool
 PageIsVerified(Page page, BlockNumber blkno)
 {
 	PageHeader	p = (PageHeader) page;
-	char	   *pagebytes;
+	size_t	   *pagebytes;
 	int			i;
 	bool		checksum_failure = false;
 	bool		header_sane = false;
@@ -118,10 +118,17 @@ PageIsVerified(Page page, BlockNumber blkno)
 			return true;
 	}
 
-	/* Check all-zeroes case */
+	/*
+	 * Check all-zeroes case. Luckily BLCKSZ is guaranteed to always be a
+	 * multiple of size_t - and it's much faster to compare memory using the
+	 * native word size.
+	 */
+	StaticAssertStmt(BLCKSZ == (BLCKSZ / sizeof(size_t)) * sizeof(size_t),
+					 "BLCKSZ has to be a multiple of sizeof(size_t)");
+
 	all_zeroes = true;
-	pagebytes = (char *) page;
-	for (i = 0; i < BLCKSZ; i++)
+	pagebytes = (size_t *) page;
+	for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
 	{
 		if (pagebytes[i] != 0)
 		{
-- 
2.8.1

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to