Hi,

On 2014-03-31 12:10:01 +0200, Andres Freund wrote:
> I recently have seen some perf profiles in which _mdfd_getseg() was in
> the top #3 when VACUUMing large (~200GB) relations. Called by mdread(),
> mdwrite(). Looking at it's implementation, I am not surprised. It
> iterates over all segment entries a relations has; for every read or
> write. That's not painful for smaller relations, but at a couple of
> hundred GB it starts to be annoying. Especially if kernel readahead has
> already read in all data from disk.
> 
> I don't have a good idea what to do about this yet, but it seems like
> something that should be fixed mid-term.
> 
> The best I can come up is is caching the last mdvec used, but that's
> fairly ugly. Alternatively it might be a good idea to not store MdfdVec
> as a linked list, but as a densely allocated array.

I've seen this a couple times more since. On larger relations it gets
even more pronounced. When sequentially scanning a 2TB relation,
_mdfd_getseg() gets up to 80% proportionate CPU time towards the end of
the scan.

I wrote the attached patch that get rids of that essentially quadratic
behaviour, by replacing the mdfd chain/singly linked list with an
array. Since we seldomly grow files by a whole segment I can't see the
slightly bigger memory reallocations matter significantly. In pretty
much every other case the array is bound to be a winner.

Does anybody have fundamental arguments against that idea?

With some additional work we could save a bit more memory by getting rid
of the mdfd_segno as it's essentially redundant - but that's not
entirely trivial and I'm unsure if it's worth it.

I've also attached a second patch that makes PageIsVerified() noticeably
faster when the page is new. That's helpful and related because it makes
it easier to test the correctness of the md.c rewrite by faking full 1GB
segments. It's also pretty simple, so there's imo little reason not to
do it.

Greetings,

Andres Freund

PS:
  The research leading to these results has received funding from the
  European Union's Seventh Framework Programme (FP7/2007-2013) under
  grant agreement n° 318633

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
>From f7bf4813eeee7d5c3c4268e0fb6231ca6996d9fe Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 30 Oct 2014 13:24:20 +0100
Subject: [PATCH 1/2] Faster PageIsVerified() for the all zeroes case.

---
 src/backend/storage/page/bufpage.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 6351a9b..c175ed0 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -81,7 +81,7 @@ bool
 PageIsVerified(Page page, BlockNumber blkno)
 {
 	PageHeader	p = (PageHeader) page;
-	char	   *pagebytes;
+	size_t	   *pagebytes;
 	int			i;
 	bool		checksum_failure = false;
 	bool		header_sane = false;
@@ -118,10 +118,17 @@ PageIsVerified(Page page, BlockNumber blkno)
 			return true;
 	}
 
-	/* Check all-zeroes case */
+	/*
+	 * Check all-zeroes case. Luckily BLCKSZ is guaranteed to always be a
+	 * multiple of size_t - and it's much faster compare memory using the
+	 * processor's native word size.
+	 */
+	StaticAssertStmt(BLCKSZ == (BLCKSZ / sizeof(size_t)) * sizeof(size_t),
+					 "BLCKSZ has to be a multiple of sizeof(size_t)");
+
 	all_zeroes = true;
-	pagebytes = (char *) page;
-	for (i = 0; i < BLCKSZ; i++)
+	pagebytes = (size_t *) page;
+	for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
 	{
 		if (pagebytes[i] != 0)
 		{
-- 
2.0.0.rc2.4.g1dc51c6.dirty

>From 5bc40c11081a0d034f2857b45cb3d45c5b63fa58 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Thu, 30 Oct 2014 13:24:52 +0100
Subject: [PATCH 2/2] Much faster/O(1) mfdvec.

---
 src/backend/storage/smgr/md.c   | 307 +++++++++++++++++++++-------------------
 src/backend/storage/smgr/smgr.c |   4 +-
 src/include/storage/smgr.h      |   4 +-
 3 files changed, 166 insertions(+), 149 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 167d61c..6a7afa8 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -92,27 +92,22 @@
  *	out to an unlinked old copy of a segment file that will eventually
  *	disappear.
  *
- *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
- *	cache is, therefore, just the head of a list of MdfdVec objects, one
- *	per segment.  But note the md_fd pointer can be NULL, indicating
- *	relation not open.
+ *	File descriptors are stored in the md_seg_fds arrays inside
+ *	SMgrRelation. The length of these arrays is stored in md_seg_no.  Note
+ *	that md_nfd having a specific value does not necessarily mean the relation
+ *	doesn't have additional segments; we may just not have opened the next
+ *	segment yet.  (We could not have "all segments are in the array" as an
+ *	invariant anyway, since another backend could extend the relation when we
+ *	weren't looking.)  We do not have entries for inactive segments, however;
+ *	as soon as we find a partial segment, we assume that any subsequent
+ *	segments are inactive.
  *
- *	Also note that mdfd_chain == NULL does not necessarily mean the relation
- *	doesn't have another segment after this one; we may just not have
- *	opened the next segment yet.  (We could not have "all segments are
- *	in the chain" as an invariant anyway, since another backend could
- *	extend the relation when we weren't looking.)  We do not make chain
- *	entries for inactive segments, however; as soon as we find a partial
- *	segment, we assume that any subsequent segments are inactive.
- *
- *	All MdfdVec objects are palloc'd in the MdCxt memory context.
+ *	The entire MdfdVec array is palloc'd in the MdCxt memory context.
  */
-
 typedef struct _MdfdVec
 {
 	File		mdfd_vfd;		/* fd number in fd.c's pool */
 	BlockNumber mdfd_segno;		/* segment number, from 0 */
-	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
@@ -178,7 +173,9 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
 					   MdfdVec *seg);
 static void register_unlink(RelFileNodeBackend rnode);
-static MdfdVec *_fdvec_alloc(void);
+static void _fdvec_alloc(SMgrRelation reln,
+							  ForkNumber forknum,
+							  size_t nseg);
 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
 			  BlockNumber segno);
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
@@ -288,13 +285,14 @@ mdexists(SMgrRelation reln, ForkNumber forkNum)
 void
 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 {
+	MdfdVec    *mdfd;
 	char	   *path;
 	File		fd;
 
-	if (isRedo && reln->md_fd[forkNum] != NULL)
+	if (isRedo && reln->md_seg_no[forkNum] > 0)
 		return;					/* created and opened already... */
 
-	Assert(reln->md_fd[forkNum] == NULL);
+	Assert(reln->md_seg_no[forkNum] == 0);
 
 	path = relpath(reln->smgr_rnode, forkNum);
 
@@ -324,11 +322,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 
 	pfree(path);
 
-	reln->md_fd[forkNum] = _fdvec_alloc();
-
-	reln->md_fd[forkNum]->mdfd_vfd = fd;
-	reln->md_fd[forkNum]->mdfd_segno = 0;
-	reln->md_fd[forkNum]->mdfd_chain = NULL;
+	_fdvec_alloc(reln, forkNum, 1);
+	mdfd = &reln->md_seg_fds[forkNum][0];
+	mdfd->mdfd_vfd = fd;
+	mdfd->mdfd_segno = 0;
 }
 
 /*
@@ -573,8 +570,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 	File		fd;
 
 	/* No work if already open */
-	if (reln->md_fd[forknum])
-		return reln->md_fd[forknum];
+	if (reln->md_seg_no[forknum] > 0)
+		return reln->md_seg_fds[forknum];
 
 	path = relpath(reln->smgr_rnode, forknum);
 
@@ -606,11 +603,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 
 	pfree(path);
 
-	reln->md_fd[forknum] = mdfd = _fdvec_alloc();
-
+	_fdvec_alloc(reln, forknum, 1);
+	mdfd = &reln->md_seg_fds[forknum][0];
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
-	mdfd->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 
 	return mdfd;
@@ -622,24 +619,21 @@ mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
 void
 mdclose(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v = reln->md_fd[forknum];
-
-	/* No work if already closed */
-	if (v == NULL)
-		return;
+	size_t		nopensegs;
 
-	reln->md_fd[forknum] = NULL;	/* prevent dangling pointer after error */
+	nopensegs = reln->md_seg_no[forknum];
 
-	while (v != NULL)
+	/* close segments starting from the end */
+	while (nopensegs > 0)
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];
 
 		/* if not closed already */
 		if (v->mdfd_vfd >= 0)
 			FileClose(v->mdfd_vfd);
-		/* Now free vector */
-		v = v->mdfd_chain;
-		pfree(ov);
+
+		nopensegs--;
+		_fdvec_alloc(reln, forknum, nopensegs);
 	}
 }
 
@@ -814,6 +808,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 	BlockNumber nblocks;
 	BlockNumber segno = 0;
 
+	/* mdopen has opened the first segment */
+	Assert(reln->md_seg_no[forknum] > 0);
+
 	/*
 	 * Skip through any segments that aren't the last one, to avoid redundant
 	 * seeks on them.  We have previously verified that these segments are
@@ -827,11 +824,8 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 	 * segments; that's OK because the checkpointer never needs to compute
 	 * relation size.)
 	 */
-	while (v->mdfd_chain != NULL)
-	{
-		segno++;
-		v = v->mdfd_chain;
-	}
+	segno = reln->md_seg_no[forknum] - 1;
+	v = &reln->md_seg_fds[forknum][segno];
 
 	for (;;)
 	{
@@ -846,23 +840,17 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 		 */
 		segno++;
 
-		if (v->mdfd_chain == NULL)
-		{
-			/*
-			 * Because we pass O_CREAT, we will create the next segment (with
-			 * zero length) immediately, if the last segment is of length
-			 * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
-			 * the logic simple.
-			 */
-			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
-			if (v->mdfd_chain == NULL)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not open file \"%s\": %m",
-								_mdfd_segpath(reln, forknum, segno))));
-		}
-
-		v = v->mdfd_chain;
+		/*
+		 * Because we pass O_CREAT, we will create the next segment (with zero
+		 * length) immediately, if the last segment is of length RELSEG_SIZE.
+		 * While perhaps not strictly necessary, this keeps the logic simple.
+		 */
+		v = _mdfd_openseg(reln, forknum, segno, O_CREAT);
+		if (v == NULL)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not open file \"%s\": %m",
+							_mdfd_segpath(reln, forknum, segno))));
 	}
 }
 
@@ -872,9 +860,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 void
 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
-	MdfdVec    *v;
 	BlockNumber curnblk;
 	BlockNumber priorblocks;
+	size_t		curopensegs;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -894,19 +882,24 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	if (nblocks == curnblk)
 		return;					/* no work */
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
-
-	priorblocks = 0;
-	while (v != NULL)
+	/*
+	 * Truncate segments, starting at the last one. Starting at the end makes
+	 * managing the memory for the fd array easier should there be errors.
+	 */
+	curopensegs = reln->md_seg_no[forknum];
+	while (curopensegs > 0 )
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v;
+
+		priorblocks = (curopensegs - 1) * RELSEG_SIZE;
+
+		v = &reln->md_seg_fds[forknum][curopensegs - 1];
 
 		if (priorblocks > nblocks)
 		{
 			/*
-			 * This segment is no longer active (and has already been unlinked
-			 * from the mdfd_chain). We truncate the file, but do not delete
-			 * it, for reasons explained in the header comments.
+			 * This segment is no longer active. We truncate the file, but do
+			 * not delete it, for reasons explained in the header comments.
 			 */
 			if (FileTruncate(v->mdfd_vfd, 0) < 0)
 				ereport(ERROR,
@@ -916,20 +909,20 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
-												 * segment */
-			pfree(ov);
+
+			/* we never drop the 1st segment */
+			Assert(v != &reln->md_seg_fds[forknum][0]);
+
+			_fdvec_alloc(reln, forknum, curopensegs - 1);
 		}
 		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 		{
 			/*
 			 * This is the last segment we want to keep. Truncate the file to
-			 * the right length, and clear chain link that points to any
-			 * remaining segments (which we shall zap). NOTE: if nblocks is
-			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
-			 * segment to 0 length but keep it. This adheres to the invariant
-			 * given in the header comments.
+			 * the right length. NOTE: if nblocks is exactly a multiple K of
+			 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
+			 * keep it. This adheres to the invariant given in the header
+			 * comments.
 			 */
 			BlockNumber lastsegblocks = nblocks - priorblocks;
 
@@ -941,18 +934,16 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 						   nblocks)));
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			ov->mdfd_chain = NULL;
 		}
 		else
 		{
 			/*
-			 * We still need this segment and 0 or more blocks beyond it, so
-			 * nothing to do here.
+			 * We still need this segment, so nothing to do for this and any
+			 * earlier segment.
 			 */
-			v = v->mdfd_chain;
+			break;
 		}
-		priorblocks += RELSEG_SIZE;
+		curopensegs--;
 	}
 }
 
@@ -965,7 +956,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 void
 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v;
+	size_t		i;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -973,16 +964,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 	 */
 	mdnblocks(reln, forknum);
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
+	i = reln->md_seg_no[forknum];
 
-	while (v != NULL)
+	while (i > 0)
 	{
+		MdfdVec    *v = &reln->md_seg_fds[forknum][i];
+
 		if (FileSync(v->mdfd_vfd) < 0)
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
 							FilePathName(v->mdfd_vfd))));
-		v = v->mdfd_chain;
+		i--;
 	}
 }
 
@@ -1645,10 +1638,29 @@ ForgetDatabaseFsyncRequests(Oid dbid)
 /*
  *	_fdvec_alloc() -- Make a MdfdVec object.
  */
-static MdfdVec *
-_fdvec_alloc(void)
+static void
+_fdvec_alloc(SMgrRelation reln,
+			 ForkNumber forknum,
+			 size_t nseg)
 {
-	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
+	if (nseg == 0)
+	{
+		if (reln->md_seg_no[forknum] > 0)
+			pfree(reln->md_seg_fds[forknum]);
+	}
+	else if (reln->md_seg_no[forknum] == 0)
+	{
+		reln->md_seg_fds[forknum] =
+			MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
+	}
+	else
+	{
+		reln->md_seg_fds[forknum] =
+			repalloc(reln->md_seg_fds[forknum],
+					 sizeof(MdfdVec) * nseg);
+	}
+
+	reln->md_seg_no[forknum] = nseg;
 }
 
 /*
@@ -1696,13 +1708,14 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	if (fd < 0)
 		return NULL;
 
-	/* allocate an mdfdvec entry for it */
-	v = _fdvec_alloc();
+	if (reln->md_seg_no[forknum] < segno + 1)
+		_fdvec_alloc(reln, forknum, segno + 1);
 
 	/* fill the entry */
+	v = &reln->md_seg_fds[forknum][segno];
 	v->mdfd_vfd = fd;
 	v->mdfd_segno = segno;
-	v->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 
 	/* all done */
@@ -1721,65 +1734,69 @@ static MdfdVec *
 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 bool skipFsync, ExtensionBehavior behavior)
 {
-	MdfdVec    *v = mdopen(reln, forknum, behavior);
+	MdfdVec    *v = NULL;
 	BlockNumber targetseg;
 	BlockNumber nextsegno;
 
-	if (!v)
-		return NULL;			/* only possible if EXTENSION_RETURN_NULL */
-
 	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
-	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
+	nextsegno = targetseg;
+
+	/* perhaps it's already open */
+	if (reln->md_seg_no[forknum] >= targetseg)
 	{
-		Assert(nextsegno == v->mdfd_segno + 1);
+		v =_mdfd_openseg(reln, forknum, targetseg, 0);
+		goto done;
+	}
+
+	/* check the size of the relation, this opens all segments */
+	mdnblocks(reln, forknum);
 
-		if (v->mdfd_chain == NULL)
+	/* check whether it's an existing segment */
+	if (reln->md_seg_no[forknum] >= targetseg)
+	{
+		v = _mdfd_openseg(reln, forknum, targetseg, 0);
+		goto done;
+	}
+
+	if (behavior != EXTENSION_CREATE && !InRecovery)
+		goto done;
+
+
+	if (reln->md_seg_no[forknum] > 0)
+		v = &reln->md_seg_fds[forknum][reln->md_seg_no[forknum] - 1];
+
+	/*
+	 * We only get here if we want to extend the filenode to have one more
+	 * relation. For that we first need to extend the last existing segment to
+	 * be of the full size if necessary, and then add the new segment(s).
+	 */
+	nextsegno = reln->md_seg_no[forknum] + 1;
+	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
+	{
+		if (v != NULL && _mdnblocks(reln, forknum, v) < RELSEG_SIZE)
 		{
-			/*
-			 * Normally we will create new segments only if authorized by the
-			 * caller (i.e., we are doing mdextend()).  But when doing WAL
-			 * recovery, create segments anyway; this allows cases such as
-			 * replaying WAL data that has a write into a high-numbered
-			 * segment of a relation that was later deleted.  We want to go
-			 * ahead and create the segments so we can finish out the replay.
-			 *
-			 * We have to maintain the invariant that segments before the last
-			 * active segment are of size RELSEG_SIZE; therefore, pad them out
-			 * with zeroes if needed.  (This only matters if caller is
-			 * extending the relation discontiguously, but that can happen in
-			 * hash indexes.)
-			 */
-			if (behavior == EXTENSION_CREATE || InRecovery)
-			{
-				if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
-				{
-					char	   *zerobuf = palloc0(BLCKSZ);
+			char	   *zerobuf = palloc0(BLCKSZ);
 
-					mdextend(reln, forknum,
-							 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
-							 zerobuf, skipFsync);
-					pfree(zerobuf);
-				}
-				v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
-			}
-			else
-			{
-				/* We won't create segment if not existent */
-				v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
-			}
-			if (v->mdfd_chain == NULL)
-			{
-				if (behavior == EXTENSION_RETURN_NULL &&
-					FILE_POSSIBLY_DELETED(errno))
-					return NULL;
-				ereport(ERROR,
-						(errcode_for_file_access(),
-				   errmsg("could not open file \"%s\" (target block %u): %m",
-						  _mdfd_segpath(reln, forknum, nextsegno),
-						  blkno)));
-			}
+			mdextend(reln, forknum,
+					 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+					 zerobuf, skipFsync);
+			pfree(zerobuf);
 		}
-		v = v->mdfd_chain;
+
+		v = _mdfd_openseg(reln, forknum, nextsegno, O_CREAT);
+	}
+
+done:
+	if (v == NULL && behavior == EXTENSION_RETURN_NULL &&
+		FILE_POSSIBLY_DELETED(errno))
+		return NULL;
+	else if (v == NULL)
+	{
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\" (target block %u): %m",
+						_mdfd_segpath(reln, forknum, nextsegno),
+						blkno)));
 	}
 	return v;
 }
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index d16f559..43b6bae 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -173,7 +173,7 @@ smgropen(RelFileNode rnode, BackendId backend)
 
 		/* mark it not open */
 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-			reln->md_fd[forknum] = NULL;
+			reln->md_seg_no[forknum] = 0;
 
 		/* it has no owner yet */
 		add_to_unowned_list(reln);
@@ -378,7 +378,7 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 	 * Exit quickly in WAL replay mode if we've already opened the file. If
 	 * it's open, it surely must exist.
 	 */
-	if (isRedo && reln->md_fd[forknum] != NULL)
+	if (isRedo && reln->md_seg_no[forknum] > 0)
 		return;
 
 	/*
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index ba7c909..e6758c6 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -64,8 +64,8 @@ typedef struct SMgrRelationData
 	 */
 	int			smgr_which;		/* storage manager selector */
 
-	/* for md.c; NULL for forks that are not open */
-	struct _MdfdVec *md_fd[MAX_FORKNUM + 1];
+	size_t		md_seg_no[MAX_FORKNUM + 1];
+	struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
 
 	/* if unowned, list link in list of all unowned SMgrRelations */
 	struct SMgrRelationData *next_unowned_reln;
-- 
2.0.0.rc2.4.g1dc51c6.dirty

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to