Hi,

Eeek, the attached patch included a trivial last-minute screwup
(dereferencing bistate unconditionally...). Fixed version attached.

Andres
From fc095897a6f4207d384559a095f80a36cf49648c Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 29 Mar 2015 20:55:32 +0200
Subject: [PATCH] WIP: Saner heap extension.

---
 src/backend/access/heap/hio.c       |  86 ++++----
 src/backend/commands/vacuumlazy.c   |  39 ++--
 src/backend/storage/buffer/bufmgr.c | 377 ++++++++++++++++++++++++++----------
 src/backend/storage/smgr/md.c       |  62 ++++++
 src/backend/storage/smgr/smgr.c     |  20 +-
 src/include/storage/buf_internals.h |   1 +
 src/include/storage/bufmgr.h        |   1 +
 src/include/storage/smgr.h          |   7 +-
 8 files changed, 417 insertions(+), 176 deletions(-)

diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 6db73bf..b47f9fe 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -15,6 +15,8 @@
 
 #include "postgres.h"
 
+#include "miscadmin.h"
+
 #include "access/heapam.h"
 #include "access/hio.h"
 #include "access/htup_details.h"
@@ -237,7 +239,6 @@ RelationGetBufferForTuple(Relation relation, Size len,
 				saveFreeSpace;
 	BlockNumber targetBlock,
 				otherBlock;
-	bool		needLock;
 
 	len = MAXALIGN(len);		/* be conservative */
 
@@ -433,63 +434,50 @@ RelationGetBufferForTuple(Relation relation, Size len,
 	/*
 	 * Have to extend the relation.
 	 *
-	 * We have to use a lock to ensure no one else is extending the rel at the
-	 * same time, else we will both try to initialize the same new page.  We
-	 * can skip locking for new or temp relations, however, since no one else
-	 * could be accessing them.
+	 * To avoid holding the extension lock while searching for a victim
+	 * buffer, as the old code did, we extend the relation here instead of
+	 * relying on bufmgr.c. We still have to hold the extension lock to
+	 * prevent a race between two backends initializing the same page.
 	 */
-	needLock = !RELATION_IS_LOCAL(relation);
-
-	if (needLock)
-		LockRelationForExtension(relation, ExclusiveLock);
+	while (true)
+	{
+		buffer = ExtendRelation(relation, MAIN_FORKNUM, bistate->strategy);
 
-	/*
-	 * XXX This does an lseek - rather expensive - but at the moment it is the
-	 * only way to accurately determine how many blocks are in a relation.  Is
-	 * it worth keeping an accurate file length in shared memory someplace,
-	 * rather than relying on the kernel to do it for us?
-	 */
-	buffer = ReadBufferBI(relation, P_NEW, bistate);
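+		/*
+		 * We can be certain that locking the otherBuffer first is OK, since
+		 * it must have a lower page number.
+		 */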
+		if (otherBuffer != InvalidBuffer)
+			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
 
-	/*
-	 * We can be certain that locking the otherBuffer first is OK, since it
-	 * must have a lower page number.
-	 */
-	if (otherBuffer != InvalidBuffer)
-		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+		/*
+		 * Now acquire lock on the new page.
+		 */
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 
-	/*
-	 * Now acquire lock on the new page.
-	 */
-	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		page = BufferGetPage(buffer);
 
-	/*
-	 * Release the file-extension lock; it's now OK for someone else to extend
-	 * the relation some more.  Note that we cannot release this lock before
-	 * we have buffer lock on the new page, or we risk a race condition
-	 * against vacuumlazy.c --- see comments therein.
-	 */
-	if (needLock)
-		UnlockRelationForExtension(relation, ExclusiveLock);
+		/*
+		 * While unlikely, it's possible that another backend managed to
+		 * initialize the page and use up the free space before we got the
+		 * exclusive lock. That'd require the page to be vacuumed (to be put
+		 * in the free space map) and then be used; possible but fairly
+		 * unlikely in practice. If it happens and there's not enough space,
+		 * just retry.
+		 */
+		if (PageIsNew(page))
+		{
+			PageInit(page, BLCKSZ, 0);
 
-	/*
-	 * We need to initialize the empty new page.  Double-check that it really
-	 * is empty (this should never happen, but if it does we don't want to
-	 * risk wiping out valid data).
-	 */
-	page = BufferGetPage(buffer);
+			Assert(len <= PageGetHeapFreeSpace(page));
+			break;
+		}
+		else if (len <= PageGetHeapFreeSpace(page))
+			break;
 
-	if (!PageIsNew(page))
-		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-			 BufferGetBlockNumber(buffer),
-			 RelationGetRelationName(relation));
+		if (otherBuffer != InvalidBuffer)
+			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
 
-	PageInit(page, BufferGetPageSize(buffer), 0);
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		ReleaseBuffer(buffer);
 
-	if (len > PageGetHeapFreeSpace(page))
-	{
-		/* We should not get here given the test at the top */
-		elog(PANIC, "tuple is too big: size %zu", len);
+		CHECK_FOR_INTERRUPTS();
 	}
 
 	/*
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index a01cfb4..896731c 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -674,35 +674,18 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 			/*
 			 * An all-zeroes page could be left over if a backend extends the
 			 * relation but crashes before initializing the page. Reclaim such
-			 * pages for use.
-			 *
-			 * We have to be careful here because we could be looking at a
-			 * page that someone has just added to the relation and not yet
-			 * been able to initialize (see RelationGetBufferForTuple). To
-			 * protect against that, release the buffer lock, grab the
-			 * relation extension lock momentarily, and re-lock the buffer. If
-			 * the page is still uninitialized by then, it must be left over
-			 * from a crashed backend, and we can initialize it.
-			 *
-			 * We don't really need the relation lock when this is a new or
-			 * temp relation, but it's probably not worth the code space to
-			 * check that, since this surely isn't a critical path.
-			 *
-			 * Note: the comparable code in vacuum.c need not worry because
-			 * it's got exclusive lock on the whole relation.
+			 * pages for use.  It is also possible that we're looking at a
+			 * page that has just been added but not yet initialized (see
+			 * RelationGetBufferForTuple). In that case we just initialize the
+			 * page here. That means the page will end up in the free space
+			 * map a little earlier, but that seems fine.
 			 */
-			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-			LockRelationForExtension(onerel, ExclusiveLock);
-			UnlockRelationForExtension(onerel, ExclusiveLock);
-			LockBufferForCleanup(buf);
-			if (PageIsNew(page))
-			{
-				ereport(WARNING,
-				(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
-						relname, blkno)));
-				PageInit(page, BufferGetPageSize(buf), 0);
-				empty_pages++;
-			}
+			ereport(DEBUG2,
+					(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
+							relname, blkno)));
+			PageInit(page, BufferGetPageSize(buf), 0);
+			empty_pages++;
+
 			freespace = PageGetHeapFreeSpace(page);
 			MarkBufferDirty(buf);
 			UnlockReleaseBuffer(buf);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e4b25587..4613666 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -392,6 +392,7 @@ static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
 				  ForkNumber forkNum, BlockNumber blockNum,
 				  ReadBufferMode mode, BufferAccessStrategy strategy,
 				  bool *hit);
+static volatile BufferDesc *GetVictimBuffer(BufferAccessStrategy strategy, BufFlags *oldFlags);
 static bool PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(volatile BufferDesc *buf);
 static void UnpinBuffer(volatile BufferDesc *buf, bool fixOwner);
@@ -483,6 +484,176 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 #endif   /* USE_PREFETCH */
 }
 
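+/*
+ * ExtendRelation -- extend the given fork of a relation by one block.
+ *
+ * Returns a new, pinned, but not yet locked or initialized buffer for the
+ * newly-added block.
+ */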
+Buffer
+ExtendRelation(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy)
+{
+	BlockNumber	blockno;
+	Buffer		buf_id;
+	volatile BufferDesc *buf;
+	BufFlags	oldFlags;
+	Block		bufBlock;
+	bool		isLocalBuf = RelationUsesLocalBuffers(reln);
+	int			readblocks;
+
+	BufferTag	oldTag;			/* previous identity of selected buffer */
+	uint32		oldHash;		/* hash value for oldTag */
+	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
+
+	BufferTag	newTag;
+	uint32		newHash;
+	LWLock	   *newPartitionLock;
+
+	/* FIXME: This obviously isn't acceptable for integration */
+	if (isLocalBuf)
+	{
+		return ReadBufferExtended(reln, forkNum, P_NEW, RBM_NORMAL, strategy);
+	}
+
+	/* Open it at the smgr level if not already done */
+	RelationOpenSmgr(reln);
+
+	/* Make sure we will have room to remember the buffer pin */
+	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+retry_victim:
+	/* we'll need a clean unassociated victim buffer */
+	while (true)
+	{
+		bool		gotIt = false;
+
+		/*
+		 * Returns a buffer that was unpinned and not dirty at the time of the
+		 * check.
+		 */
+		buf = GetVictimBuffer(strategy, &oldFlags);
+
+		if (oldFlags & BM_TAG_VALID)
+		{
+			oldTag = buf->tag;
+			oldHash = BufTableHashCode(&oldTag);
+			oldPartitionLock = BufMappingPartitionLock(oldHash);
+			LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
+		}
+
+		LockBufHdr(buf);
+
+		/* somebody else might have re-pinned the buffer by now */
+		if (buf->refcount != 1 || (buf->flags & BM_DIRTY))
+		{
+			UnlockBufHdr(buf);
+		}
+		else
+		{
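+			/* Wipe the old state flags; any old mapping is deleted below. */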
+			buf->flags &= ~(BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT | BM_NEW);
+
+			UnlockBufHdr(buf);
+
+			gotIt = true;
+
+			if (oldFlags & BM_TAG_VALID)
+				BufTableDelete(&oldTag, oldHash);
+		}
+
+		if (oldFlags & BM_TAG_VALID)
+			LWLockRelease(oldPartitionLock);
+
+		if (gotIt)
+			break;
+		else
+			UnpinBuffer(buf, true);
+	}
+
+	/*
+	 * At this point we have an empty victim buffer, pinned to prevent it from
+	 * being reused.
+	 */
+
+	/*
+	 * First try the block at the current end of the relation. If a concurrent
+	 * backend has already claimed that block, try the next one.
+	 */
+	blockno = smgrnblocks(reln->rd_smgr, forkNum);
+
+	while (true)
+	{
+		INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node, forkNum, blockno);
+
+		newHash = BufTableHashCode(&newTag);
+		newPartitionLock = BufMappingPartitionLock(newHash);
+		LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+
+		buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
+
+		if (buf_id >= 0)
+		{
+			/* somebody else got this block, try the next one */
+			LWLockRelease(newPartitionLock);
+			blockno++;
+			continue;
+		}
+
+		LockBufHdr(buf);
+
+		buf->tag = newTag;
+		if (reln->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+			buf->flags |= BM_NEW | BM_TAG_VALID | BM_PERMANENT;
+		else
+			buf->flags |= BM_NEW | BM_TAG_VALID;
+		buf->usage_count = 1;
+
+		UnlockBufHdr(buf);
+		LWLockRelease(newPartitionLock);
+
+		break;
+	}
+
+	/*
+	 * By here we have made an entry into the buffer table, but haven't yet
+	 * read or written the page.  We can't just initialize the page: while we
+	 * were busy with the above, another backend could have extended the
+	 * relation, written something, and that block could already be in use
+	 * for something else.
+	 */
+
+	if (!StartBufferIO(buf, true))
+	{
+		/*
+		 * Somebody else is already using this block. Just try another one.
+		 */
+		UnpinBuffer(buf, true);
+		goto retry_victim;
+	}
+
+	/*
+	 * FIXME: if we die here we might have a problem: everyone trying to read
+	 * this block will get a failure. We need to add checks for BM_NEW to
+	 * guard against that. That's not really new with this code, though.
+	 */
+
+	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(buf) : BufHdrGetBlock(buf);
+
+	readblocks = smgrtryread(reln->rd_smgr, forkNum, blockno, bufBlock);
+
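+	/*
+	 * A short (or zero-length) read means the block doesn't yet exist on
+	 * disk, so we are the ones extending the relation: zero-fill the page
+	 * and write it out.  If a full block could be read, another backend
+	 * extended the relation underneath us; the buffer now simply caches
+	 * that existing block, so unpin it and retry from the top.
+	 */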
+	if (readblocks != BLCKSZ)
+	{
+		MemSet((char *) bufBlock, 0, BLCKSZ);
+
+		smgrextend(reln->rd_smgr, forkNum, blockno, (char *) bufBlock, false);
+
+		/* Set BM_VALID, terminate IO, and wake up any waiters */
+		TerminateBufferIO(buf, false, BM_VALID);
+	}
+	else
+	{
+		/* Set BM_VALID, terminate IO, and wake up any waiters */
+		TerminateBufferIO(buf, false, BM_VALID);
+		UnpinBuffer(buf, true);
+
+		goto retry_victim;
+	}
+
+	return BufferDescriptorGetBuffer(buf);
+}
 
 /*
  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
@@ -847,6 +1018,112 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	return BufferDescriptorGetBuffer(bufHdr);
 }
 
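+/*
+ * GetVictimBuffer -- select and pin a victim buffer for reuse.
+ *
+ * Returns a pinned buffer that was unpinned and not dirty at the time of the
+ * check (flushing it first if necessary).  Its pre-existing flags are
+ * returned in *oldFlags; since the buffer may still carry a valid tag and
+ * can be re-pinned or re-dirtied concurrently, callers recheck its state
+ * under the buffer header lock before actually reusing it.
+ */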
+static volatile BufferDesc *
+GetVictimBuffer(BufferAccessStrategy strategy, BufFlags *oldFlags)
+{
+	volatile BufferDesc *buf;
+
+	/*
+	 * Ensure, while the spinlock's not yet held, that there's a free refcount
+	 * entry.
+	 */
+	ReservePrivateRefCountEntry();
+
+retry:
+	/*
+	 * Select a victim buffer.  The buffer is returned with its header
+	 * spinlock still held!
+	 */
+	buf = StrategyGetBuffer(strategy);
+
+	Assert(buf->refcount == 0);
+
+	/* Must copy buffer flags while we still hold the spinlock */
+	*oldFlags = buf->flags;
+
+	/* Pin the buffer and then release the buffer spinlock */
+	PinBuffer_Locked(buf);
+
+	/*
+	 * If the buffer was dirty, try to write it out.  There is a race
+	 * condition here, in that someone might dirty it after we released it
+	 * above, or even while we are writing it out (since our share-lock
+	 * won't prevent hint-bit updates).  We will recheck the dirty bit
+	 * after re-locking the buffer header.
+	 */
+	if (*oldFlags & BM_DIRTY)
+	{
+		/*
+		 * We need a share-lock on the buffer contents to write it out
+		 * (else we might write invalid data, eg because someone else is
+		 * compacting the page contents while we write).  We must use a
+		 * conditional lock acquisition here to avoid deadlock.  Even
+		 * though the buffer was not pinned (and therefore surely not
+		 * locked) when StrategyGetBuffer returned it, someone else could
+		 * have pinned and exclusive-locked it by the time we get here. If
+		 * we try to get the lock unconditionally, we'd block waiting for
+		 * them; if they later block waiting for us, deadlock ensues.
+		 * (This has been observed to happen when two backends are both
+		 * trying to split btree index pages, and the second one just
+		 * happens to be trying to split the page the first one got from
+		 * StrategyGetBuffer.)
+		 */
+		if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+		{
+			/*
+			 * If using a nondefault strategy, and writing the buffer
+			 * would require a WAL flush, let the strategy decide whether
+			 * to go ahead and write/reuse the buffer or to choose another
+			 * victim.  We need lock to inspect the page LSN, so this
+			 * can't be done inside StrategyGetBuffer.
+			 */
+			if (strategy != NULL)
+			{
+				XLogRecPtr	lsn;
+
+				/* Read the LSN while holding buffer header lock */
+				LockBufHdr(buf);
+				lsn = BufferGetLSN(buf);
+				UnlockBufHdr(buf);
+
+				if (XLogNeedsFlush(lsn) &&
+					StrategyRejectBuffer(strategy, buf))
+				{
+					/* Drop lock/pin and loop around for another buffer */
+					LWLockRelease(buf->content_lock);
+					UnpinBuffer(buf, true);
+					goto retry;
+				}
+			}
+
+			/* OK, do the I/O */
+			TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
+										   smgr->smgr_rnode.node.spcNode,
+											smgr->smgr_rnode.node.dbNode,
+										  smgr->smgr_rnode.node.relNode);
+
+			FlushBuffer(buf, NULL);
+			LWLockRelease(buf->content_lock);
+
+			TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
+										   smgr->smgr_rnode.node.spcNode,
+											smgr->smgr_rnode.node.dbNode,
+										  smgr->smgr_rnode.node.relNode);
+		}
+		else
+		{
+			/*
+			 * Someone else has locked the buffer, so give it up and loop
+			 * back to get another one.
+			 */
+			UnpinBuffer(buf, true);
+			goto retry;
+		}
+	}
+
+	return buf;
+}
+
 /*
  * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
  *		buffer.  If no buffer exists already, selects a replacement
@@ -940,102 +1217,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	/* Loop here in case we have to try another victim buffer */
 	for (;;)
 	{
-		/*
-		 * Ensure, while the spinlock's not yet held, that there's a free
-		 * refcount entry.
-		 */
-		ReservePrivateRefCountEntry();
-
-		/*
-		 * Select a victim buffer.  The buffer is returned with its header
-		 * spinlock still held!
-		 */
-		buf = StrategyGetBuffer(strategy);
-
-		Assert(buf->refcount == 0);
-
-		/* Must copy buffer flags while we still hold the spinlock */
-		oldFlags = buf->flags;
-
-		/* Pin the buffer and then release the buffer spinlock */
-		PinBuffer_Locked(buf);
-
-		/*
-		 * If the buffer was dirty, try to write it out.  There is a race
-		 * condition here, in that someone might dirty it after we released it
-		 * above, or even while we are writing it out (since our share-lock
-		 * won't prevent hint-bit updates).  We will recheck the dirty bit
-		 * after re-locking the buffer header.
-		 */
-		if (oldFlags & BM_DIRTY)
-		{
-			/*
-			 * We need a share-lock on the buffer contents to write it out
-			 * (else we might write invalid data, eg because someone else is
-			 * compacting the page contents while we write).  We must use a
-			 * conditional lock acquisition here to avoid deadlock.  Even
-			 * though the buffer was not pinned (and therefore surely not
-			 * locked) when StrategyGetBuffer returned it, someone else could
-			 * have pinned and exclusive-locked it by the time we get here. If
-			 * we try to get the lock unconditionally, we'd block waiting for
-			 * them; if they later block waiting for us, deadlock ensues.
-			 * (This has been observed to happen when two backends are both
-			 * trying to split btree index pages, and the second one just
-			 * happens to be trying to split the page the first one got from
-			 * StrategyGetBuffer.)
-			 */
-			if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
-			{
-				/*
-				 * If using a nondefault strategy, and writing the buffer
-				 * would require a WAL flush, let the strategy decide whether
-				 * to go ahead and write/reuse the buffer or to choose another
-				 * victim.  We need lock to inspect the page LSN, so this
-				 * can't be done inside StrategyGetBuffer.
-				 */
-				if (strategy != NULL)
-				{
-					XLogRecPtr	lsn;
-
-					/* Read the LSN while holding buffer header lock */
-					LockBufHdr(buf);
-					lsn = BufferGetLSN(buf);
-					UnlockBufHdr(buf);
-
-					if (XLogNeedsFlush(lsn) &&
-						StrategyRejectBuffer(strategy, buf))
-					{
-						/* Drop lock/pin and loop around for another buffer */
-						LWLockRelease(buf->content_lock);
-						UnpinBuffer(buf, true);
-						continue;
-					}
-				}
-
-				/* OK, do the I/O */
-				TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
-											   smgr->smgr_rnode.node.spcNode,
-												smgr->smgr_rnode.node.dbNode,
-											  smgr->smgr_rnode.node.relNode);
-
-				FlushBuffer(buf, NULL);
-				LWLockRelease(buf->content_lock);
-
-				TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
-											   smgr->smgr_rnode.node.spcNode,
-												smgr->smgr_rnode.node.dbNode,
-											  smgr->smgr_rnode.node.relNode);
-			}
-			else
-			{
-				/*
-				 * Someone else has locked the buffer, so give it up and loop
-				 * back to get another one.
-				 */
-				UnpinBuffer(buf, true);
-				continue;
-			}
-		}
+		/* returns a nondirty buffer, with potentially valid contents */
+		buf = GetVictimBuffer(strategy, &oldFlags);
 
 		/*
 		 * To change the association of a valid buffer, we'll need to have
@@ -1171,7 +1354,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * 1 so that the buffer can survive one clock-sweep pass.)
 	 */
 	buf->tag = newTag;
-	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
+	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT | BM_NEW);
 	if (relpersistence == RELPERSISTENCE_PERMANENT)
 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
 	else
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 42a43bb..0038c91 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -729,6 +729,68 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	}
 }
 
+
+/*
+ *	mdtryread() -- Try to read the specified block from a relation,
+ *				   returning the number of bytes actually read.
+ */
+int
+mdtryread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+	   char *buffer)
+{
+	off_t		seekpos;
+	int			nbytes;
+	MdfdVec    *v;
+
+	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
+										reln->smgr_rnode.node.spcNode,
+										reln->smgr_rnode.node.dbNode,
+										reln->smgr_rnode.node.relNode,
+										reln->smgr_rnode.backend);
+
+	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_RETURN_NULL);
+
+	/* would need another segment */
+	if (v == NULL)
+		return 0;
+
+	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
+
+	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not seek to block %u in file \"%s\": %m",
+						blocknum, FilePathName(v->mdfd_vfd))));
+
+	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
+
+	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
+									   reln->smgr_rnode.node.spcNode,
+									   reln->smgr_rnode.node.dbNode,
+									   reln->smgr_rnode.node.relNode,
+									   reln->smgr_rnode.backend,
+									   nbytes,
+									   BLCKSZ);
+
+	if (nbytes < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read block %u in file \"%s\": %m",
+						blocknum, FilePathName(v->mdfd_vfd))));
+
+	if (nbytes > 0 && nbytes < BLCKSZ)
+	{
+		ereport(LOG,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
+						blocknum, FilePathName(v->mdfd_vfd),
+						nbytes, BLCKSZ)));
+	}
+
+	return nbytes;
+}
+
 /*
  *	mdwrite() -- Write the supplied block at the appropriate location.
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 244b4ea..f0e9a7b 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -51,6 +51,8 @@ typedef struct f_smgr
 											  BlockNumber blocknum);
 	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
 										  BlockNumber blocknum, char *buffer);
+	int			(*smgr_tryread) (SMgrRelation reln, ForkNumber forknum,
+										  BlockNumber blocknum, char *buffer);
 	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, char *buffer, bool skipFsync);
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
@@ -66,7 +68,7 @@ typedef struct f_smgr
 static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
-		mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
+		mdprefetch, mdread, mdtryread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
 		mdpreckpt, mdsync, mdpostckpt
 	}
 };
@@ -626,6 +628,22 @@ smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
 }
 
+
+/*
+ *	smgrtryread() -- try to read a particular block from a relation into the
+ *				  supplied buffer, returning the number of bytes read.
+ *
+ *		This routine is called from the buffer manager in order to
+ *		instantiate pages in the shared buffer cache.  All storage managers
+ *		return pages in the format that POSTGRES expects.
+ */
+int
+smgrtryread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		 char *buffer)
+{
+	return (*(smgrsw[reln->smgr_which].smgr_tryread)) (reln, forknum, blocknum, buffer);
+}
+
 /*
  *	smgrwrite() -- Write the supplied buffer out.
  *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..5f961af 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -40,6 +40,7 @@
 #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
 #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
 												 * unlogged) */
+#define BM_NEW					(1 << 9)		/* Not guaranteed to exist on disk */
 
 typedef bits16 BufFlags;
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ec0a254..b52591f 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -153,6 +153,7 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 						  ForkNumber forkNum, BlockNumber blockNum,
 						  ReadBufferMode mode, BufferAccessStrategy strategy);
+extern Buffer ExtendRelation(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 69a624f..07a331c 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -94,6 +94,8 @@ extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
 		 BlockNumber blocknum, char *buffer);
+extern int smgrtryread(SMgrRelation reln, ForkNumber forknum,
+		 BlockNumber blocknum, char *buffer);
 extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
 		  BlockNumber blocknum, char *buffer, bool skipFsync);
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
@@ -114,12 +116,15 @@ extern void mdclose(SMgrRelation reln, ForkNumber forknum);
 extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
-extern void mdextend(SMgrRelation reln, ForkNumber forknum,
+extern void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdappend(SMgrRelation reln, ForkNumber forknum,
 		 BlockNumber blocknum, char *buffer, bool skipFsync);
 extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	   char *buffer);
+extern int mdtryread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+	   char *buffer);
 extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
 		BlockNumber blocknum, char *buffer, bool skipFsync);
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
-- 
2.3.0.149.gf3f4077.dirty
