Tom Lane wrote:
"Simon Riggs" <[EMAIL PROTECTED]> writes:
As regards the zero_damaged_pages question, I raised that some time ago
but we didn't arrive at an explicit answer. All I would say is we can't
allow invalid pages in the buffer manager at any time, whatever options
we have requested, otherwise other code will fail almost immediately.


Yeah --- the proposed new bufmgr routine should probably explicitly zero
the content of the buffer.  It doesn't really matter in the context of
WAL recovery, since there can't be any concurrent access to the buffer,
but it'd make it safe to use in non-WAL contexts (I think there are
other places where we know we are going to init the page and so a
physical read is a waste of time).

To implement that correctly, I think we'd need to take the content lock to clear the buffer if it's already found in the cache. It doesn't seem right to me for the buffer manager to do that; in the worst case it could lead to deadlocks if that function were ever used while holding another buffer locked.

What we could have is the semantics of "Return a buffer, with either correct contents or completely zeroed out". It would act just like ReadBuffer if the buffer was already in memory, and zero out the page otherwise. Those are somewhat strange semantics to have, but they are simple to implement and work for the use-cases we've been talking about.

Patch implementing that attached. I named the function "ReadOrZeroBuffer".

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/transam/xlogutils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlogutils.c,v
retrieving revision 1.49
diff -c -r1.49 xlogutils.c
*** src/backend/access/transam/xlogutils.c	5 Jan 2007 22:19:24 -0000	1.49
--- src/backend/access/transam/xlogutils.c	27 Apr 2007 10:44:37 -0000
***************
*** 226,232 ****
  	if (blkno < lastblock)
  	{
  		/* page exists in file */
! 		buffer = ReadBuffer(reln, blkno);
  	}
  	else
  	{
--- 226,235 ----
  	if (blkno < lastblock)
  	{
  		/* page exists in file */
! 		if (init)
! 			buffer = ReadOrZeroBuffer(reln, blkno);
! 		else
! 			buffer = ReadBuffer(reln, blkno);
  	}
  	else
  	{
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.216
diff -c -r1.216 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	30 Mar 2007 18:34:55 -0000	1.216
--- src/backend/storage/buffer/bufmgr.c	27 Apr 2007 10:49:08 -0000
***************
*** 17,22 ****
--- 17,26 ----
   *		and pin it so that no one can destroy it while this process
   *		is using it.
   *
+  * ReadOrZeroBuffer() -- like ReadBuffer, but clears out the contents of the 
+  *		page if it's not in cache already. Used in recovery when the page is
+  *		completely overwritten from WAL.
+  *
   * ReleaseBuffer() -- unpin a buffer
   *
   * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
***************
*** 97,102 ****
--- 101,107 ----
  static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
  				  int set_flag_bits);
  static void buffer_write_error_callback(void *arg);
+ static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum, bool alloc_only);
  static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
  			bool *foundPtr);
  static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
***************
*** 121,126 ****
--- 126,152 ----
  Buffer
  ReadBuffer(Relation reln, BlockNumber blockNum)
  {
+ 	return ReadBuffer_common(reln, blockNum, false);
+ }
+ 
+ /*
+  * ReadOrZeroBuffer -- like ReadBuffer, but if the page isn't in buffer
+  *		cache already, it's filled with zeros instead of reading it from
+  *		disk. The caller is expected to rewrite it with real data,
+  *		regardless of the current contents.
+  */
+ Buffer
+ ReadOrZeroBuffer(Relation reln, BlockNumber blockNum)
+ {
+ 	return ReadBuffer_common(reln, blockNum, true);
+ }
+ 
+ /*
+  * ReadBuffer_common -- common logic of ReadBuffer and ReadOrZeroBuffer
+  */
+ static Buffer
+ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool zeroPage)
+ {
  	volatile BufferDesc *bufHdr;
  	Block		bufBlock;
  	bool		found;
***************
*** 253,269 ****
  	}
  	else
  	{
  		smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
  		/* check for garbage data */
  		if (!PageHeaderIsValid((PageHeader) bufBlock))
  		{
  			/*
! 			 * During WAL recovery, the first access to any data page should
! 			 * overwrite the whole page from the WAL; so a clobbered page
! 			 * header is not reason to fail.  Hence, when InRecovery we may
! 			 * always act as though zero_damaged_pages is ON.
  			 */
! 			if (zero_damaged_pages || InRecovery)
  			{
  				ereport(WARNING,
  						(errcode(ERRCODE_DATA_CORRUPTED),
--- 279,304 ----
  	}
  	else
  	{
+ 		/* 
+ 		 * Read in the page, unless the caller intends to overwrite it
+ 		 * and just wants us to allocate a buffer.
+ 		 */
+ 		if (zeroPage)
+ 			MemSet((char *) bufBlock, 0, BLCKSZ);
+ 		else
+ 		{
  		smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
  		/* check for garbage data */
  		if (!PageHeaderIsValid((PageHeader) bufBlock))
  		{
  			/*
! 			 * We used to ignore corrupt pages in WAL recovery, but that
! 			 * was only ever safe if full_page_writes was enabled. Now that
! 			 * the caller sets zeroPage to true if it intends to overwrite
! 			 * the whole page, if we get here the page had better be valid
! 			 * or we'll lose data.
  			 */
! 			if (zero_damaged_pages)
  			{
  				ereport(WARNING,
  						(errcode(ERRCODE_DATA_CORRUPTED),
***************
*** 277,282 ****
--- 312,318 ----
  				 errmsg("invalid page header in block %u of relation \"%s\"",
  						blockNum, RelationGetRelationName(reln))));
  		}
+ 		}
  	}
  
  	if (isLocalBuf)
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.102
diff -c -r1.102 bufmgr.h
*** src/include/storage/bufmgr.h	5 Jan 2007 22:19:57 -0000	1.102
--- src/include/storage/bufmgr.h	27 Apr 2007 10:44:20 -0000
***************
*** 111,116 ****
--- 111,117 ----
   * prototypes for functions in bufmgr.c
   */
  extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+ extern Buffer ReadOrZeroBuffer(Relation reln, BlockNumber blockNum);
  extern void ReleaseBuffer(Buffer buffer);
  extern void UnlockReleaseBuffer(Buffer buffer);
  extern void MarkBufferDirty(Buffer buffer);
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Reply via email to