In recovery, with full_page_writes=on, we read in each page only to overwrite its contents with a full page image. That's a waste of time, and can have a surprisingly large effect on recovery time.

As a quick test on my laptop, I initialized a DBT-2 test with 5 warehouses, and let it run for 2 minutes without think-times to generate some WAL. Then I did a "kill -9 postmaster", and took a copy of the data directory to use for testing recovery.


With CVS HEAD, the recovery took ~2 minutes. With the attached patch, it took 5 seconds. (Yes, I used the same not-yet-recovered data directory in both tests, and cleared the OS cache with "echo 1 > /proc/sys/vm/drop_caches".)

I was surprised how big a difference it makes, but when you think about it, it's logical. Without the patch, recovery does roughly the same I/O as the test itself: reading in pages, modifying them, and writing them back. With the patch, all the reads are done sequentially from the WAL, and the pages are then written back in a batch at the end of the WAL replay, which is a lot more efficient.

It's interesting that (with the patch) full_page_writes can *shorten* your recovery time. I've always thought it to have a purely negative effect on performance.

I'll leave it up to the jury whether this tiny little change is appropriate after feature freeze...

While working on this, this comment in ReadBuffer caught my eye:

        /*
         * During WAL recovery, the first access to any data page should
         * overwrite the whole page from the WAL; so a clobbered page
         * header is not reason to fail.  Hence, when InRecovery we may
         * always act as though zero_damaged_pages is ON.
         */
        if (zero_damaged_pages || InRecovery)
        {

But that assumption only holds if full_page_writes is enabled, right? I changed that in the attached patch as well; even if the rest of the patch isn't accepted, I think that part should still be applied.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/transam/xlogutils.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlogutils.c,v
retrieving revision 1.49
diff -c -r1.49 xlogutils.c
*** src/backend/access/transam/xlogutils.c	5 Jan 2007 22:19:24 -0000	1.49
--- src/backend/access/transam/xlogutils.c	25 Apr 2007 11:40:09 -0000
***************
*** 226,232 ****
  	if (blkno < lastblock)
  	{
  		/* page exists in file */
! 		buffer = ReadBuffer(reln, blkno);
  	}
  	else
  	{
--- 226,235 ----
  	if (blkno < lastblock)
  	{
  		/* page exists in file */
! 		if(init)
! 			buffer = ZapBuffer(reln, blkno);
! 		else
! 			buffer = ReadBuffer(reln, blkno);
  	}
  	else
  	{
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.216
diff -c -r1.216 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	30 Mar 2007 18:34:55 -0000	1.216
--- src/backend/storage/buffer/bufmgr.c	25 Apr 2007 11:44:27 -0000
***************
*** 97,102 ****
--- 97,103 ----
  static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty,
  				  int set_flag_bits);
  static void buffer_write_error_callback(void *arg);
+ static Buffer ReadBuffer_common(Relation reln, BlockNumber blockNum, bool alloc_only);
  static volatile BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
  			bool *foundPtr);
  static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
***************
*** 121,126 ****
--- 122,148 ----
  Buffer
  ReadBuffer(Relation reln, BlockNumber blockNum)
  {
+ 	return ReadBuffer_common(reln, blockNum, false);
+ }
+ 
+ /*
+  * ZapBuffer -- like ReadBuffer, but doesn't read the contents of the page
+  *		from disk. The caller is expected to completely rewrite the page,
+  *		regardless of the current contents. This should only be used in
+  *		recovery where there's no concurrent readers that might see the
+  *		contents of the page before the caller rewrites it.
+  */
+ Buffer
+ ZapBuffer(Relation reln, BlockNumber blockNum)
+ {
+ 	Assert(InRecovery);
+ 
+ 	return ReadBuffer_common(reln, blockNum, true);
+ }
+ 
+ static Buffer
+ ReadBuffer_common(Relation reln, BlockNumber blockNum, bool alloc_only)
+ {
  	volatile BufferDesc *bufHdr;
  	Block		bufBlock;
  	bool		found;
***************
*** 253,269 ****
  	}
  	else
  	{
  		smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
  		/* check for garbage data */
  		if (!PageHeaderIsValid((PageHeader) bufBlock))
  		{
  			/*
! 			 * During WAL recovery, the first access to any data page should
! 			 * overwrite the whole page from the WAL; so a clobbered page
! 			 * header is not reason to fail.  Hence, when InRecovery we may
! 			 * always act as though zero_damaged_pages is ON.
  			 */
! 			if (zero_damaged_pages || InRecovery)
  			{
  				ereport(WARNING,
  						(errcode(ERRCODE_DATA_CORRUPTED),
--- 275,293 ----
  	}
  	else
  	{
+ 		if(!alloc_only)
+ 		{
  		smgrread(reln->rd_smgr, blockNum, (char *) bufBlock);
  		/* check for garbage data */
  		if (!PageHeaderIsValid((PageHeader) bufBlock))
  		{
  			/*
! 			 * When full_page_writes is enabled, the first access to any data page should
! 			 * overwrite the whole page from the WAL during recovery; so a clobbered page
! 			 * header is not reason to fail.  Hence, we may
! 			 * act as though zero_damaged_pages is ON.
  			 */
! 			if (zero_damaged_pages || (InRecovery && fullPageWrites))
  			{
  				ereport(WARNING,
  						(errcode(ERRCODE_DATA_CORRUPTED),
***************
*** 277,282 ****
--- 301,307 ----
  				 errmsg("invalid page header in block %u of relation \"%s\"",
  						blockNum, RelationGetRelationName(reln))));
  		}
+ 		}
  	}
  
  	if (isLocalBuf)
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.384
diff -c -r1.384 guc.c
*** src/backend/utils/misc/guc.c	12 Apr 2007 06:53:47 -0000	1.384
--- src/backend/utils/misc/guc.c	25 Apr 2007 11:19:52 -0000
***************
*** 103,109 ****
  extern int	CommitDelay;
  extern int	CommitSiblings;
  extern char *default_tablespace;
- extern bool fullPageWrites;
  
  #ifdef TRACE_SORT
  extern bool trace_sort;
--- 103,108 ----
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/xlog.h,v
retrieving revision 1.76
diff -c -r1.76 xlog.h
*** src/include/access/xlog.h	5 Jan 2007 22:19:51 -0000	1.76
--- src/include/access/xlog.h	25 Apr 2007 11:19:46 -0000
***************
*** 142,147 ****
--- 142,148 ----
  extern int	XLogArchiveTimeout;
  extern char *XLOG_sync_method;
  extern const char XLOG_sync_method_default[];
+ extern bool fullPageWrites;
  
  #define XLogArchivingActive()	(XLogArchiveCommand[0] != '\0')
  
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.102
diff -c -r1.102 bufmgr.h
*** src/include/storage/bufmgr.h	5 Jan 2007 22:19:57 -0000	1.102
--- src/include/storage/bufmgr.h	25 Apr 2007 11:39:35 -0000
***************
*** 111,116 ****
--- 111,117 ----
   * prototypes for functions in bufmgr.c
   */
  extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+ extern Buffer ZapBuffer(Relation reln, BlockNumber blockNum);
  extern void ReleaseBuffer(Buffer buffer);
  extern void UnlockReleaseBuffer(Buffer buffer);
  extern void MarkBufferDirty(Buffer buffer);
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Reply via email to