Here's a new version; all known issues are now fixed. I'm happy with this patch.
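
For anyone skimming the attached diff, the core interface change is a backend-local access pattern hint. Here's a condensed sketch of the intended call pattern, using the names from the patch (illustrative only, not code to apply):

#include "postgres.h"

#include "storage/block.h"
#include "storage/bufmgr.h"		/* SetAccessPattern(), AP_BULKREAD, AP_NORMAL */
#include "utils/rel.h"

/*
 * Sketch of how a bulk operation uses the new hint (condensed from the
 * heapam.c hunk below; copy.c and vacuum.c follow the same pattern with
 * AP_COPY and AP_VACUUM).  Illustrative only, not a drop-in function.
 */
static Buffer
read_one_block_bulk(Buffer oldbuf, Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	/* Hint the buffer manager that this read is part of a large scan... */
	SetAccessPattern(AP_BULKREAD);

	/* ...so the page is read into the small private buffer ring */
	buf = ReleaseAndReadBuffer(oldbuf, rel, blkno);

	/* Reset to the normal replacement strategy when the bulk access is done */
	SetAccessPattern(AP_NORMAL);

	return buf;
}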

Next, I'll start looking at the latest version of Jeff's synchronized scans patch.

Bruce Momjian wrote:
Great.  Based on this, do you have a patch that is ready to apply?

---------------------------------------------------------------------------

Heikki Linnakangas wrote:
Heikki Linnakangas wrote:
In any case, I'd like to see more test results before we make a decision. I'm running tests with DBT-2 and a seq scan running in the background to see if the cache-spoiling effect shows up. I'm also trying to get hold of some bigger hardware to run on. Running these tests takes some calendar time, but the hard work has already been done. I'm going to start reviewing Jeff's synchronized scans patch now.
Here are the results of the DBT-2 tests:

http://community.enterprisedb.com/seqscan/imola/

In each of these tests, at the end of the rampup period a script is started that issues "SELECT COUNT(*) FROM stock" in a loop, with a 2-minute delay between the end of the previous query and the start of the next one.

The patch makes the seq scans go significantly faster. In the 1 hour test period, the patched tests perform roughly 30-100% more selects than the unpatched tests.

With 100 and 105 warehouses, it also significantly reduces the impact of the seq scan on other queries; response times are lower with the patch. With 120 warehouses the reduction in impact is not as clear, but when you plot the response times it's still there (the plots on the "response time charts" page are useless because they're overwhelmed by the checkpoint spike).

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/heap/heapam.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/heap/heapam.c,v
retrieving revision 1.232
diff -c -r1.232 heapam.c
*** src/backend/access/heap/heapam.c	8 Apr 2007 01:26:27 -0000	1.232
--- src/backend/access/heap/heapam.c	25 May 2007 19:22:33 -0000
***************
*** 83,88 ****
--- 83,96 ----
  	 */
  	scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
  
+ 	/* A scan on a table smaller than shared_buffers is treated like random
+ 	 * access, but bigger scans use the bulk read page replacement policy.
+ 	 */
+ 	if (scan->rs_nblocks > NBuffers)
+ 		scan->rs_accesspattern = AP_BULKREAD;
+ 	else
+ 		scan->rs_accesspattern = AP_NORMAL;
+ 
  	scan->rs_inited = false;
  	scan->rs_ctup.t_data = NULL;
  	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 123,133 ****
--- 131,146 ----
  
  	Assert(page < scan->rs_nblocks);
  
+ 	/* Read the page with the right strategy */
+ 	SetAccessPattern(scan->rs_accesspattern);
+ 
  	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
  										 scan->rs_rd,
  										 page);
  	scan->rs_cblock = page;
  
+ 	SetAccessPattern(AP_NORMAL);
+ 
  	if (!scan->rs_pageatatime)
  		return;
  
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.268
diff -c -r1.268 xlog.c
*** src/backend/access/transam/xlog.c	30 Apr 2007 21:01:52 -0000	1.268
--- src/backend/access/transam/xlog.c	15 May 2007 16:23:30 -0000
***************
*** 1668,1673 ****
--- 1668,1700 ----
  }
  
  /*
+  * Returns true if 'record' hasn't been flushed to disk yet.
+  */
+ bool
+ XLogNeedsFlush(XLogRecPtr record)
+ {
+ 	/* Quick exit if already known flushed */
+ 	if (XLByteLE(record, LogwrtResult.Flush))
+ 		return false;
+ 
+ 	/* read LogwrtResult and update local state */
+ 	{
+ 		/* use volatile pointer to prevent code rearrangement */
+ 		volatile XLogCtlData *xlogctl = XLogCtl;
+ 
+ 		SpinLockAcquire(&xlogctl->info_lck);
+ 		LogwrtResult = xlogctl->LogwrtResult;
+ 		SpinLockRelease(&xlogctl->info_lck);
+ 	}
+ 
+ 	/* check again */
+ 	if (XLByteLE(record, LogwrtResult.Flush))
+ 		return false;
+ 
+ 	return true;
+ }
+ 
+ /*
   * Ensure that all XLOG data through the given position is flushed to disk.
   *
   * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.283
diff -c -r1.283 copy.c
*** src/backend/commands/copy.c	27 Apr 2007 22:05:46 -0000	1.283
--- src/backend/commands/copy.c	15 May 2007 17:05:29 -0000
***************
*** 1876,1881 ****
--- 1876,1888 ----
  	nfields = file_has_oids ? (attr_count + 1) : attr_count;
  	field_strings = (char **) palloc(nfields * sizeof(char *));
  
+ 	/* Use the special COPY buffer replacement strategy if WAL-logging
+ 	 * is enabled. If it's not, the pages we're writing are dirty but
+ 	 * don't need a WAL flush to write out, so the BULKREAD strategy
+ 	 * is more suitable.
+ 	 */
+ 	SetAccessPattern(use_wal ? AP_COPY : AP_BULKREAD);
+ 
  	/* Initialize state variables */
  	cstate->fe_eof = false;
  	cstate->eol_type = EOL_UNKNOWN;
***************
*** 2161,2166 ****
--- 2168,2176 ----
  							cstate->filename)));
  	}
  
+ 	/* Reset buffer replacement strategy */
+ 	SetAccessPattern(AP_NORMAL);
+ 
  	/* 
  	 * If we skipped writing WAL, then we need to sync the heap (but not
  	 * indexes since those use WAL anyway)
Index: src/backend/commands/vacuum.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/vacuum.c,v
retrieving revision 1.350
diff -c -r1.350 vacuum.c
*** src/backend/commands/vacuum.c	16 Apr 2007 18:29:50 -0000	1.350
--- src/backend/commands/vacuum.c	15 May 2007 17:06:18 -0000
***************
*** 421,431 ****
  				 * Tell the buffer replacement strategy that vacuum is causing
  				 * the IO
  				 */
! 				StrategyHintVacuum(true);
  
  				analyze_rel(relid, vacstmt);
  
! 				StrategyHintVacuum(false);
  
  				if (use_own_xacts)
  					CommitTransactionCommand();
--- 421,431 ----
  				 * Tell the buffer replacement strategy that vacuum is causing
  				 * the IO
  				 */
! 				SetAccessPattern(AP_VACUUM);
  
  				analyze_rel(relid, vacstmt);
  
! 				SetAccessPattern(AP_NORMAL);
  
  				if (use_own_xacts)
  					CommitTransactionCommand();
***************
*** 442,448 ****
  		/* Make sure cost accounting is turned off after error */
  		VacuumCostActive = false;
  		/* And reset buffer replacement strategy, too */
! 		StrategyHintVacuum(false);
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
--- 442,448 ----
  		/* Make sure cost accounting is turned off after error */
  		VacuumCostActive = false;
  		/* And reset buffer replacement strategy, too */
! 		SetAccessPattern(AP_NORMAL);
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
***************
*** 1088,1094 ****
  	 * Tell the cache replacement strategy that vacuum is causing all
  	 * following IO
  	 */
! 	StrategyHintVacuum(true);
  
  	/*
  	 * Do the actual work --- either FULL or "lazy" vacuum
--- 1088,1094 ----
  	 * Tell the cache replacement strategy that vacuum is causing all
  	 * following IO
  	 */
! 	SetAccessPattern(AP_VACUUM);
  
  	/*
  	 * Do the actual work --- either FULL or "lazy" vacuum
***************
*** 1098,1104 ****
  	else
  		lazy_vacuum_rel(onerel, vacstmt);
  
! 	StrategyHintVacuum(false);
  
  	/* all done with this class, but hold lock until commit */
  	relation_close(onerel, NoLock);
--- 1098,1104 ----
  	else
  		lazy_vacuum_rel(onerel, vacstmt);
  
! 	SetAccessPattern(AP_NORMAL);
  
  	/* all done with this class, but hold lock until commit */
  	relation_close(onerel, NoLock);
Index: src/backend/storage/buffer/README
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/README,v
retrieving revision 1.11
diff -c -r1.11 README
*** src/backend/storage/buffer/README	23 Jul 2006 03:07:58 -0000	1.11
--- src/backend/storage/buffer/README	16 May 2007 11:43:11 -0000
***************
*** 152,159 ****
  a field to show which backend is doing its I/O).
  
  
! Buffer replacement strategy
! ---------------------------
  
  There is a "free list" of buffers that are prime candidates for replacement.
  In particular, buffers that are completely free (contain no valid page) are
--- 152,159 ----
  a field to show which backend is doing its I/O).
  
  
! Normal buffer replacement strategy
! ----------------------------------
  
  There is a "free list" of buffers that are prime candidates for replacement.
  In particular, buffers that are completely free (contain no valid page) are
***************
*** 199,221 ****
  have to give up and try another buffer.  This however is not a concern
  of the basic select-a-victim-buffer algorithm.)
  
- A special provision is that while running VACUUM, a backend does not
- increment the usage count on buffers it accesses.  In fact, if ReleaseBuffer
- sees that it is dropping the pin count to zero and the usage count is zero,
- then it appends the buffer to the tail of the free list.  (This implies that
- VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
- this shouldn't create much of a contention problem.)  This provision
- encourages VACUUM to work in a relatively small number of buffers rather
- than blowing out the entire buffer cache.  It is reasonable since a page
- that has been touched only by VACUUM is unlikely to be needed again soon.
- 
- Since VACUUM usually requests many pages very fast, the effect of this is that
- it will get back the very buffers it filled and possibly modified on the next
- call and will therefore do its work in a few shared memory buffers, while
- being able to use whatever it finds in the cache already.  This also implies
- that most of the write traffic caused by a VACUUM will be done by the VACUUM
- itself and not pushed off onto other processes.
  
  
  Background writer's processing
  ------------------------------
--- 199,243 ----
  have to give up and try another buffer.  This however is not a concern
  of the basic select-a-victim-buffer algorithm.)
  
  
+ Buffer ring replacement strategy
+ --------------------------------
+ 
+ When running a query that needs to access a large number of pages, like VACUUM,
+ COPY, or a large sequential scan, a different strategy is used.  A page that
+ has been touched only by such a scan is unlikely to be needed again soon, so
+ instead of running the normal clock sweep algorithm and blowing out the entire
+ buffer cache, a small ring of buffers is allocated using the normal clock sweep
+ algorithm and those buffers are reused for the whole scan.  This also implies
+ that most of the write traffic caused by such a statement will be done by the
+ backend itself and not pushed off onto other processes.
+ 
+ The size of the ring used depends on the kind of scan:
+ 
+ For sequential scans, a small 256 KB ring is used. That's small enough to fit
+ in L2 cache, which makes transferring pages from OS cache to shared buffer
+ cache efficient. Even less would often be enough, but the ring must be big
+ enough to accommodate all pages in the scan that are pinned concurrently. 
+ 256 KB should also be enough to leave a small cache trail for other backends to
+ join in a synchronized seq scan. If a buffer is dirtied and LSN set, the buffer
+ is removed from the ring and a replacement buffer is chosen using the normal
+ replacement strategy. In a scan that modifies every page in the scan, like a
+ bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and the
+ ring strategy effectively degrades to the normal strategy.
+ 
+ VACUUM uses a 256 KB ring like sequential scans, but dirty pages are not
+ removed from the ring. WAL is flushed instead to allow reuse of the buffers.
+ Before introducing the buffer ring strategy in 8.3, buffers were put to the
+ freelist, which was effectively a buffer ring of 1 buffer.
+ 
+ COPY behaves like VACUUM, but a much larger ring is used. The ring size is
+ chosen to be twice the WAL segment size. This avoids polluting the buffer cache
+ like the clock sweep would do, and using a ring larger than WAL segment size
+ avoids having to do any extra WAL flushes, since a WAL segment will always be
+ filled, forcing a WAL flush, before looping through the buffer ring and bumping
+ into a buffer that would force a WAL flush. However, for non-WAL-logged COPY
+ operations the smaller 256 KB ring is used because WAL flushes are not needed
+ to write the buffers.
  
  Background writer's processing
  ------------------------------
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.218
diff -c -r1.218 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	2 May 2007 23:34:48 -0000	1.218
--- src/backend/storage/buffer/bufmgr.c	16 May 2007 12:34:10 -0000
***************
*** 419,431 ****
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
  		/*
  		 * Select a victim buffer.	The buffer is returned with its header
  		 * spinlock still held!  Also the BufFreelistLock is still held, since
  		 * it would be bad to hold the spinlock while possibly waking up other
  		 * processes.
  		 */
! 		buf = StrategyGetBuffer();
  
  		Assert(buf->refcount == 0);
  
--- 419,433 ----
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
+ 		bool lock_held;
+ 
  		/*
  		 * Select a victim buffer.	The buffer is returned with its header
  		 * spinlock still held!  Also the BufFreelistLock is still held, since
  		 * it would be bad to hold the spinlock while possibly waking up other
  		 * processes.
  		 */
! 		buf = StrategyGetBuffer(&lock_held);
  
  		Assert(buf->refcount == 0);
  
***************
*** 436,442 ****
  		PinBuffer_Locked(buf);
  
  		/* Now it's safe to release the freelist lock */
! 		LWLockRelease(BufFreelistLock);
  
  		/*
  		 * If the buffer was dirty, try to write it out.  There is a race
--- 438,445 ----
  		PinBuffer_Locked(buf);
  
  		/* Now it's safe to release the freelist lock */
! 		if (lock_held)
! 			LWLockRelease(BufFreelistLock);
  
  		/*
  		 * If the buffer was dirty, try to write it out.  There is a race
***************
*** 464,469 ****
--- 467,489 ----
  			 */
  			if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
  			{
+ 				/* In BULKREAD-mode, check if a WAL flush would be needed to
+ 				 * evict this buffer. If so, ask the replacement strategy if
+ 				 * we should go ahead and do it or choose another victim.
+ 				 */
+ 				if (active_access_pattern == AP_BULKREAD)
+ 				{
+ 					if (XLogNeedsFlush(BufferGetLSN(buf)))
+ 					{
+ 						if (StrategyRejectBuffer(buf))
+ 						{
+ 							LWLockRelease(buf->content_lock);
+ 							UnpinBuffer(buf, true, false);
+ 							continue;
+ 						}
+ 					}
+ 				}
+ 
  				FlushBuffer(buf, NULL);
  				LWLockRelease(buf->content_lock);
  			}
***************
*** 925,932 ****
  	PrivateRefCount[b]--;
  	if (PrivateRefCount[b] == 0)
  	{
- 		bool		immed_free_buffer = false;
- 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
--- 945,950 ----
***************
*** 940,956 ****
  		/* Update buffer usage info, unless this is an internal access */
  		if (normalAccess)
  		{
! 			if (!strategy_hint_vacuum)
  			{
! 				if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 					buf->usage_count++;
  			}
  			else
! 			{
! 				/* VACUUM accesses don't bump usage count, instead... */
! 				if (buf->refcount == 0 && buf->usage_count == 0)
! 					immed_free_buffer = true;
! 			}
  		}
  
  		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
--- 958,975 ----
  		/* Update buffer usage info, unless this is an internal access */
  		if (normalAccess)
  		{
! 			if (active_access_pattern != AP_NORMAL)
  			{
! 				/* We don't want large one-off scans like vacuum to inflate 
! 				 * the usage_count. We do want to set it to 1, though, to keep
! 				 * other backends from hijacking it from the buffer ring.
! 				 */
! 				if (buf->usage_count == 0)
! 					buf->usage_count = 1;
  			}
  			else
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
  		}
  
  		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
***************
*** 965,978 ****
  		}
  		else
  			UnlockBufHdr(buf);
- 
- 		/*
- 		 * If VACUUM is releasing an otherwise-unused buffer, send it to the
- 		 * freelist for near-term reuse.  We put it at the tail so that it
- 		 * won't be used before any invalid buffers that may exist.
- 		 */
- 		if (immed_free_buffer)
- 			StrategyFreeBuffer(buf, false);
  	}
  }
  
--- 984,989 ----
Index: src/backend/storage/buffer/freelist.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/freelist.c,v
retrieving revision 1.58
diff -c -r1.58 freelist.c
*** src/backend/storage/buffer/freelist.c	5 Jan 2007 22:19:37 -0000	1.58
--- src/backend/storage/buffer/freelist.c	25 May 2007 19:25:16 -0000
***************
*** 18,23 ****
--- 18,25 ----
  #include "storage/buf_internals.h"
  #include "storage/bufmgr.h"
  
+ #include "utils/memutils.h"
+ 
  
  /*
   * The shared freelist control information.
***************
*** 39,47 ****
  /* Pointers to shared state */
  static BufferStrategyControl *StrategyControl = NULL;
  
! /* Backend-local state about whether currently vacuuming */
! bool		strategy_hint_vacuum = false;
  
  
  /*
   * StrategyGetBuffer
--- 41,59 ----
  /* Pointers to shared state */
  static BufferStrategyControl *StrategyControl = NULL;
  
! /* Currently active access pattern hint. */
! AccessPattern active_access_pattern = AP_NORMAL;
  
+ /* prototypes for internal functions */
+ static volatile BufferDesc *GetBufferFromRing(void);
+ static void PutBufferToRing(volatile BufferDesc *buf);
+ static void InitRing(void);
+ 
+ /* Did the last buffer returned by StrategyGetBuffer come from the buffer
+  * ring or from freelist/clock sweep? StrategyRejectBuffer needs to know
+  * that, see comments there.
+  */
+ static bool lastBufferCameFromRing = false;
  
  /*
   * StrategyGetBuffer
***************
*** 51,67 ****
   *	the selected buffer must not currently be pinned by anyone.
   *
   *	To ensure that no one else can pin the buffer before we do, we must
!  *	return the buffer with the buffer header spinlock still held.  That
!  *	means that we return with the BufFreelistLock still held, as well;
!  *	the caller must release that lock once the spinlock is dropped.
   */
  volatile BufferDesc *
! StrategyGetBuffer(void)
  {
  	volatile BufferDesc *buf;
  	int			trycounter;
  
  	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
  
  	/*
  	 * Try to get a buffer from the freelist.  Note that the freeNext fields
--- 63,98 ----
   *	the selected buffer must not currently be pinned by anyone.
   *
   *	To ensure that no one else can pin the buffer before we do, we must
!  *	return the buffer with the buffer header spinlock still held.  If
!  *	*lock_held is set at return, we return with the BufFreelistLock still
!  *	held, as well;	the caller must release that lock once the spinlock is
!  *	dropped.
   */
  volatile BufferDesc *
! StrategyGetBuffer(bool *lock_held)
  {
  	volatile BufferDesc *buf;
  	int			trycounter;
  
+ 	/* Get a buffer from the ring if we're doing a bulk scan */
+ 	if (active_access_pattern != AP_NORMAL)
+ 	{
+ 		buf = GetBufferFromRing();
+ 		if (buf != NULL)
+ 		{
+ 			*lock_held = false;
+ 			lastBufferCameFromRing = true;
+ 			return buf;
+ 		}
+ 	}
+ 
+ 	lastBufferCameFromRing = false;
+ 
+ 	/*
+ 	 * If our selected buffer wasn't available, pick another...
+ 	 */
  	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+ 	*lock_held = true;
  
  	/*
  	 * Try to get a buffer from the freelist.  Note that the freeNext fields
***************
*** 86,96 ****
  		 */
  		LockBufHdr(buf);
  		if (buf->refcount == 0 && buf->usage_count == 0)
  			return buf;
  		UnlockBufHdr(buf);
  	}
  
! 	/* Nothing on the freelist, so run the "clock sweep" algorithm */
  	trycounter = NBuffers;
  	for (;;)
  	{
--- 117,131 ----
  		 */
  		LockBufHdr(buf);
  		if (buf->refcount == 0 && buf->usage_count == 0)
+ 		{
+ 			if (active_access_pattern != AP_NORMAL)
+ 				PutBufferToRing(buf);
  			return buf;
+ 		}
  		UnlockBufHdr(buf);
  	}
  
! 	/* Nothing on the freelist, so run the shared "clock sweep" algorithm */
  	trycounter = NBuffers;
  	for (;;)
  	{
***************
*** 105,111 ****
--- 140,150 ----
  		 */
  		LockBufHdr(buf);
  		if (buf->refcount == 0 && buf->usage_count == 0)
+ 		{
+ 			if (active_access_pattern != AP_NORMAL)
+ 				PutBufferToRing(buf);
  			return buf;
+ 		}
  		if (buf->usage_count > 0)
  		{
  			buf->usage_count--;
***************
*** 191,204 ****
  }
  
  /*
!  * StrategyHintVacuum -- tell us whether VACUUM is active
   */
  void
! StrategyHintVacuum(bool vacuum_active)
  {
! 	strategy_hint_vacuum = vacuum_active;
! }
  
  
  /*
   * StrategyShmemSize
--- 230,254 ----
  }
  
  /*
!  * SetAccessPattern -- Sets the active access pattern hint
!  *
!  * Caller is responsible for resetting the hint to AP_NORMAL after the bulk
!  * operation is done. It's ok to switch repeatedly between AP_NORMAL and one of
!  * the other strategies, for example in a query with one large sequential scan
!  * nested loop joined to an index scan. Index tuples should be fetched with the
!  * normal strategy and the pages from the seq scan should be read in with the
!  * AP_BULKREAD strategy. The ring won't be affected by such switching, however
!  *	AP_BULKREAD strategy. The ring won't be affected by such switching; however,
!  *	switching to an access pattern with a different ring size will invalidate the
   */
  void
! SetAccessPattern(AccessPattern new_pattern)
  {
! 	active_access_pattern = new_pattern;
  
+ 	if (active_access_pattern != AP_NORMAL)
+ 		InitRing();
+ }
  
  /*
   * StrategyShmemSize
***************
*** 274,276 ****
--- 324,504 ----
  	else
  		Assert(!init);
  }
+ 
+ /* ----------------------------------------------------------------
+  *				Backend-private buffer ring management
+  * ----------------------------------------------------------------
+  */
+ 
+ /*
+  * Ring sizes for different access patterns. See README for the rationale
+  * of these.
+  */
+ #define BULKREAD_RING_SIZE	(256 * 1024 / BLCKSZ)
+ #define VACUUM_RING_SIZE	(256 * 1024 / BLCKSZ)
+ #define COPY_RING_SIZE		Min(NBuffers / 8, (XLOG_SEG_SIZE / BLCKSZ) * 2)
+ 
+ /*
+  * BufferRing is an array of buffer ids, and RingSize is its size in number of
+  * elements. It's allocated in TopMemoryContext the first time it's needed.
+  */
+ static int *BufferRing = NULL;
+ static int RingSize = 0;
+ 
+ /* Index of the "current" slot in the ring. It's advanced every time a buffer
+  * is handed out from the ring with GetBufferFromRing and it points to the 
+  * last buffer returned from the ring. RingCurSlot + 1 is the next victim
+  * GetBufferFromRing will hand out.
+  */
+ static int RingCurSlot = 0;
+ 
+ /* magic value to mark empty slots in the ring */
+ #define BUF_ID_NOT_SET -1
+ 
+ 
+ /*
+  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
+  *		ring is empty.
+  *
+  * The bufhdr spin lock is held on the returned buffer.
+  */
+ static volatile BufferDesc *
+ GetBufferFromRing(void)
+ {
+ 	volatile BufferDesc *buf;
+ 
+ 	/* ring should be initialized by now */
+ 	Assert(RingSize > 0 && BufferRing != NULL);
+ 
+ 	/* Run private "clock cycle" */
+ 	if (++RingCurSlot >= RingSize)
+ 		RingCurSlot = 0;
+ 
+ 	/*
+ 	 * If that slot hasn't been filled yet, tell the caller to allocate
+ 	 * a new buffer with the normal allocation strategy. He will then
+ 	 * fill this slot by calling PutBufferToRing with the new buffer.
+ 	 */
+ 	if (BufferRing[RingCurSlot] == BUF_ID_NOT_SET)
+ 		return NULL;
+ 
+ 	buf = &BufferDescriptors[BufferRing[RingCurSlot]];
+ 
+ 	/*
+ 	 * If the buffer is pinned we cannot use it under any circumstances.
+ 	 * If usage_count == 0 then the buffer is fair game. 
+ 	 *
+ 	 * We also choose this buffer if usage_count == 1. Strictly, this
+ 	 * might sometimes be the wrong thing to do, but we rely on the high
+ 	 * probability that it was this process that last touched the buffer.
+ 	 * If it wasn't, we'll choose a suboptimal victim, but  it shouldn't
+ 	 * make any difference in the big scheme of things.
+ 	 *
+ 	 */
+ 	LockBufHdr(buf);
+ 	if (buf->refcount == 0 && buf->usage_count <= 1)
+ 		return buf;
+ 	UnlockBufHdr(buf);
+ 
+ 	return NULL;
+ }
+ 
+ /*
+  * PutBufferToRing -- adds a buffer to the buffer ring
+  *
+  * Caller must hold the buffer header spinlock on the buffer.
+  */
+ static void
+ PutBufferToRing(volatile BufferDesc *buf)
+ {
+ 	/* ring should be initialized by now */
+ 	Assert(RingSize > 0 && BufferRing != NULL);
+ 
+ 	if (BufferRing[RingCurSlot] == BUF_ID_NOT_SET)
+ 		BufferRing[RingCurSlot] = buf->buf_id;
+ 
+ }
+ 
+ /*
+  * Initializes a ring buffer with correct size for the currently
+  * active strategy. Does nothing if the ring already has the right size.
+  */
+ static void
+ InitRing(void)
+ {
+ 	int new_size;
+ 	int old_size = RingSize;
+ 	int i;
+ 	MemoryContext oldcxt;
+ 
+ 	/* Determine new size */
+ 
+ 	switch(active_access_pattern)
+ 	{
+ 		case AP_BULKREAD:
+ 			new_size = BULKREAD_RING_SIZE;
+ 			break;
+ 		case AP_COPY:
+ 			new_size = COPY_RING_SIZE;
+ 			break;
+ 		case AP_VACUUM:
+ 			new_size = VACUUM_RING_SIZE;
+ 			break;
+ 		default:
+ 			elog(ERROR, "unexpected buffer cache strategy %d", 
+ 				 active_access_pattern);
+ 			return; /* keep compiler happy */
+ 	}
+ 
+ 	/*
+ 	 * Seq scans set and reset the strategy on every page, so we better exit
+ 	 * quickly if no change in size is needed.
+ 	 */
+ 	if (new_size == old_size)
+ 		return;
+ 
+ 	/* Allocate array */
+ 
+ 	oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+ 
+ 	if (old_size == 0)
+ 	{
+ 		Assert(BufferRing == NULL);
+ 		BufferRing = palloc(new_size * sizeof(int));
+ 	}
+ 	else
+ 		BufferRing = repalloc(BufferRing, new_size * sizeof(int));
+ 
+ 	MemoryContextSwitchTo(oldcxt);
+ 
+ 	for(i = 0; i < new_size; i++)
+ 		BufferRing[i] = BUF_ID_NOT_SET;
+ 
+ 	RingCurSlot = 0;
+ 	RingSize = new_size;
+ }
+ 
+ /*
+  * In AP_BULKREAD mode, the buffer manager calls this function when it turns
+  * out that the buffer handed to it by StrategyGetBuffer needs a WAL flush to
+  * write out. That gives us a second chance to choose another victim.
+  *
+  * Returns true if buffer manager should ask for a new victim, and false
+  * if WAL should be flushed and this buffer used.
+  */
+ bool
+ StrategyRejectBuffer(volatile BufferDesc *buf)
+ {
+ 	Assert(RingSize > 0);
+ 
+ 	/* If the buffer didn't come from the ring, we tell the caller to do the
+ 	 * WAL flush and use the buffer. We don't want to mess with how the clock
+ 	 * sweep works; in the worst case there are no buffers in the buffer cache that
+ 	 * can be reused without a WAL flush, and we'd get into an endless loop
+ 	 * trying.
+ 	 */
+ 	if (lastBufferCameFromRing)
+ 		BufferRing[RingCurSlot] = BUF_ID_NOT_SET;
+ 
+ 	return lastBufferCameFromRing;
+ }
Index: src/include/access/relscan.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/relscan.h,v
retrieving revision 1.52
diff -c -r1.52 relscan.h
*** src/include/access/relscan.h	20 Jan 2007 18:43:35 -0000	1.52
--- src/include/access/relscan.h	15 May 2007 17:01:31 -0000
***************
*** 28,33 ****
--- 28,34 ----
  	ScanKey		rs_key;			/* array of scan key descriptors */
  	BlockNumber rs_nblocks;		/* number of blocks to scan */
  	bool		rs_pageatatime; /* verify visibility page-at-a-time? */
+ 	AccessPattern rs_accesspattern; /* access pattern to use for reads */
  
  	/* scan current state */
  	bool		rs_inited;		/* false = scan not init'd yet */
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/xlog.h,v
retrieving revision 1.76
diff -c -r1.76 xlog.h
*** src/include/access/xlog.h	5 Jan 2007 22:19:51 -0000	1.76
--- src/include/access/xlog.h	14 May 2007 21:22:40 -0000
***************
*** 151,156 ****
--- 151,157 ----
  
  extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
  extern void XLogFlush(XLogRecPtr RecPtr);
+ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
  
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.89
diff -c -r1.89 buf_internals.h
*** src/include/storage/buf_internals.h	5 Jan 2007 22:19:57 -0000	1.89
--- src/include/storage/buf_internals.h	15 May 2007 17:07:59 -0000
***************
*** 16,21 ****
--- 16,22 ----
  #define BUFMGR_INTERNALS_H
  
  #include "storage/buf.h"
+ #include "storage/bufmgr.h"
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/spin.h"
***************
*** 168,174 ****
  extern BufferDesc *LocalBufferDescriptors;
  
  /* in freelist.c */
! extern bool strategy_hint_vacuum;
  
  /* event counters in buf_init.c */
  extern long int ReadBufferCount;
--- 169,175 ----
  extern BufferDesc *LocalBufferDescriptors;
  
  /* in freelist.c */
! extern AccessPattern active_access_pattern;
  
  /* event counters in buf_init.c */
  extern long int ReadBufferCount;
***************
*** 184,195 ****
   */
  
  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(void);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
  extern int	StrategySyncStart(void);
  extern Size StrategyShmemSize(void);
  extern void StrategyInitialize(bool init);
  
  /* buf_table.c */
  extern Size BufTableShmemSize(int size);
  extern void InitBufTable(int size);
--- 185,198 ----
   */
  
  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(bool *lock_held);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
  extern int	StrategySyncStart(void);
  extern Size StrategyShmemSize(void);
  extern void StrategyInitialize(bool init);
  
+ extern bool StrategyRejectBuffer(volatile BufferDesc *buf);
+ 
  /* buf_table.c */
  extern Size BufTableShmemSize(int size);
  extern void InitBufTable(int size);
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.103
diff -c -r1.103 bufmgr.h
*** src/include/storage/bufmgr.h	2 May 2007 23:18:03 -0000	1.103
--- src/include/storage/bufmgr.h	25 May 2007 19:23:37 -0000
***************
*** 49,54 ****
--- 49,67 ----
  #define BUFFER_LOCK_EXCLUSIVE	2
  
  /*
+  * For better cache efficiency, we use different buffer replacement strategies 
+  * for different kinds of access patterns. Use SetAccessPattern to hint the
+  * buffer manager which kind of access we're doing at the moment.
+  */
+ typedef enum AccessPattern
+ {
+ 	AP_NORMAL,		/* Normal random access */
+ 	AP_BULKREAD,	/* Large read-only scan (hint bit updates are ok) */
+ 	AP_COPY,		/* Large updating scan, like COPY with WAL enabled */
+ 	AP_VACUUM,		/* VACUUM */
+ } AccessPattern;
+ 
+ /*
   * These routines are beaten on quite heavily, hence the macroization.
   */
  
***************
*** 157,162 ****
  extern void AtProcExit_LocalBuffers(void);
  
  /* in freelist.c */
! extern void StrategyHintVacuum(bool vacuum_active);
  
  #endif
--- 170,175 ----
  extern void AtProcExit_LocalBuffers(void);
  
  /* in freelist.c */
! extern void SetAccessPattern(AccessPattern new_pattern);
  
  #endif
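
To make the ring behaviour described in the new README section concrete, here's a tiny standalone toy model (plain C, not PostgreSQL code; the slot and page counts are made up). A large scan keeps cycling over the same few slots instead of touching the whole cache:

#include <stdio.h>

#define RING_SLOTS		4		/* stands in for the 256 KB ring */
#define PAGES_IN_SCAN	10		/* stands in for a large seq scan */

int
main(void)
{
	int		ring[RING_SLOTS];
	int		cur = -1;
	int		i;
	int		page;

	/* empty slots, like BUF_ID_NOT_SET in the patch */
	for (i = 0; i < RING_SLOTS; i++)
		ring[i] = -1;

	for (page = 0; page < PAGES_IN_SCAN; page++)
	{
		/* advance the private "clock hand" around the ring */
		cur = (cur + 1) % RING_SLOTS;

		if (ring[cur] == -1)
			printf("page %2d -> fill empty slot %d (victim taken from shared pool)\n",
				   page, cur);
		else
			printf("page %2d -> reuse slot %d (replacing page %d read earlier)\n",
				   page, cur, ring[cur]);

		ring[cur] = page;
	}
	return 0;
}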