On Sat, 2009-05-16 at 23:36 -0400, Jignesh K. Shah wrote:
> Simon Riggs wrote:
> >   
> >>> So we can optimize away the scan through the procarray by doing two "if"
> >>> tests, one outside of the lock, one inside. In normal running, both will
> >>> be optimized away, though in read-only periods we would avoid much work.
> >>>       
> >> How much work would it be to work up a test patch?
> >>     
> >
> > Not much. The most important thing is a place to test it and access to
> > detailed feedback. Let's see if Dimitri does this.
> >
> > There are some other tuning aspects to be got right first also, but
> > those are already known.
> >   
> I would be interested in testing it out. I have been collecting some 
> sysbench read-scalability numbers, and some other numbers that I can 
> cook up with dbt3 and igen, so I have a frame of reference for those 
> numbers. I am sure we can always use some extra performance.

I've added 

        shared_buffer_partitions = 16..256

plus the GetSnapshotData() optimization discussed upthread. I expect
the buffer mapping locks to be the dominant problem.

Together, they should improve read-only scalability at the high end.
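
As background, the optimization from the quoted discussion is the
classic double-checked pattern: a cheap test outside the lock, repeated
authoritatively inside it. Here is a minimal sketch, with hypothetical
names and a pthreads mutex standing in for an LWLock (the patch below
implements the test inside the lock, since GetSnapshotData already
holds ProcArrayLock at that point):

        #include <pthread.h>
        #include <stdbool.h>

        static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
        static volatile bool cache_valid = false;

        void
        get_cached_state(void)
        {
            /* First "if", outside the lock: cheap and racy; a stale
             * "false" merely costs us the lock acquisition below. */
            if (!cache_valid)
            {
                pthread_mutex_lock(&cache_lock);
                /* Second "if", inside the lock: authoritative. */
                if (!cache_valid)
                {
                    /* ... recompute the cached state here ... */
                    cache_valid = true;
                }
                pthread_mutex_unlock(&cache_lock);
            }
            /* ... read the cached state ... */
        }

(Memory-ordering caveats apply on weakly ordered hardware; this only
shows the shape of the idea.)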

Note that this renumbers LWLocks from the current value of
FirstLockMgrLock onwards (FirstLockMgrLock moves from
FirstBufMappingLock + 16 to FirstBufMappingLock + 256), which may skew
DTrace reports that identify locks by number.
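
To try this out: shared_buffer_partitions is PGC_POSTMASTER, so it
requires a server restart, and the assign hook accepts only values
reachable by doubling from 16, i.e. 16, 32, 64, 128 or 256. For
example, in postgresql.conf:

        shared_buffer_partitions = 256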

-- 
 Simon Riggs           www.2ndQuadrant.com
 PostgreSQL Training, Services and Support
Index: src/backend/storage/ipc/procarray.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/ipc/procarray.c,v
retrieving revision 1.49
diff -c -r1.49 procarray.c
*** src/backend/storage/ipc/procarray.c	4 Apr 2009 17:40:36 -0000	1.49
--- src/backend/storage/ipc/procarray.c	17 May 2009 19:46:05 -0000
***************
*** 729,804 ****
  	/* initialize xmin calculation with xmax */
  	globalxmin = xmin = xmax;
  
! 	/*
! 	 * Spin over procArray checking xid, xmin, and subxids.  The goal is to
! 	 * gather all active xids, find the lowest xmin, and try to record
! 	 * subxids.
! 	 */
! 	for (index = 0; index < arrayP->numProcs; index++)
  	{
- 		volatile PGPROC *proc = arrayP->procs[index];
- 		TransactionId xid;
- 
- 		/* Ignore procs running LAZY VACUUM */
- 		if (proc->vacuumFlags & PROC_IN_VACUUM)
- 			continue;
- 
- 		/* Update globalxmin to be the smallest valid xmin */
- 		xid = proc->xmin;		/* fetch just once */
- 		if (TransactionIdIsNormal(xid) &&
- 			TransactionIdPrecedes(xid, globalxmin))
- 			globalxmin = xid;
- 
- 		/* Fetch xid just once - see GetNewTransactionId */
- 		xid = proc->xid;
- 
  		/*
! 		 * If the transaction has been assigned an xid < xmax we add it to the
! 		 * snapshot, and update xmin if necessary.	There's no need to store
! 		 * XIDs >= xmax, since we'll treat them as running anyway.  We don't
! 		 * bother to examine their subxids either.
! 		 *
! 		 * We don't include our own XID (if any) in the snapshot, but we must
! 		 * include it into xmin.
  		 */
! 		if (TransactionIdIsNormal(xid))
  		{
! 			if (TransactionIdFollowsOrEquals(xid, xmax))
  				continue;
- 			if (proc != MyProc)
- 				snapshot->xip[count++] = xid;
- 			if (TransactionIdPrecedes(xid, xmin))
- 				xmin = xid;
- 		}
  
! 		/*
! 		 * Save subtransaction XIDs if possible (if we've already overflowed,
! 		 * there's no point).  Note that the subxact XIDs must be later than
! 		 * their parent, so no need to check them against xmin.  We could
! 		 * filter against xmax, but it seems better not to do that much work
! 		 * while holding the ProcArrayLock.
! 		 *
! 		 * The other backend can add more subxids concurrently, but cannot
! 		 * remove any.	Hence it's important to fetch nxids just once. Should
! 		 * be safe to use memcpy, though.  (We needn't worry about missing any
! 		 * xids added concurrently, because they must postdate xmax.)
! 		 *
! 		 * Again, our own XIDs are not included in the snapshot.
! 		 */
! 		if (subcount >= 0 && proc != MyProc)
! 		{
! 			if (proc->subxids.overflowed)
! 				subcount = -1;	/* overflowed */
! 			else
  			{
! 				int			nxids = proc->subxids.nxids;
  
! 				if (nxids > 0)
  				{
! 					memcpy(snapshot->subxip + subcount,
! 						   (void *) proc->subxids.xids,
! 						   nxids * sizeof(TransactionId));
! 					subcount += nxids;
  				}
  			}
  		}
--- 729,808 ----
  	/* initialize xmin calculation with xmax */
  	globalxmin = xmin = xmax;
  
! 	/* Re-scan only if xmax has advanced or our last snapshot saw running xids */
! 	if (snapshot->xmax != xmax || snapshot->xmin != snapshot->xmax)
  	{
  		/*
! 		 * Spin over procArray checking xid, xmin, and subxids.  The goal is to
! 		 * gather all active xids, find the lowest xmin, and try to record
! 		 * subxids.
  		 */
! 		for (index = 0; index < arrayP->numProcs; index++)
  		{
! 			volatile PGPROC *proc = arrayP->procs[index];
! 			TransactionId xid;
! 
! 			/* Ignore procs running LAZY VACUUM */
! 			if (proc->vacuumFlags & PROC_IN_VACUUM)
  				continue;
  
! 			/* Update globalxmin to be the smallest valid xmin */
! 			xid = proc->xmin;		/* fetch just once */
! 			if (TransactionIdIsNormal(xid) &&
! 				TransactionIdPrecedes(xid, globalxmin))
! 				globalxmin = xid;
! 
! 			/* Fetch xid just once - see GetNewTransactionId */
! 			xid = proc->xid;
! 
! 			/*
! 			 * If the transaction has been assigned an xid < xmax we add it to the
! 			 * snapshot, and update xmin if necessary.	There's no need to store
! 			 * XIDs >= xmax, since we'll treat them as running anyway.  We don't
! 			 * bother to examine their subxids either.
! 			 *
! 			 * We don't include our own XID (if any) in the snapshot, but we must
! 			 * include it into xmin.
! 			 */
! 			if (TransactionIdIsNormal(xid))
  			{
! 				if (TransactionIdFollowsOrEquals(xid, xmax))
! 					continue;
! 				if (proc != MyProc)
! 					snapshot->xip[count++] = xid;
! 				if (TransactionIdPrecedes(xid, xmin))
! 					xmin = xid;
! 			}
  
! 			/*
! 			 * Save subtransaction XIDs if possible (if we've already overflowed,
! 			 * there's no point).  Note that the subxact XIDs must be later than
! 			 * their parent, so no need to check them against xmin.  We could
! 			 * filter against xmax, but it seems better not to do that much work
! 			 * while holding the ProcArrayLock.
! 			 *
! 			 * The other backend can add more subxids concurrently, but cannot
! 			 * remove any.	Hence it's important to fetch nxids just once. Should
! 			 * be safe to use memcpy, though.  (We needn't worry about missing any
! 			 * xids added concurrently, because they must postdate xmax.)
! 			 *
! 			 * Again, our own XIDs are not included in the snapshot.
! 			 */
! 			if (subcount >= 0 && proc != MyProc)
! 			{
! 				if (proc->subxids.overflowed)
! 					subcount = -1;	/* overflowed */
! 				else
  				{
! 					int			nxids = proc->subxids.nxids;
! 
! 					if (nxids > 0)
! 					{
! 						memcpy(snapshot->subxip + subcount,
! 							   (void *) proc->subxids.xids,
! 							   nxids * sizeof(TransactionId));
! 						subcount += nxids;
! 					}
  				}
  			}
  		}
Index: src/backend/storage/buffer/buf_table.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/buf_table.c,v
retrieving revision 1.50
diff -c -r1.50 buf_table.c
*** src/backend/storage/buffer/buf_table.c	1 Jan 2009 17:23:47 -0000	1.50
--- src/backend/storage/buffer/buf_table.c	17 May 2009 20:44:14 -0000
***************
*** 23,28 ****
--- 23,29 ----
  
  #include "storage/bufmgr.h"
  #include "storage/buf_internals.h"
+ #include "storage/lwlock.h"
  
  
  /* entry for buffer lookup hashtable */
***************
*** 60,66 ****
  	info.keysize = sizeof(BufferTag);
  	info.entrysize = sizeof(BufferLookupEnt);
  	info.hash = tag_hash;
! 	info.num_partitions = NUM_BUFFER_PARTITIONS;
  
  	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
  								  size, size,
--- 61,67 ----
  	info.keysize = sizeof(BufferTag);
  	info.entrysize = sizeof(BufferLookupEnt);
  	info.hash = tag_hash;
! 	info.num_partitions = NBufferPartitions;
  
  	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
  								  size, size,
Index: src/backend/storage/buffer/freelist.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/storage/buffer/freelist.c,v
retrieving revision 1.66
diff -c -r1.66 freelist.c
*** src/backend/storage/buffer/freelist.c	1 Jan 2009 17:23:47 -0000	1.66
--- src/backend/storage/buffer/freelist.c	17 May 2009 20:39:29 -0000
***************
*** 284,290 ****
  	Size		size = 0;
  
  	/* size of lookup hash table ... see comment in StrategyInitialize */
! 	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
  
  	/* size of the shared replacement strategy control block */
  	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
--- 284,290 ----
  	Size		size = 0;
  
  	/* size of lookup hash table ... see comment in StrategyInitialize */
! 	size = add_size(size, BufTableShmemSize(NBuffers + NBufferPartitions));
  
  	/* size of the shared replacement strategy control block */
  	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
***************
*** 312,320 ****
  	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
  	 * a new entry before deleting the old.  In principle this could be
  	 * happening in each partition concurrently, so we could need as many as
! 	 * NBuffers + NUM_BUFFER_PARTITIONS entries.
  	 */
! 	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
  
  	/*
  	 * Get or create the shared strategy control block
--- 312,320 ----
  	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
  	 * a new entry before deleting the old.  In principle this could be
  	 * happening in each partition concurrently, so we could need as many as
! 	 * NBuffers + NBufferPartitions entries.
  	 */
! 	InitBufTable(NBuffers + NBufferPartitions);
  
  	/*
  	 * Get or create the shared strategy control block
Index: src/backend/utils/init/globals.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/utils/init/globals.c,v
retrieving revision 1.108
diff -c -r1.108 globals.c
*** src/backend/utils/init/globals.c	5 May 2009 19:59:00 -0000	1.108
--- src/backend/utils/init/globals.c	17 May 2009 20:49:07 -0000
***************
*** 104,109 ****
--- 104,110 ----
   * hook):
   */
  int			NBuffers = 1000;
+ int			NBufferPartitions = 16;
  int			MaxBackends = 100;
  int			MaxConnections = 90;
  
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.504
diff -c -r1.504 guc.c
*** src/backend/utils/misc/guc.c	3 May 2009 20:09:54 -0000	1.504
--- src/backend/utils/misc/guc.c	17 May 2009 21:36:21 -0000
***************
*** 58,63 ****
--- 58,64 ----
  #include "regex/regex.h"
  #include "storage/bufmgr.h"
  #include "storage/fd.h"
+ #include "storage/lwlock.h"
  #include "tcop/tcopprot.h"
  #include "tsearch/ts_cache.h"
  #include "utils/builtins.h"
***************
*** 165,170 ****
--- 166,172 ----
  static const char *show_tcp_keepalives_interval(void);
  static const char *show_tcp_keepalives_count(void);
  static bool assign_maxconnections(int newval, bool doit, GucSource source);
+ static bool assign_shared_buffer_partitions(int newval, bool doit, GucSource source);
  static bool assign_autovacuum_max_workers(int newval, bool doit, GucSource source);
  static bool assign_effective_io_concurrency(int newval, bool doit, GucSource source);
  static const char *assign_pgstat_temp_directory(const char *newval, bool doit, GucSource source);
***************
*** 1361,1366 ****
--- 1363,1378 ----
  	},
  
  	{
+ 		{"shared_buffer_partitions", PGC_POSTMASTER, RESOURCES_MEM,
+ 			gettext_noop("Sets the number of shared buffer partitions used by the server."),
+ 			NULL,
+ 			GUC_NOT_IN_SAMPLE
+ 		},
+ 		&NBufferPartitions,
+ 		16, 16, MAX_NUM_BUFFER_PARTITIONS, assign_shared_buffer_partitions, NULL
+ 	},
+ 
+ 	{
  		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
  			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
  			NULL,
***************
*** 7547,7552 ****
--- 7559,7586 ----
  }
  
  static bool
+ assign_shared_buffer_partitions(int newval, bool doit, GucSource source)
+ {
+ 	int b = 16;
+ 
+ 	for (;;)
+ 	{
+ 		if (b == newval)
+ 			break;
+ 
+ 		b *= 2;
+ 
+ 		if (b > newval || b > MAX_NUM_BUFFER_PARTITIONS)
+ 			return false;
+ 	}
+ 
+ 	if (doit)
+ 		NBufferPartitions = newval;
+ 
+ 	return true;
+ }
+ 
+ static bool
  assign_autovacuum_max_workers(int newval, bool doit, GucSource source)
  {
  	if (newval + MaxConnections > INT_MAX / 4)
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.101
diff -c -r1.101 buf_internals.h
*** src/include/storage/buf_internals.h	12 Jan 2009 05:10:45 -0000	1.101
--- src/include/storage/buf_internals.h	17 May 2009 19:46:05 -0000
***************
*** 99,105 ****
   * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
   */
  #define BufTableHashPartition(hashcode) \
! 	((hashcode) % NUM_BUFFER_PARTITIONS)
  #define BufMappingPartitionLock(hashcode) \
  	((LWLockId) (FirstBufMappingLock + BufTableHashPartition(hashcode)))
  
--- 99,105 ----
   * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
   */
  #define BufTableHashPartition(hashcode) \
! 	((hashcode) % NBufferPartitions)
  #define BufMappingPartitionLock(hashcode) \
  	((LWLockId) (FirstBufMappingLock + BufTableHashPartition(hashcode)))
  
Index: src/include/storage/lwlock.h
===================================================================
RCS file: /home/sriggs/pg/REPOSITORY/pgsql/src/include/storage/lwlock.h,v
retrieving revision 1.42
diff -c -r1.42 lwlock.h
*** src/include/storage/lwlock.h	3 Mar 2009 08:11:24 -0000	1.42
--- src/include/storage/lwlock.h	17 May 2009 20:19:19 -0000
***************
*** 21,30 ****
   */
  
  /* Number of partitions of the shared buffer mapping hashtable */
! #define NUM_BUFFER_PARTITIONS  16
  
  /* Number of partitions the shared lock tables are divided into */
! #define LOG2_NUM_LOCK_PARTITIONS  4
  #define NUM_LOCK_PARTITIONS  (1 << LOG2_NUM_LOCK_PARTITIONS)
  
  /*
--- 21,32 ----
   */
  
  /* Number of partitions of the shared buffer mapping hashtable */
! #define MAX_NUM_BUFFER_PARTITIONS  256
! 
! extern PGDLLIMPORT int NBufferPartitions;
  
  /* Number of partitions the shared lock tables are divided into */
! #define LOG2_NUM_LOCK_PARTITIONS 4
  #define NUM_LOCK_PARTITIONS  (1 << LOG2_NUM_LOCK_PARTITIONS)
  
  /*
***************
*** 69,75 ****
  	SyncScanLock,
  	/* Individual lock IDs end here */
  	FirstBufMappingLock,
! 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
  
  	/* must be last except for MaxDynamicLWLock: */
  	NumFixedLWLocks = FirstLockMgrLock + NUM_LOCK_PARTITIONS,
--- 71,77 ----
  	SyncScanLock,
  	/* Individual lock IDs end here */
  	FirstBufMappingLock,
! 	FirstLockMgrLock = FirstBufMappingLock + MAX_NUM_BUFFER_PARTITIONS,
  
  	/* must be last except for MaxDynamicLWLock: */
  	NumFixedLWLocks = FirstLockMgrLock + NUM_LOCK_PARTITIONS,