Here's a patch that:

Moves the logic to find a page with enough room from _bt_insertonpg to a new function, _bt_findinsertloc. It makes the code more readable, and simplifies the forthcoming Grouped Index Tuples patch.


Also, the insert location within the page used to be calculated twice for unique indexes, once in _bt_check_unique and a second time in _bt_insertonpg. That's a waste of cycles, and this patch fixes that.


I couldn't measure a difference with pgbench, but this micro-benchmark shows it:

> psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);"
> psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync
> time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;"

Without patch:  real    0m7.260s
With patch:     real    0m6.963s

On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, to remove any other variables.

It's not a huge difference, but it's worth having, and performance wasn't the main motivation of the patch anyway.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/nbtree/nbtinsert.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v
retrieving revision 1.152
diff -c -r1.152 nbtinsert.c
*** src/backend/access/nbtree/nbtinsert.c	21 Feb 2007 20:02:17 -0000	1.152
--- src/backend/access/nbtree/nbtinsert.c	26 Feb 2007 09:37:16 -0000
***************
*** 46,58 ****
  static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
  
  static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
! 				 Relation heapRel, Buffer buf,
  				 ScanKey itup_scankey);
  static void _bt_insertonpg(Relation rel, Buffer buf,
  			   BTStack stack,
- 			   int keysz, ScanKey scankey,
  			   IndexTuple itup,
! 			   OffsetNumber afteritem,
  			   bool split_only_page);
  static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
  		  OffsetNumber newitemoff, Size newitemsz,
--- 46,63 ----
  static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
  
  static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
! 				 Relation heapRel, Buffer buf, OffsetNumber ioffset,
  				 ScanKey itup_scankey);
+ static void _bt_findinsertloc(Relation rel,
+ 				  Buffer *bufptr, 
+ 				  OffsetNumber *offsetptr,
+ 				  int keysz,
+ 				  ScanKey scankey,
+ 				  IndexTuple newtup);
  static void _bt_insertonpg(Relation rel, Buffer buf,
  			   BTStack stack,
  			   IndexTuple itup,
! 			   OffsetNumber newitemoff,
  			   bool split_only_page);
  static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
  		  OffsetNumber newitemoff, Size newitemsz,
***************
*** 86,91 ****
--- 91,97 ----
  	ScanKey		itup_scankey;
  	BTStack		stack;
  	Buffer		buf;
+ 	OffsetNumber offset;
  
  	/* we need an insertion scan key to do our search, so build one */
  	itup_scankey = _bt_mkscankey(rel, itup);
***************
*** 94,99 ****
--- 100,107 ----
  	/* find the first page containing this key */
  	stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
  
+ 	offset = InvalidOffsetNumber;
+ 
  	/* trade in our read lock for a write lock */
  	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  	LockBuffer(buf, BT_WRITE);
***************
*** 128,134 ****
  	{
  		TransactionId xwait;
  
! 		xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey);
  
  		if (TransactionIdIsValid(xwait))
  		{
--- 136,143 ----
  	{
  		TransactionId xwait;
  
! 		offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
! 		xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
  
  		if (TransactionIdIsValid(xwait))
  		{
***************
*** 142,148 ****
  	}
  
  	/* do the insertion */
! 	_bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false);
  
  	/* be tidy */
  	_bt_freestack(stack);
--- 151,158 ----
  	}
  
  	/* do the insertion */
! 	_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
! 	_bt_insertonpg(rel, buf, stack, itup, offset, false);
  
  	/* be tidy */
  	_bt_freestack(stack);
***************
*** 152,169 ****
  /*
   *	_bt_check_unique() -- Check for violation of unique index constraint
   *
   * Returns InvalidTransactionId if there is no conflict, else an xact ID
   * we must wait for to see if it commits a conflicting tuple.	If an actual
   * conflict is detected, no return --- just ereport().
   */
  static TransactionId
  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
! 				 Buffer buf, ScanKey itup_scankey)
  {
  	TupleDesc	itupdesc = RelationGetDescr(rel);
  	int			natts = rel->rd_rel->relnatts;
! 	OffsetNumber offset,
! 				maxoff;
  	Page		page;
  	BTPageOpaque opaque;
  	Buffer		nbuf = InvalidBuffer;
--- 162,182 ----
  /*
   *	_bt_check_unique() -- Check for violation of unique index constraint
   *
+  * offset points to the first possible item that could conflict. It can
+  * also point to end-of-page, which means that the first tuple to check
+  * is the first tuple on the next page.
+  *
   * Returns InvalidTransactionId if there is no conflict, else an xact ID
   * we must wait for to see if it commits a conflicting tuple.	If an actual
   * conflict is detected, no return --- just ereport().
   */
  static TransactionId
  _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
! 				 Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
  {
  	TupleDesc	itupdesc = RelationGetDescr(rel);
  	int			natts = rel->rd_rel->relnatts;
! 	OffsetNumber maxoff;
  	Page		page;
  	BTPageOpaque opaque;
  	Buffer		nbuf = InvalidBuffer;
***************
*** 173,184 ****
  	maxoff = PageGetMaxOffsetNumber(page);
  
  	/*
- 	 * Find first item >= proposed new item.  Note we could also get a pointer
- 	 * to end-of-page here.
- 	 */
- 	offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
- 
- 	/*
  	 * Scan over all equal tuples, looking for live conflicts.
  	 */
  	for (;;)
--- 186,191 ----
***************
*** 342,374 ****
  	return InvalidTransactionId;
  }
  
! /*----------
!  *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
!  *
!  *		This recursive procedure does the following things:
!  *
!  *			+  finds the right place to insert the tuple.
!  *			+  if necessary, splits the target page (making sure that the
!  *			   split is equitable as far as post-insert free space goes).
!  *			+  inserts the tuple.
!  *			+  if the page was split, pops the parent stack, and finds the
!  *			   right place to insert the new child pointer (by walking
!  *			   right using information stored in the parent stack).
!  *			+  invokes itself with the appropriate tuple for the right
!  *			   child page on the parent.
!  *			+  updates the metapage if a true root or fast root is split.
!  *
!  *		On entry, we must have the right buffer in which to do the
!  *		insertion, and the buffer must be pinned and write-locked.	On return,
!  *		we will have dropped both the pin and the lock on the buffer.
!  *
!  *		If 'afteritem' is >0 then the new tuple must be inserted after the
!  *		existing item of that number, noplace else.  If 'afteritem' is 0
!  *		then the procedure finds the exact spot to insert it by searching.
!  *		(keysz and scankey parameters are used ONLY if afteritem == 0.
!  *		The scankey must be an insertion-type scankey.)
   *
!  *		NOTE: if the new key is equal to one or more existing keys, we can
   *		legitimately place it anywhere in the series of equal keys --- in fact,
   *		if the new key is equal to the page's "high key" we can place it on
   *		the next page.	If it is equal to the high key, and there's not room
--- 349,359 ----
  	return InvalidTransactionId;
  }
  
! 
! /*
!  *	_bt_findinsertloc() -- Finds an insert location for a tuple
   *
!  *		If the new key is equal to one or more existing keys, we can
   *		legitimately place it anywhere in the series of equal keys --- in fact,
   *		if the new key is equal to the page's "high key" we can place it on
   *		the next page.	If it is equal to the high key, and there's not room
***************
*** 379,414 ****
   *		Once we have chosen the page to put the key on, we'll insert it before
   *		any existing equal keys because of the way _bt_binsrch() works.
   *
!  *		The locking interactions in this code are critical.  You should
!  *		grok Lehman and Yao's paper before making any changes.  In addition,
!  *		you need to understand how we disambiguate duplicate keys in this
!  *		implementation, in order to be able to find our location using
!  *		L&Y "move right" operations.  Since we may insert duplicate user
!  *		keys, and since these dups may propagate up the tree, we use the
!  *		'afteritem' parameter to position ourselves correctly for the
!  *		insertion on internal pages.
!  *----------
   */
  static void
! _bt_insertonpg(Relation rel,
! 			   Buffer buf,
! 			   BTStack stack,
! 			   int keysz,
! 			   ScanKey scankey,
! 			   IndexTuple itup,
! 			   OffsetNumber afteritem,
! 			   bool split_only_page)
  {
! 	Page		page;
  	BTPageOpaque lpageop;
  	OffsetNumber newitemoff;
! 	OffsetNumber firstright = InvalidOffsetNumber;
! 	Size		itemsz;
  
- 	page = BufferGetPage(buf);
  	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
  
! 	itemsz = IndexTupleDSize(*itup);
  	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
  								 * need to be consistent */
  
--- 364,403 ----
   *		Once we have chosen the page to put the key on, we'll insert it before
   *		any existing equal keys because of the way _bt_binsrch() works.
   *
!  *		If there's not enough room in the space, we try to make room by
!  *		removing any LP_DELETEd tuples.
!  *
!  *		On entry, *buf and *offsetptr point to the first legal position
!  *		where the new tuple could be inserted. The caller should hold an
!  *		exclusive lock on *buf. *offsetptr can also be set to
!  *		InvalidOffsetNumber, in which case the function will search the	right
!  *		location within the page if needed. On exit, they point to the chosen
!  *		insert location. If findinsertloc decided to move right, the lock and
!  *		pin on the original page will be released and the new page returned to
!  *		the caller is exclusively locked instead.
!  *
!  *		newtup is the new tuple we're inserting, and scankey is an insertion
!  *		type scan key for it.
   */
  static void
! _bt_findinsertloc(Relation rel,
! 				  Buffer *bufptr,
! 				  OffsetNumber *offsetptr,
! 				  int keysz,
! 				  ScanKey scankey,
! 				  IndexTuple newtup)
  {
! 	Buffer buf = *bufptr;
! 	Page page = BufferGetPage(buf);
! 	Size itemsz;
  	BTPageOpaque lpageop;
+ 	bool movedright, vacuumed;
  	OffsetNumber newitemoff;
! 	OffsetNumber firstlegaloff = *offsetptr;
  
  	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
  
! 	itemsz = IndexTupleDSize(*newtup);
  	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
  								 * need to be consistent */
  
***************
*** 429,525 ****
  				"Consider a function index of an MD5 hash of the value, "
  				"or use full text indexing.")));
  
- 	/*
- 	 * Determine exactly where new item will go.
- 	 */
- 	if (afteritem > 0)
- 		newitemoff = afteritem + 1;
- 	else
- 	{
- 		/*----------
- 		 * If we will need to split the page to put the item here,
- 		 * check whether we can put the tuple somewhere to the right,
- 		 * instead.  Keep scanning right until we
- 		 *		(a) find a page with enough free space,
- 		 *		(b) reach the last page where the tuple can legally go, or
- 		 *		(c) get tired of searching.
- 		 * (c) is not flippant; it is important because if there are many
- 		 * pages' worth of equal keys, it's better to split one of the early
- 		 * pages than to scan all the way to the end of the run of equal keys
- 		 * on every insert.  We implement "get tired" as a random choice,
- 		 * since stopping after scanning a fixed number of pages wouldn't work
- 		 * well (we'd never reach the right-hand side of previously split
- 		 * pages).	Currently the probability of moving right is set at 0.99,
- 		 * which may seem too high to change the behavior much, but it does an
- 		 * excellent job of preventing O(N^2) behavior with many equal keys.
- 		 *----------
- 		 */
- 		bool		movedright = false;
- 
- 		while (PageGetFreeSpace(page) < itemsz)
- 		{
- 			Buffer		rbuf;
  
- 			/*
- 			 * before considering moving right, see if we can obtain enough
- 			 * space by erasing LP_DELETE items
- 			 */
- 			if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
- 			{
- 				_bt_vacuum_one_page(rel, buf);
- 				if (PageGetFreeSpace(page) >= itemsz)
- 					break;		/* OK, now we have enough space */
- 			}
  
! 			/*
! 			 * nope, so check conditions (b) and (c) enumerated above
! 			 */
! 			if (P_RIGHTMOST(lpageop) ||
! 				_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
! 				random() <= (MAX_RANDOM_VALUE / 100))
! 				break;
  
! 			/*
! 			 * step right to next non-dead page
! 			 *
! 			 * must write-lock that page before releasing write lock on
! 			 * current page; else someone else's _bt_check_unique scan could
! 			 * fail to see our insertion.  write locks on intermediate dead
! 			 * pages won't do because we don't know when they will get
! 			 * de-linked from the tree.
! 			 */
! 			rbuf = InvalidBuffer;
  
! 			for (;;)
! 			{
! 				BlockNumber rblkno = lpageop->btpo_next;
  
! 				rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
! 				page = BufferGetPage(rbuf);
! 				lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
! 				if (!P_IGNORE(lpageop))
! 					break;
! 				if (P_RIGHTMOST(lpageop))
! 					elog(ERROR, "fell off the end of \"%s\"",
! 						 RelationGetRelationName(rel));
! 			}
! 			_bt_relbuf(rel, buf);
! 			buf = rbuf;
! 			movedright = true;
  		}
  
  		/*
! 		 * Now we are on the right page, so find the insert position. If we
! 		 * moved right at all, we know we should insert at the start of the
! 		 * page, else must find the position by searching.
  		 */
! 		if (movedright)
! 			newitemoff = P_FIRSTDATAKEY(lpageop);
! 		else
! 			newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
  	}
  
  	/*
  	 * Do we need to split the page to fit the item on it?
  	 *
  	 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
--- 418,573 ----
  				"Consider a function index of an MD5 hash of the value, "
  				"or use full text indexing.")));
  
  
  
! 	/*----------
! 	 * If we will need to split the page to put the item on this page,
! 	 * check whether we can put the tuple somewhere to the right,
! 	 * instead.  Keep scanning right until we
! 	 *		(a) find a page with enough free space,
! 	 *		(b) reach the last page where the tuple can legally go, or
! 	 *		(c) get tired of searching.
! 	 * (c) is not flippant; it is important because if there are many
! 	 * pages' worth of equal keys, it's better to split one of the early
! 	 * pages than to scan all the way to the end of the run of equal keys
! 	 * on every insert.  We implement "get tired" as a random choice,
! 	 * since stopping after scanning a fixed number of pages wouldn't work
! 	 * well (we'd never reach the right-hand side of previously split
! 	 * pages).	Currently the probability of moving right is set at 0.99,
! 	 * which may seem too high to change the behavior much, but it does an
! 	 * excellent job of preventing O(N^2) behavior with many equal keys.
! 	 *----------
! 	 */
! 	movedright = false;
! 	vacuumed = false;
! 	while (PageGetFreeSpace(page) < itemsz)
! 	{
! 		Buffer		rbuf;
  
! 		/*
! 		 * before considering moving right, see if we can obtain enough
! 		 * space by erasing LP_DELETE items
! 		 */
! 		if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
! 		{
! 			_bt_vacuum_one_page(rel, buf);
  
! 			/* remember that we vacuumed this page, because that makes
! 			 * the hint supplied by the caller invalid */
! 			vacuumed = true;
  
! 			if (PageGetFreeSpace(page) >= itemsz) 
! 				break;		/* OK, now we have enough space */
  		}
  
  		/*
! 		 * nope, so check conditions (b) and (c) enumerated above
  		 */
! 		if (P_RIGHTMOST(lpageop) ||
! 			_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
! 			random() <= (MAX_RANDOM_VALUE / 100))
! 			break;
! 
! 		/*
! 		 * step right to next non-dead page
! 		 *
! 		 * must write-lock that page before releasing write lock on
! 		 * current page; else someone else's _bt_check_unique scan could
! 		 * fail to see our insertion.  write locks on intermediate dead
! 		 * pages won't do because we don't know when they will get
! 		 * de-linked from the tree.
! 		 */
! 		rbuf = InvalidBuffer;
! 
! 		for (;;)
! 		{
! 			BlockNumber rblkno = lpageop->btpo_next;
! 
! 			rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
! 			page = BufferGetPage(rbuf);
! 			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
! 			if (!P_IGNORE(lpageop))
! 				break;
! 			if (P_RIGHTMOST(lpageop))
! 				elog(ERROR, "fell off the end of \"%s\"",
! 					 RelationGetRelationName(rel));
! 		}
! 		_bt_relbuf(rel, buf);
! 		buf = rbuf;
! 		movedright = true;
! 		vacuumed = false;
  	}
  
  	/*
+ 	 * Now we are on the right page, so find the insert position. If we
+ 	 * moved right at all, we know we should insert at the start of the
+ 	 * page. If we didn't move right, we can use the firstlegaloff hint
+ 	 * if the caller supplied one, unless we vacuumed the page which
+ 	 * might have moved tuples around making the hint invalid. If we 
+ 	 * didn't move right or can't use the hint, find the position
+ 	 * by searching.
+ 	 */
+ 	if (movedright)
+ 		newitemoff = P_FIRSTDATAKEY(lpageop);
+ 	else if(firstlegaloff != InvalidOffsetNumber && !vacuumed)
+ 		newitemoff = firstlegaloff;
+ 	else
+ 		newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
+ 
+ 	*bufptr = buf;
+ 	*offsetptr = newitemoff;
+ }
+ 
+ /*----------
+  *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
+  *
+  *		This recursive procedure does the following things:
+  *
+  *			+  if necessary, splits the target page (making sure that the
+  *			   split is equitable as far as post-insert free space goes).
+  *			+  inserts the tuple.
+  *			+  if the page was split, pops the parent stack, and finds the
+  *			   right place to insert the new child pointer (by walking
+  *			   right using information stored in the parent stack).
+  *			+  invokes itself with the appropriate tuple for the right
+  *			   child page on the parent.
+  *			+  updates the metapage if a true root or fast root is split.
+  *
+  *		On entry, we must have the right buffer in which to do the
+  *		insertion, and the buffer must be pinned and write-locked.	On return,
+  *		we will have dropped both the pin and the lock on the buffer.
+  *
+  *		The locking interactions in this code are critical.  You should
+  *		grok Lehman and Yao's paper before making any changes.  In addition,
+  *		you need to understand how we disambiguate duplicate keys in this
+  *		implementation, in order to be able to find our location using
+  *		L&Y "move right" operations.  Since we may insert duplicate user
+  *		keys, and since these dups may propagate up the tree, we use the
+  *		'afteritem' parameter to position ourselves correctly for the
+  *		insertion on internal pages.
+  *----------
+  */
+ static void
+ _bt_insertonpg(Relation rel,
+ 			   Buffer buf,
+ 			   BTStack stack,
+ 			   IndexTuple itup,
+ 			   OffsetNumber newitemoff,
+ 			   bool split_only_page)
+ {
+ 	Page		page;
+ 	BTPageOpaque lpageop;
+ 	OffsetNumber firstright = InvalidOffsetNumber;
+ 	Size		itemsz;
+ 
+ 	page = BufferGetPage(buf);
+ 	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ 
+ 	itemsz = IndexTupleDSize(*itup);
+ 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
+ 								 * need to be consistent */
+ 
+ 	/*
  	 * Do we need to split the page to fit the item on it?
  	 *
  	 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
***************
*** 1427,1433 ****
  
  		/* Recursively update the parent */
  		_bt_insertonpg(rel, pbuf, stack->bts_parent,
! 					   0, NULL, new_item, stack->bts_offset,
  					   is_only);
  
  		/* be tidy */
--- 1475,1481 ----
  
  		/* Recursively update the parent */
  		_bt_insertonpg(rel, pbuf, stack->bts_parent,
! 					   new_item, stack->bts_offset + 1,
  					   is_only);
  
  		/* be tidy */
---------------------------(end of broadcast)---------------------------
TIP 7: You can help support the PostgreSQL project by donating at

                http://www.postgresql.org/about/donate

Reply via email to