diff -cpr pgsql-orig/src/backend/access/nbtree/README pgsql/src/backend/access/nbtree/README
*** pgsql-orig/src/backend/access/nbtree/README	Thu Jul 13 09:36:05 2006
--- pgsql/src/backend/access/nbtree/README	Thu Jul 13 10:47:57 2006
*************** item doesn't fit on the split page where
*** 128,146 ****
  The deletion algorithm
  ----------------------
  
! Before deleting a leaf item, we get a super-exclusive lock on the target
! page, so that no other backend has a pin on the page when the deletion
! starts.  This is not necessary for correctness in terms of the btree index
! operations themselves; as explained above, index scans logically stop
! "between" pages and so can't lose their place.  The reason we do it is to
! provide an interlock between non-full VACUUM and indexscans.  Since VACUUM
! deletes index entries before deleting tuples, the super-exclusive lock
! guarantees that VACUUM can't delete any heap tuple that an indexscanning
! process might be about to visit.  (This guarantee works only for simple
! indexscans that visit the heap in sync with the index scan, not for bitmap
! scans.  We only need the guarantee when using non-MVCC snapshot rules such
! as SnapshotNow, so in practice this is only important for system catalog
! accesses.)
  
  Because a page can be split even while someone holds a pin on it, it is
  possible that an indexscan will return items that are no longer stored on
--- 128,148 ----
  The deletion algorithm
  ----------------------
  
! Before deleting a leaf item and the corresponding heap tuple, we get a
! super-exclusive lock on the target page, so that no other backend has a
! pin on the page when the deletion starts.  This is not necessary for
! correctness in terms of the btree index operations themselves; as explained
! above, index scans logically stop "between" pages and so can't lose their
! place. The reason we do it is to provide an interlock between non-full
! VACUUM and indexscans.  Since VACUUM deletes index entries before deleting
! tuples, the super-exclusive lock guarantees that VACUUM can't delete any
! heap tuple that an indexscanning process might be about to visit.  (This
! guarantee works only for simple indexscans that visit the heap in sync with
! the index scan, not for bitmap scans.  We only need the guarantee when
! using non-MVCC snapshot rules such as SnapshotNow, so in practice this is
! only important for system catalog accesses.)  When we sweep only a single
! leaf page and delete no heap tuples, a plain exclusive lock on that page
! suffices.  Such a leaf-only sweep is done just before a leaf page is split.
  
  Because a page can be split even while someone holds a pin on it, it is
  possible that an indexscan will return items that are no longer stored on
diff -cpr pgsql-orig/src/backend/access/nbtree/nbtinsert.c pgsql/src/backend/access/nbtree/nbtinsert.c
*** pgsql-orig/src/backend/access/nbtree/nbtinsert.c	Thu Jul 13 09:36:05 2006
--- pgsql/src/backend/access/nbtree/nbtinsert.c	Thu Jul 13 09:53:58 2006
*************** static void _bt_pgaddtup(Relation rel, P
*** 64,69 ****
--- 64,70 ----
  			 OffsetNumber itup_off, const char *where);
  static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
  			int keysz, ScanKey scankey);
+ static void _bt_vacuumpage(Relation rel, Buffer buffer);
  
  
  /*
*************** _bt_check_unique(Relation rel, IndexTupl
*** 261,266 ****
--- 262,268 ----
  												 hbuffer) == HEAPTUPLE_DEAD)
  					{
  						curitemid->lp_flags |= LP_DELETE;
+ 						opaque->btpo_flags |= BTP_HAS_GARBAGE;
  						if (nbuf != InvalidBuffer)
  							SetBufferCommitInfoNeedsSave(nbuf);
  						else
*************** _bt_insertonpg(Relation rel,
*** 423,433 ****
  		 */
  		bool		movedright = false;
  
! 		while (PageGetFreeSpace(page) < itemsz &&
! 			   !P_RIGHTMOST(lpageop) &&
! 			   _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
! 			   random() > (MAX_RANDOM_VALUE / 100))
  		{
  			/*
  			 * step right to next non-dead page
  			 *
--- 425,449 ----
  		 */
  		bool		movedright = false;
  
! 		while (PageGetFreeSpace(page) < itemsz)
  		{
+ 			Buffer		rbuf;
+ 
+ 			/*
+ 			 * Remove items already marked LP_DELETE, to try to make enough room.
+ 			 */
+ 			if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
+ 			{
+ 				_bt_vacuumpage(rel, buf);
+ 				if (PageGetFreeSpace(page) >= itemsz)
+ 					break; /* Ok, now we have enough space. */
+ 			}
+ 
+ 			if (P_RIGHTMOST(lpageop) ||
+ 				_bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
+ 				random() <= (MAX_RANDOM_VALUE / 100))
+ 				break;
+ 
  			/*
  			 * step right to next non-dead page
  			 *
*************** _bt_insertonpg(Relation rel,
*** 437,443 ****
  			 * pages won't do because we don't know when they will get
  			 * de-linked from the tree.
  			 */
! 			Buffer		rbuf = InvalidBuffer;
  
  			for (;;)
  			{
--- 453,459 ----
  			 * pages won't do because we don't know when they will get
  			 * de-linked from the tree.
  			 */
! 			rbuf = InvalidBuffer;
  
  			for (;;)
  			{
*************** _bt_isequal(TupleDesc itupdesc, Page pag
*** 1649,1652 ****
--- 1665,1704 ----
  
  	/* if we get here, the keys are equal */
  	return true;
+ }
+ 
+ /*
+  * _bt_vacuumpage --- Vacuum just one page.
+  *
+  * Simplified version of btvacuumpage().  It removes LP_DELETE'd items
+  * from the given page only.  The passed buffer must be exclusive-locked.
+  */
+ static void
+ _bt_vacuumpage(Relation rel, Buffer buffer)
+ {
+ 	OffsetNumber	deletable[MaxOffsetNumber];
+ 	int				ndeletable = 0;
+ 	OffsetNumber	offnum,
+ 					minoff,
+ 					maxoff;
+ 	Page			page = BufferGetPage(buffer);
+ 	BTPageOpaque	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ 
+ 	/*
+ 	 * Scan over all items to see which ones need to be deleted
+ 	 * according to LP_DELETE flags.
+ 	 */
+ 	minoff = P_FIRSTDATAKEY(opaque);
+ 	maxoff = PageGetMaxOffsetNumber(page);
+ 	for (offnum = minoff;
+ 		 offnum <= maxoff;
+ 		 offnum = OffsetNumberNext(offnum))
+ 	{
+ 		ItemId itemId = PageGetItemId(page, offnum);
+ 		if (ItemIdDeleted(itemId))
+ 			deletable[ndeletable++] = offnum;
+ 	}
+ 
+ 	if (ndeletable > 0)
+ 		_bt_delitems(rel, buffer, deletable, ndeletable);
  }
diff -cpr pgsql-orig/src/backend/access/nbtree/nbtpage.c pgsql/src/backend/access/nbtree/nbtpage.c
*** pgsql-orig/src/backend/access/nbtree/nbtpage.c	Thu Jul 13 09:36:05 2006
--- pgsql/src/backend/access/nbtree/nbtpage.c	Thu Jul 13 09:45:19 2006
*************** _bt_delitems(Relation rel, Buffer buf,
*** 666,671 ****
--- 666,672 ----
  	 */
  	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
  	opaque->btpo_cycleid = 0;
+ 	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
  
  	MarkBufferDirty(buf);
  
diff -cpr pgsql-orig/src/include/access/nbtree.h pgsql/src/include/access/nbtree.h
*** pgsql-orig/src/include/access/nbtree.h	Thu Jul 13 09:36:09 2006
--- pgsql/src/include/access/nbtree.h	Thu Jul 13 09:45:19 2006
*************** typedef BTPageOpaqueData *BTPageOpaque;
*** 71,76 ****
--- 71,77 ----
  #define BTP_META		(1 << 3)	/* meta-page */
  #define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
  #define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
+ #define BTP_HAS_GARBAGE	(1 << 6)	/* page has dead tuples */
  
  
  /*
*************** typedef struct BTMetaPageData
*** 163,168 ****
--- 164,170 ----
  #define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
  #define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
  #define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
+ #define P_HAS_GARBAGE(opaque)	((opaque)->btpo_flags & BTP_HAS_GARBAGE)
  
  /*
   *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
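
For reference, a compact sketch (not part of the patch text) of how the new
flag flows through the functions touched above:

	/* _bt_check_unique(): a dead entry is marked and the leaf page flagged */
	curitemid->lp_flags |= LP_DELETE;
	opaque->btpo_flags |= BTP_HAS_GARBAGE;

	/* _bt_insertonpg(): before moving right or splitting, reclaim space */
	if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
		_bt_vacuumpage(rel, buf);

	/* _bt_delitems(): once the marked items are removed, clear the hint */
	opaque->btpo_flags &= ~BTP_HAS_GARBAGE;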
