Here's a patch that moves the logic to find a page with enough free space
for the new tuple from _bt_insertonpg to a new function,
_bt_findinsertloc. This makes the code more readable, and simplifies the
forthcoming Grouped Index Tuples patch.
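
For reference, the new function's interface looks like this (prototype as
in the patch below; the in/out annotations are mine):

  static void
  _bt_findinsertloc(Relation rel,
                    Buffer *bufptr,          /* in/out: page to insert on */
                    OffsetNumber *offsetptr, /* in: hint or InvalidOffsetNumber,
                                              * out: chosen insert offset */
                    int keysz,
                    ScanKey scankey,         /* insertion-type scan key */
                    IndexTuple newtup);      /* tuple being inserted */

On exit, *bufptr is exclusively locked, and may point to a page to the
right of the one passed in, if the function decided to move right.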
Also, the insert location within the page used to be calculated twice for
unique indexes: once in _bt_check_unique and a second time in
_bt_insertonpg. That's a waste of cycles, and this patch fixes that.
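
In essence, the insertion path in _bt_doinsert becomes (a simplified
excerpt of the patched code; locking, the wait-and-retry on xwait, and
error handling are elided):

  offset = InvalidOffsetNumber;
  if (index_is_unique)
  {
      /* search the page once, and remember the result... */
      offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
      xwait = _bt_check_unique(rel, itup, heapRel, buf, offset,
                               itup_scankey);
      /* (if xwait is valid, wait for it and retry -- elided) */
  }
  /* ...and reuse it as a hint, instead of doing a second binary search */
  _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
  _bt_insertonpg(rel, buf, stack, itup, offset, false);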
I couldn't measure a difference with pgbench, but this micro-benchmark
shows it:
> psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);"
> psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync
> time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest;
INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;"
Without patch: real 0m7.260s
With patch: real 0m6.963s
This was on my laptop, with fsync=off, full_page_writes=off, and
checkpoint_segments = 10, to remove any other variables.
It's not a huge difference (about 4%), but it's worth having, and
performance wasn't the main motivation for the patch anyway.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Index: src/backend/access/nbtree/nbtinsert.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v
retrieving revision 1.152
diff -c -r1.152 nbtinsert.c
*** src/backend/access/nbtree/nbtinsert.c 21 Feb 2007 20:02:17 -0000 1.152
--- src/backend/access/nbtree/nbtinsert.c 26 Feb 2007 09:37:16 -0000
***************
*** 46,58 ****
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
! Relation heapRel, Buffer buf,
ScanKey itup_scankey);
static void _bt_insertonpg(Relation rel, Buffer buf,
BTStack stack,
- int keysz, ScanKey scankey,
IndexTuple itup,
! OffsetNumber afteritem,
bool split_only_page);
static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
OffsetNumber newitemoff, Size newitemsz,
--- 46,63 ----
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static TransactionId _bt_check_unique(Relation rel, IndexTuple itup,
! Relation heapRel, Buffer buf, OffsetNumber offset,
ScanKey itup_scankey);
+ static void _bt_findinsertloc(Relation rel,
+ Buffer *bufptr,
+ OffsetNumber *offsetptr,
+ int keysz,
+ ScanKey scankey,
+ IndexTuple newtup);
static void _bt_insertonpg(Relation rel, Buffer buf,
BTStack stack,
IndexTuple itup,
! OffsetNumber newitemoff,
bool split_only_page);
static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
OffsetNumber newitemoff, Size newitemsz,
***************
*** 86,91 ****
--- 91,97 ----
ScanKey itup_scankey;
BTStack stack;
Buffer buf;
+ OffsetNumber offset;
/* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, itup);
***************
*** 94,99 ****
--- 100,107 ----
/* find the first page containing this key */
stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);
+ offset = InvalidOffsetNumber;
+
/* trade in our read lock for a write lock */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBuffer(buf, BT_WRITE);
***************
*** 128,134 ****
{
TransactionId xwait;
! xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey);
if (TransactionIdIsValid(xwait))
{
--- 136,143 ----
{
TransactionId xwait;
! offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
! xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey);
if (TransactionIdIsValid(xwait))
{
***************
*** 142,148 ****
}
/* do the insertion */
! _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false);
/* be tidy */
_bt_freestack(stack);
--- 151,158 ----
}
/* do the insertion */
! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
! _bt_insertonpg(rel, buf, stack, itup, offset, false);
/* be tidy */
_bt_freestack(stack);
***************
*** 152,169 ****
/*
* _bt_check_unique() -- Check for violation of unique index constraint
*
* Returns InvalidTransactionId if there is no conflict, else an xact ID
* we must wait for to see if it commits a conflicting tuple. If an actual
* conflict is detected, no return --- just ereport().
*/
static TransactionId
_bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
! Buffer buf, ScanKey itup_scankey)
{
TupleDesc itupdesc = RelationGetDescr(rel);
int natts = rel->rd_rel->relnatts;
! OffsetNumber offset,
! maxoff;
Page page;
BTPageOpaque opaque;
Buffer nbuf = InvalidBuffer;
--- 162,182 ----
/*
* _bt_check_unique() -- Check for violation of unique index constraint
*
+ * offset points to the first possible item that could conflict. It can
+ * also point to end-of-page, which means that the first tuple to check
+ * is the first tuple on the next page.
+ *
* Returns InvalidTransactionId if there is no conflict, else an xact ID
* we must wait for to see if it commits a conflicting tuple. If an actual
* conflict is detected, no return --- just ereport().
*/
static TransactionId
_bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
! Buffer buf, OffsetNumber offset, ScanKey itup_scankey)
{
TupleDesc itupdesc = RelationGetDescr(rel);
int natts = rel->rd_rel->relnatts;
! OffsetNumber maxoff;
Page page;
BTPageOpaque opaque;
Buffer nbuf = InvalidBuffer;
***************
*** 173,184 ****
maxoff = PageGetMaxOffsetNumber(page);
/*
- * Find first item >= proposed new item. Note we could also get a pointer
- * to end-of-page here.
- */
- offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
-
- /*
* Scan over all equal tuples, looking for live conflicts.
*/
for (;;)
--- 186,191 ----
***************
*** 342,374 ****
return InvalidTransactionId;
}
! /*----------
! * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
! *
! * This recursive procedure does the following things:
! *
! * + finds the right place to insert the tuple.
! * + if necessary, splits the target page (making sure that the
! * split is equitable as far as post-insert free space goes).
! * + inserts the tuple.
! * + if the page was split, pops the parent stack, and finds the
! * right place to insert the new child pointer (by walking
! * right using information stored in the parent stack).
! * + invokes itself with the appropriate tuple for the right
! * child page on the parent.
! * + updates the metapage if a true root or fast root is split.
! *
! * On entry, we must have the right buffer in which to do the
! * insertion, and the buffer must be pinned and write-locked. On return,
! * we will have dropped both the pin and the lock on the buffer.
! *
! * If 'afteritem' is >0 then the new tuple must be inserted after the
! * existing item of that number, noplace else. If 'afteritem' is 0
! * then the procedure finds the exact spot to insert it by searching.
! * (keysz and scankey parameters are used ONLY if afteritem == 0.
! * The scankey must be an insertion-type scankey.)
*
! * NOTE: if the new key is equal to one or more existing keys, we can
* legitimately place it anywhere in the series of equal keys --- in fact,
* if the new key is equal to the page's "high key" we can place it on
* the next page. If it is equal to the high key, and there's not room
--- 349,359 ----
return InvalidTransactionId;
}
!
! /*
! * _bt_findinsertloc() -- Finds an insert location for a tuple
*
! * If the new key is equal to one or more existing keys, we can
* legitimately place it anywhere in the series of equal keys --- in fact,
* if the new key is equal to the page's "high key" we can place it on
* the next page. If it is equal to the high key, and there's not room
***************
*** 379,414 ****
* Once we have chosen the page to put the key on, we'll insert it before
* any existing equal keys because of the way _bt_binsrch() works.
*
! * The locking interactions in this code are critical. You should
! * grok Lehman and Yao's paper before making any changes. In addition,
! * you need to understand how we disambiguate duplicate keys in this
! * implementation, in order to be able to find our location using
! * L&Y "move right" operations. Since we may insert duplicate user
! * keys, and since these dups may propagate up the tree, we use the
! * 'afteritem' parameter to position ourselves correctly for the
! * insertion on internal pages.
! *----------
*/
static void
! _bt_insertonpg(Relation rel,
! Buffer buf,
! BTStack stack,
! int keysz,
! ScanKey scankey,
! IndexTuple itup,
! OffsetNumber afteritem,
! bool split_only_page)
{
! Page page;
BTPageOpaque lpageop;
OffsetNumber newitemoff;
! OffsetNumber firstright = InvalidOffsetNumber;
! Size itemsz;
- page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
! itemsz = IndexTupleDSize(*itup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
--- 364,403 ----
* Once we have chosen the page to put the key on, we'll insert it before
* any existing equal keys because of the way _bt_binsrch() works.
*
! * If there's not enough room on the page, we try to make room by
! * removing any LP_DELETEd tuples.
! *
! * On entry, *bufptr and *offsetptr point to the first legal position
! * where the new tuple could be inserted. The caller should hold an
! * exclusive lock on *bufptr. *offsetptr can also be set to
! * InvalidOffsetNumber, in which case the function will search for the
! * right location within the page if needed. On exit, they point to the
! * chosen insert location. If _bt_findinsertloc decides to move right,
! * the lock and pin on the original page will be released and the new
! * page returned to the caller is exclusively locked instead.
! *
! * newtup is the new tuple we're inserting, and scankey is an insertion
! * type scan key for it.
*/
static void
! _bt_findinsertloc(Relation rel,
! Buffer *bufptr,
! OffsetNumber *offsetptr,
! int keysz,
! ScanKey scankey,
! IndexTuple newtup)
{
! Buffer buf = *bufptr;
! Page page = BufferGetPage(buf);
! Size itemsz;
BTPageOpaque lpageop;
+ bool movedright, vacuumed;
OffsetNumber newitemoff;
! OffsetNumber firstlegaloff = *offsetptr;
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
! itemsz = IndexTupleDSize(*newtup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
***************
*** 429,525 ****
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing.")));
- /*
- * Determine exactly where new item will go.
- */
- if (afteritem > 0)
- newitemoff = afteritem + 1;
- else
- {
- /*----------
- * If we will need to split the page to put the item here,
- * check whether we can put the tuple somewhere to the right,
- * instead. Keep scanning right until we
- * (a) find a page with enough free space,
- * (b) reach the last page where the tuple can legally go, or
- * (c) get tired of searching.
- * (c) is not flippant; it is important because if there are many
- * pages' worth of equal keys, it's better to split one of the early
- * pages than to scan all the way to the end of the run of equal keys
- * on every insert. We implement "get tired" as a random choice,
- * since stopping after scanning a fixed number of pages wouldn't work
- * well (we'd never reach the right-hand side of previously split
- * pages). Currently the probability of moving right is set at 0.99,
- * which may seem too high to change the behavior much, but it does an
- * excellent job of preventing O(N^2) behavior with many equal keys.
- *----------
- */
- bool movedright = false;
-
- while (PageGetFreeSpace(page) < itemsz)
- {
- Buffer rbuf;
- /*
- * before considering moving right, see if we can obtain enough
- * space by erasing LP_DELETE items
- */
- if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
- {
- _bt_vacuum_one_page(rel, buf);
- if (PageGetFreeSpace(page) >= itemsz)
- break; /* OK, now we have enough space */
- }
! /*
! * nope, so check conditions (b) and (c) enumerated above
! */
! if (P_RIGHTMOST(lpageop) ||
! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
! random() <= (MAX_RANDOM_VALUE / 100))
! break;
! /*
! * step right to next non-dead page
! *
! * must write-lock that page before releasing write lock on
! * current page; else someone else's _bt_check_unique scan could
! * fail to see our insertion. write locks on intermediate dead
! * pages won't do because we don't know when they will get
! * de-linked from the tree.
! */
! rbuf = InvalidBuffer;
! for (;;)
! {
! BlockNumber rblkno = lpageop->btpo_next;
! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
! page = BufferGetPage(rbuf);
! lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
! if (!P_IGNORE(lpageop))
! break;
! if (P_RIGHTMOST(lpageop))
! elog(ERROR, "fell off the end of \"%s\"",
! RelationGetRelationName(rel));
! }
! _bt_relbuf(rel, buf);
! buf = rbuf;
! movedright = true;
}
/*
! * Now we are on the right page, so find the insert position. If we
! * moved right at all, we know we should insert at the start of the
! * page, else must find the position by searching.
*/
! if (movedright)
! newitemoff = P_FIRSTDATAKEY(lpageop);
! else
! newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
}
/*
* Do we need to split the page to fit the item on it?
*
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
--- 418,573 ----
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing.")));
! /*----------
! * If we will need to split the page to put the item on this page,
! * check whether we can put the tuple somewhere to the right,
! * instead. Keep scanning right until we
! * (a) find a page with enough free space,
! * (b) reach the last page where the tuple can legally go, or
! * (c) get tired of searching.
! * (c) is not flippant; it is important because if there are many
! * pages' worth of equal keys, it's better to split one of the early
! * pages than to scan all the way to the end of the run of equal keys
! * on every insert. We implement "get tired" as a random choice,
! * since stopping after scanning a fixed number of pages wouldn't work
! * well (we'd never reach the right-hand side of previously split
! * pages). Currently the probability of moving right is set at 0.99,
! * which may seem too high to change the behavior much, but it does an
! * excellent job of preventing O(N^2) behavior with many equal keys.
! *----------
! */
! movedright = false;
! vacuumed = false;
! while (PageGetFreeSpace(page) < itemsz)
! {
! Buffer rbuf;
! /*
! * before considering moving right, see if we can obtain enough
! * space by erasing LP_DELETE items
! */
! if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
! {
! _bt_vacuum_one_page(rel, buf);
! /* remember that we vacuumed this page, because that makes
! * the hint supplied by the caller invalid */
! vacuumed = true;
! if (PageGetFreeSpace(page) >= itemsz)
! break; /* OK, now we have enough space */
}
/*
! * nope, so check conditions (b) and (c) enumerated above
*/
! if (P_RIGHTMOST(lpageop) ||
! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 ||
! random() <= (MAX_RANDOM_VALUE / 100))
! break;
!
! /*
! * step right to next non-dead page
! *
! * must write-lock that page before releasing write lock on
! * current page; else someone else's _bt_check_unique scan could
! * fail to see our insertion. write locks on intermediate dead
! * pages won't do because we don't know when they will get
! * de-linked from the tree.
! */
! rbuf = InvalidBuffer;
!
! for (;;)
! {
! BlockNumber rblkno = lpageop->btpo_next;
!
! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
! page = BufferGetPage(rbuf);
! lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
! if (!P_IGNORE(lpageop))
! break;
! if (P_RIGHTMOST(lpageop))
! elog(ERROR, "fell off the end of \"%s\"",
! RelationGetRelationName(rel));
! }
! _bt_relbuf(rel, buf);
! buf = rbuf;
! movedright = true;
! vacuumed = false;
}
/*
+ * Now we are on the right page, so find the insert position. If we
+ * moved right at all, we know we should insert at the start of the
+ * page. If we didn't move right, we can use the firstlegaloff hint
+ * if the caller supplied one, unless we vacuumed the page which
+ * might have moved tuples around, making the hint invalid. If we
+ * didn't move right and can't use the hint, we find the position
+ * by searching.
+ */
+ if (movedright)
+ newitemoff = P_FIRSTDATAKEY(lpageop);
+ else if (firstlegaloff != InvalidOffsetNumber && !vacuumed)
+ newitemoff = firstlegaloff;
+ else
+ newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false);
+
+ *bufptr = buf;
+ *offsetptr = newitemoff;
+ }
+
+ /*----------
+ * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ *
+ * This recursive procedure does the following things:
+ *
+ * + if necessary, splits the target page (making sure that the
+ * split is equitable as far as post-insert free space goes).
+ * + inserts the tuple.
+ * + if the page was split, pops the parent stack, and finds the
+ * right place to insert the new child pointer (by walking
+ * right using information stored in the parent stack).
+ * + invokes itself with the appropriate tuple for the right
+ * child page on the parent.
+ * + updates the metapage if a true root or fast root is split.
+ *
+ * On entry, we must have the right buffer in which to do the
+ * insertion, and the buffer must be pinned and write-locked. On return,
+ * we will have dropped both the pin and the lock on the buffer.
+ *
+ * The locking interactions in this code are critical. You should
+ * grok Lehman and Yao's paper before making any changes. In addition,
+ * you need to understand how we disambiguate duplicate keys in this
+ * implementation, in order to be able to find our location using
+ * L&Y "move right" operations. Since we may insert duplicate user
+ * keys, and since these dups may propagate up the tree, we use the
+ * 'afteritem' parameter to position ourselves correctly for the
+ * insertion on internal pages.
+ *----------
+ */
+ static void
+ _bt_insertonpg(Relation rel,
+ Buffer buf,
+ BTStack stack,
+ IndexTuple itup,
+ OffsetNumber newitemoff,
+ bool split_only_page)
+ {
+ Page page;
+ BTPageOpaque lpageop;
+ OffsetNumber firstright = InvalidOffsetNumber;
+ Size itemsz;
+
+ page = BufferGetPage(buf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ itemsz = IndexTupleDSize(*itup);
+ itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
+ * need to be consistent */
+
+ /*
* Do we need to split the page to fit the item on it?
*
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
***************
*** 1427,1433 ****
/* Recursively update the parent */
_bt_insertonpg(rel, pbuf, stack->bts_parent,
! 0, NULL, new_item, stack->bts_offset,
is_only);
/* be tidy */
--- 1475,1481 ----
/* Recursively update the parent */
_bt_insertonpg(rel, pbuf, stack->bts_parent,
! new_item, stack->bts_offset + 1,
is_only);
/* be tidy */