Patch applied. Thanks. ---------------------------------------------------------------------------
Heikki Linnakangas wrote: > Here's a patch that: > > Moves the logic to find a page with enough room from _bt_insertonpg to a > new function, _bt_findinsertloc. It makes the code more readable, and > simplifies the forthcoming Grouped Index Tuples patch. > > Also, the insert location within page used to be calculated twice for > unique indexes, once in _bt_checkunique and second time in > _bt_insertonpg. That's a waste of cycles, and this patch fixes that. > > > I couldn't measure a difference with pgbench, but this micro-benchmark > shows it: > > > psql postgres -c "CREATE TABLE inserttest (i int PRIMARY KEY);" > > psql postgres -c "TRUNCATE inserttest; checkpoint;"; sync > > time ~/pgsql.cvshead/bin/psql postgres -c "TRUNCATE inserttest; > INSERT INTO inserttest SELECT a FROM generate_series(1,1000000) a;" > > Without patch: real 0m7.260s > With patch: real 0m6.963s > > On my laptop, fsync=off, full_page_writes=off, checkpoint_segments = 10, > to remove any other variables. > > It's not a huge difference, but it's worth having, and performance > wasn't the main motivation of the patch anyway. > > -- > Heikki Linnakangas > EnterpriseDB http://www.enterprisedb.com [ text/x-patch is unsupported, treating like TEXT/PLAIN ] > Index: src/backend/access/nbtree/nbtinsert.c > =================================================================== > RCS file: > /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/nbtree/nbtinsert.c,v > retrieving revision 1.152 > diff -c -r1.152 nbtinsert.c > *** src/backend/access/nbtree/nbtinsert.c 21 Feb 2007 20:02:17 -0000 > 1.152 > --- src/backend/access/nbtree/nbtinsert.c 26 Feb 2007 09:37:16 -0000 > *************** > *** 46,58 **** > static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); > > static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, > ! Relation heapRel, Buffer buf, > ScanKey itup_scankey); > static void _bt_insertonpg(Relation rel, Buffer buf, > BTStack stack, > - int keysz, ScanKey scankey, > IndexTuple itup, > ! OffsetNumber afteritem, > bool split_only_page); > static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, > OffsetNumber newitemoff, Size newitemsz, > --- 46,63 ---- > static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); > > static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, > ! Relation heapRel, Buffer buf, OffsetNumber > ioffset, > ScanKey itup_scankey); > + static void _bt_findinsertloc(Relation rel, > + Buffer *bufptr, > + OffsetNumber *offsetptr, > + int keysz, > + ScanKey scankey, > + IndexTuple newtup); > static void _bt_insertonpg(Relation rel, Buffer buf, > BTStack stack, > IndexTuple itup, > ! OffsetNumber newitemoff, > bool split_only_page); > static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, > OffsetNumber newitemoff, Size newitemsz, > *************** > *** 86,91 **** > --- 91,97 ---- > ScanKey itup_scankey; > BTStack stack; > Buffer buf; > + OffsetNumber offset; > > /* we need an insertion scan key to do our search, so build one */ > itup_scankey = _bt_mkscankey(rel, itup); > *************** > *** 94,99 **** > --- 100,107 ---- > /* find the first page containing this key */ > stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); > > + offset = InvalidOffsetNumber; > + > /* trade in our read lock for a write lock */ > LockBuffer(buf, BUFFER_LOCK_UNLOCK); > LockBuffer(buf, BT_WRITE); > *************** > *** 128,134 **** > { > TransactionId xwait; > > ! xwait = _bt_check_unique(rel, itup, heapRel, buf, itup_scankey); > > if (TransactionIdIsValid(xwait)) > { > --- 136,143 ---- > { > TransactionId xwait; > > ! offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); > ! xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, > itup_scankey); > > if (TransactionIdIsValid(xwait)) > { > *************** > *** 142,148 **** > } > > /* do the insertion */ > ! _bt_insertonpg(rel, buf, stack, natts, itup_scankey, itup, 0, false); > > /* be tidy */ > _bt_freestack(stack); > --- 151,158 ---- > } > > /* do the insertion */ > ! _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup); > ! _bt_insertonpg(rel, buf, stack, itup, offset, false); > > /* be tidy */ > _bt_freestack(stack); > *************** > *** 152,169 **** > /* > * _bt_check_unique() -- Check for violation of unique index constraint > * > * Returns InvalidTransactionId if there is no conflict, else an xact ID > * we must wait for to see if it commits a conflicting tuple. If an > actual > * conflict is detected, no return --- just ereport(). > */ > static TransactionId > _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, > ! Buffer buf, ScanKey itup_scankey) > { > TupleDesc itupdesc = RelationGetDescr(rel); > int natts = rel->rd_rel->relnatts; > ! OffsetNumber offset, > ! maxoff; > Page page; > BTPageOpaque opaque; > Buffer nbuf = InvalidBuffer; > --- 162,182 ---- > /* > * _bt_check_unique() -- Check for violation of unique index constraint > * > + * offset points to the first possible item that could conflict. It can > + * also point to end-of-page, which means that the first tuple to check > + * is the first tuple on the next page. > + * > * Returns InvalidTransactionId if there is no conflict, else an xact ID > * we must wait for to see if it commits a conflicting tuple. If an > actual > * conflict is detected, no return --- just ereport(). > */ > static TransactionId > _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, > ! Buffer buf, OffsetNumber offset, ScanKey > itup_scankey) > { > TupleDesc itupdesc = RelationGetDescr(rel); > int natts = rel->rd_rel->relnatts; > ! OffsetNumber maxoff; > Page page; > BTPageOpaque opaque; > Buffer nbuf = InvalidBuffer; > *************** > *** 173,184 **** > maxoff = PageGetMaxOffsetNumber(page); > > /* > - * Find first item >= proposed new item. Note we could also get a > pointer > - * to end-of-page here. > - */ > - offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); > - > - /* > * Scan over all equal tuples, looking for live conflicts. > */ > for (;;) > --- 186,191 ---- > *************** > *** 342,374 **** > return InvalidTransactionId; > } > > ! /*---------- > ! * _bt_insertonpg() -- Insert a tuple on a particular page in the index. > ! * > ! * This recursive procedure does the following things: > ! * > ! * + finds the right place to insert the tuple. > ! * + if necessary, splits the target page (making sure > that the > ! * split is equitable as far as post-insert free space > goes). > ! * + inserts the tuple. > ! * + if the page was split, pops the parent stack, and > finds the > ! * right place to insert the new child pointer (by > walking > ! * right using information stored in the parent stack). > ! * + invokes itself with the appropriate tuple for the > right > ! * child page on the parent. > ! * + updates the metapage if a true root or fast root is > split. > ! * > ! * On entry, we must have the right buffer in which to do the > ! * insertion, and the buffer must be pinned and write-locked. > On return, > ! * we will have dropped both the pin and the lock on the buffer. > ! * > ! * If 'afteritem' is >0 then the new tuple must be inserted after > the > ! * existing item of that number, noplace else. If 'afteritem' is 0 > ! * then the procedure finds the exact spot to insert it by > searching. > ! * (keysz and scankey parameters are used ONLY if afteritem == 0. > ! * The scankey must be an insertion-type scankey.) > * > ! * NOTE: if the new key is equal to one or more existing keys, we > can > * legitimately place it anywhere in the series of equal keys --- > in fact, > * if the new key is equal to the page's "high key" we can place > it on > * the next page. If it is equal to the high key, and there's not > room > --- 349,359 ---- > return InvalidTransactionId; > } > > ! > ! /* > ! * _bt_findinsertloc() -- Finds an insert location for a tuple > * > ! * If the new key is equal to one or more existing keys, we can > * legitimately place it anywhere in the series of equal keys --- > in fact, > * if the new key is equal to the page's "high key" we can place > it on > * the next page. If it is equal to the high key, and there's not > room > *************** > *** 379,414 **** > * Once we have chosen the page to put the key on, we'll insert it > before > * any existing equal keys because of the way _bt_binsrch() works. > * > ! * The locking interactions in this code are critical. You should > ! * grok Lehman and Yao's paper before making any changes. In > addition, > ! * you need to understand how we disambiguate duplicate keys in > this > ! * implementation, in order to be able to find our location using > ! * L&Y "move right" operations. Since we may insert duplicate user > ! * keys, and since these dups may propagate up the tree, we use the > ! * 'afteritem' parameter to position ourselves correctly for the > ! * insertion on internal pages. > ! *---------- > */ > static void > ! _bt_insertonpg(Relation rel, > ! Buffer buf, > ! BTStack stack, > ! int keysz, > ! ScanKey scankey, > ! IndexTuple itup, > ! OffsetNumber afteritem, > ! bool split_only_page) > { > ! Page page; > BTPageOpaque lpageop; > OffsetNumber newitemoff; > ! OffsetNumber firstright = InvalidOffsetNumber; > ! Size itemsz; > > - page = BufferGetPage(buf); > lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > > ! itemsz = IndexTupleDSize(*itup); > itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this > but we > * need to be > consistent */ > > --- 364,403 ---- > * Once we have chosen the page to put the key on, we'll insert it > before > * any existing equal keys because of the way _bt_binsrch() works. > * > ! * If there's not enough room in the space, we try to make room by > ! * removing any LP_DELETEd tuples. > ! * > ! * On entry, *buf and *offsetptr point to the first legal position > ! * where the new tuple could be inserted. The caller should hold an > ! * exclusive lock on *buf. *offsetptr can also be set to > ! * InvalidOffsetNumber, in which case the function will search the > right > ! * location within the page if needed. On exit, they point to the > chosen > ! * insert location. If findinsertloc decided to move right, the > lock and > ! * pin on the original page will be released and the new page > returned to > ! * the caller is exclusively locked instead. > ! * > ! * newtup is the new tuple we're inserting, and scankey is an > insertion > ! * type scan key for it. > */ > static void > ! _bt_findinsertloc(Relation rel, > ! Buffer *bufptr, > ! OffsetNumber *offsetptr, > ! int keysz, > ! ScanKey scankey, > ! IndexTuple newtup) > { > ! Buffer buf = *bufptr; > ! Page page = BufferGetPage(buf); > ! Size itemsz; > BTPageOpaque lpageop; > + bool movedright, vacuumed; > OffsetNumber newitemoff; > ! OffsetNumber firstlegaloff = *offsetptr; > > lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > > ! itemsz = IndexTupleDSize(*newtup); > itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this > but we > * need to be > consistent */ > > *************** > *** 429,525 **** > "Consider a function index of an MD5 hash of > the value, " > "or use full text indexing."))); > > - /* > - * Determine exactly where new item will go. > - */ > - if (afteritem > 0) > - newitemoff = afteritem + 1; > - else > - { > - /*---------- > - * If we will need to split the page to put the item here, > - * check whether we can put the tuple somewhere to the right, > - * instead. Keep scanning right until we > - * (a) find a page with enough free space, > - * (b) reach the last page where the tuple can > legally go, or > - * (c) get tired of searching. > - * (c) is not flippant; it is important because if there are > many > - * pages' worth of equal keys, it's better to split one of the > early > - * pages than to scan all the way to the end of the run of > equal keys > - * on every insert. We implement "get tired" as a random > choice, > - * since stopping after scanning a fixed number of pages > wouldn't work > - * well (we'd never reach the right-hand side of previously > split > - * pages). Currently the probability of moving right is > set at 0.99, > - * which may seem too high to change the behavior much, but it > does an > - * excellent job of preventing O(N^2) behavior with many equal > keys. > - *---------- > - */ > - bool movedright = false; > - > - while (PageGetFreeSpace(page) < itemsz) > - { > - Buffer rbuf; > > - /* > - * before considering moving right, see if we can > obtain enough > - * space by erasing LP_DELETE items > - */ > - if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) > - { > - _bt_vacuum_one_page(rel, buf); > - if (PageGetFreeSpace(page) >= itemsz) > - break; /* OK, now we have > enough space */ > - } > > ! /* > ! * nope, so check conditions (b) and (c) enumerated > above > ! */ > ! if (P_RIGHTMOST(lpageop) || > ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) > != 0 || > ! random() <= (MAX_RANDOM_VALUE / 100)) > ! break; > > ! /* > ! * step right to next non-dead page > ! * > ! * must write-lock that page before releasing write > lock on > ! * current page; else someone else's _bt_check_unique > scan could > ! * fail to see our insertion. write locks on > intermediate dead > ! * pages won't do because we don't know when they will > get > ! * de-linked from the tree. > ! */ > ! rbuf = InvalidBuffer; > > ! for (;;) > ! { > ! BlockNumber rblkno = lpageop->btpo_next; > > ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, > BT_WRITE); > ! page = BufferGetPage(rbuf); > ! lpageop = (BTPageOpaque) > PageGetSpecialPointer(page); > ! if (!P_IGNORE(lpageop)) > ! break; > ! if (P_RIGHTMOST(lpageop)) > ! elog(ERROR, "fell off the end of > \"%s\"", > ! RelationGetRelationName(rel)); > ! } > ! _bt_relbuf(rel, buf); > ! buf = rbuf; > ! movedright = true; > } > > /* > ! * Now we are on the right page, so find the insert position. > If we > ! * moved right at all, we know we should insert at the start of > the > ! * page, else must find the position by searching. > */ > ! if (movedright) > ! newitemoff = P_FIRSTDATAKEY(lpageop); > ! else > ! newitemoff = _bt_binsrch(rel, buf, keysz, scankey, > false); > } > > /* > * Do we need to split the page to fit the item on it? > * > * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its > result, > --- 418,573 ---- > "Consider a function index of an MD5 hash of > the value, " > "or use full text indexing."))); > > > > ! /*---------- > ! * If we will need to split the page to put the item on this page, > ! * check whether we can put the tuple somewhere to the right, > ! * instead. Keep scanning right until we > ! * (a) find a page with enough free space, > ! * (b) reach the last page where the tuple can legally go, > or > ! * (c) get tired of searching. > ! * (c) is not flippant; it is important because if there are many > ! * pages' worth of equal keys, it's better to split one of the early > ! * pages than to scan all the way to the end of the run of equal keys > ! * on every insert. We implement "get tired" as a random choice, > ! * since stopping after scanning a fixed number of pages wouldn't work > ! * well (we'd never reach the right-hand side of previously split > ! * pages). Currently the probability of moving right is set at > 0.99, > ! * which may seem too high to change the behavior much, but it does an > ! * excellent job of preventing O(N^2) behavior with many equal keys. > ! *---------- > ! */ > ! movedright = false; > ! vacuumed = false; > ! while (PageGetFreeSpace(page) < itemsz) > ! { > ! Buffer rbuf; > > ! /* > ! * before considering moving right, see if we can obtain enough > ! * space by erasing LP_DELETE items > ! */ > ! if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) > ! { > ! _bt_vacuum_one_page(rel, buf); > > ! /* remember that we vacuumed this page, because that > makes > ! * the hint supplied by the caller invalid */ > ! vacuumed = true; > > ! if (PageGetFreeSpace(page) >= itemsz) > ! break; /* OK, now we have enough space > */ > } > > /* > ! * nope, so check conditions (b) and (c) enumerated above > */ > ! if (P_RIGHTMOST(lpageop) || > ! _bt_compare(rel, keysz, scankey, page, P_HIKEY) != 0 || > ! random() <= (MAX_RANDOM_VALUE / 100)) > ! break; > ! > ! /* > ! * step right to next non-dead page > ! * > ! * must write-lock that page before releasing write lock on > ! * current page; else someone else's _bt_check_unique scan could > ! * fail to see our insertion. write locks on intermediate dead > ! * pages won't do because we don't know when they will get > ! * de-linked from the tree. > ! */ > ! rbuf = InvalidBuffer; > ! > ! for (;;) > ! { > ! BlockNumber rblkno = lpageop->btpo_next; > ! > ! rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); > ! page = BufferGetPage(rbuf); > ! lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > ! if (!P_IGNORE(lpageop)) > ! break; > ! if (P_RIGHTMOST(lpageop)) > ! elog(ERROR, "fell off the end of \"%s\"", > ! RelationGetRelationName(rel)); > ! } > ! _bt_relbuf(rel, buf); > ! buf = rbuf; > ! movedright = true; > ! vacuumed = false; > } > > /* > + * Now we are on the right page, so find the insert position. If we > + * moved right at all, we know we should insert at the start of the > + * page. If we didn't move right, we can use the firstlegaloff hint > + * if the caller supplied one, unless we vacuumed the page which > + * might have moved tuples around making the hint invalid. If we > + * didn't move right or can't use the hint, find the position > + * by searching. > + */ > + if (movedright) > + newitemoff = P_FIRSTDATAKEY(lpageop); > + else if(firstlegaloff != InvalidOffsetNumber && !vacuumed) > + newitemoff = firstlegaloff; > + else > + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); > + > + *bufptr = buf; > + *offsetptr = newitemoff; > + } > + > + /*---------- > + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. > + * > + * This recursive procedure does the following things: > + * > + * + if necessary, splits the target page (making sure > that the > + * split is equitable as far as post-insert free space > goes). > + * + inserts the tuple. > + * + if the page was split, pops the parent stack, and > finds the > + * right place to insert the new child pointer (by > walking > + * right using information stored in the parent stack). > + * + invokes itself with the appropriate tuple for the > right > + * child page on the parent. > + * + updates the metapage if a true root or fast root is > split. > + * > + * On entry, we must have the right buffer in which to do the > + * insertion, and the buffer must be pinned and write-locked. > On return, > + * we will have dropped both the pin and the lock on the buffer. > + * > + * The locking interactions in this code are critical. You should > + * grok Lehman and Yao's paper before making any changes. In > addition, > + * you need to understand how we disambiguate duplicate keys in > this > + * implementation, in order to be able to find our location using > + * L&Y "move right" operations. Since we may insert duplicate user > + * keys, and since these dups may propagate up the tree, we use the > + * 'afteritem' parameter to position ourselves correctly for the > + * insertion on internal pages. > + *---------- > + */ > + static void > + _bt_insertonpg(Relation rel, > + Buffer buf, > + BTStack stack, > + IndexTuple itup, > + OffsetNumber newitemoff, > + bool split_only_page) > + { > + Page page; > + BTPageOpaque lpageop; > + OffsetNumber firstright = InvalidOffsetNumber; > + Size itemsz; > + > + page = BufferGetPage(buf); > + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); > + > + itemsz = IndexTupleDSize(*itup); > + itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this > but we > + * need to be > consistent */ > + > + /* > * Do we need to split the page to fit the item on it? > * > * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its > result, > *************** > *** 1427,1433 **** > > /* Recursively update the parent */ > _bt_insertonpg(rel, pbuf, stack->bts_parent, > ! 0, NULL, new_item, stack->bts_offset, > is_only); > > /* be tidy */ > --- 1475,1481 ---- > > /* Recursively update the parent */ > _bt_insertonpg(rel, pbuf, stack->bts_parent, > ! new_item, stack->bts_offset + 1, > is_only); > > /* be tidy */ > > ---------------------------(end of broadcast)--------------------------- > TIP 7: You can help support the PostgreSQL project by donating at > > http://www.postgresql.org/about/donate -- Bruce Momjian <[EMAIL PROTECTED]> http://momjian.us EnterpriseDB http://www.enterprisedb.com + If your life is a hard drive, Christ can be your backup. + ---------------------------(end of broadcast)--------------------------- TIP 3: Have you checked our extensive FAQ? http://www.postgresql.org/docs/faq