I've been getting less and less excited about this patch, because I still couldn't measure any above-the-noise performance improvement without artificial exaggerations, and some cases seemed actually slower.
However, this morning I had an epiphany: why are we sorting at all? There is no requirement that these functions preserve the physical ordering of the tuples' data areas, only that the line-pointer ordering be preserved. Indeed, reorganizing the data areas into an ordering matching the line pointers is probably a good thing, because it should improve locality of access in future scans of the page. This is trivial to implement if we copy the data into a workspace area and back again, as I was already proposing to do to avoid memmove. Moreover, at that point there's little value in a separate compactify function at all: we can integrate the data-copying logic into the line pointer scan loops in PageRepairFragmentation and PageIndexMultiDelete, and get rid of the costs of constructing the intermediate itemIdSortData arrays. That led me to the attached patch, which is the first version of any of this work that produces an above-the-noise performance win for me. I'm seeing 10-20% gains on this modified version of Yura's original example: psql -f test3setup.sql pgbench -M prepared -c 3 -s 10000000 -T 300 -P 3 -n -f test3.sql (sql scripts also attached below; I'm using 1GB shared_buffers and fsync off, other parameters stock.) However, there are a couple of objections that could be raised to this patch: 1. It's trading off per-byte work, in the form of an extra memcpy, to save sorting work that has per-tuple costs. Therefore, the relatively narrow tuples used in Yura's example offer a best-case scenario; with wider tuples the performance might be worse. 2. On a platform with memmove not so much worse than memcpy as I'm seeing on my RHEL6 server, trading memmove for memcpy might not be such a win. To address point 1, I tried some measurements on the standard pgbench scenario, which uses significantly wider tuples. 
In hopes of addressing point 2, I also ran the measurements on a laptop running Fedora 25 (gcc 6.4.1, glibc 2.24); I haven't actually checked memmove vs memcpy on that machine, but at least it's a reasonably late-model glibc. What I'm getting from the standard pgbench measurements, on both machines, is that this patch might be a couple percent slower than HEAD, but that is barely above the noise floor so I'm not too sure about it. So I think we should seriously consider the attached, but it'd be a good idea to benchmark it on a wider variety of platforms and test cases. regards, tom lane
-- test3setup.sql: build a 10M-row unlogged table for the page-compaction benchmark.
-- fillfactor=85 on both heap and PK index leaves free space so HOT updates can
-- exercise PageRepairFragmentation.
DROP TABLE IF EXISTS test3;

CREATE UNLOGGED TABLE test3 (
    id  integer PRIMARY KEY WITH (fillfactor=85),
    val text
) WITH (fillfactor=85);

INSERT INTO test3
    SELECT i, '!' || i
    FROM generate_series(1, 10000000) AS i;

VACUUM ANALYZE;
CHECKPOINT;

-- dotest3(n, scale): n iterations of a point SELECT followed by a point UPDATE
-- on a random id in [0, scale); the UPDATE churn drives page defragmentation.
CREATE OR REPLACE FUNCTION dotest3(n int, scale float8) RETURNS void
LANGUAGE plpgsql AS
$$ begin for i in 1..n loop declare id1 int := random() * scale; id2 int := random() * scale; begin perform * from test3 where id = id1; update test3 set val = '!'|| id2 where id = id1; end; end loop; end $$;
-- test3.sql: pgbench workload script; :scale is supplied by the caller
-- (e.g. pgbench -s 10000000 sets it to the table's id range).
SELECT dotest3(100, :scale);
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index b6aa2af..73b73de 100644 *** a/src/backend/storage/page/bufpage.c --- b/src/backend/storage/page/bufpage.c *************** PageRestoreTempPage(Page tempPage, Page *** 415,471 **** } /* - * sorting support for PageRepairFragmentation and PageIndexMultiDelete - */ - typedef struct itemIdSortData - { - uint16 offsetindex; /* linp array index */ - int16 itemoff; /* page offset of item data */ - uint16 alignedlen; /* MAXALIGN(item data len) */ - } itemIdSortData; - typedef itemIdSortData *itemIdSort; - - static int - itemoffcompare(const void *itemidp1, const void *itemidp2) - { - /* Sort in decreasing itemoff order */ - return ((itemIdSort) itemidp2)->itemoff - - ((itemIdSort) itemidp1)->itemoff; - } - - /* - * After removing or marking some line pointers unused, move the tuples to - * remove the gaps caused by the removed items. - */ - static void - compactify_tuples(itemIdSort itemidbase, int nitems, Page page) - { - PageHeader phdr = (PageHeader) page; - Offset upper; - int i; - - /* sort itemIdSortData array into decreasing itemoff order */ - qsort((char *) itemidbase, nitems, sizeof(itemIdSortData), - itemoffcompare); - - upper = phdr->pd_special; - for (i = 0; i < nitems; i++) - { - itemIdSort itemidptr = &itemidbase[i]; - ItemId lp; - - lp = PageGetItemId(page, itemidptr->offsetindex + 1); - upper -= itemidptr->alignedlen; - memmove((char *) page + upper, - (char *) page + itemidptr->itemoff, - itemidptr->alignedlen); - lp->lp_off = upper; - } - - phdr->pd_upper = upper; - } - - /* * PageRepairFragmentation * * Frees fragmented space on a page. --- 415,420 ---- *************** PageRepairFragmentation(Page page) *** 481,494 **** Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; ! itemIdSortData itemidbase[MaxHeapTuplesPerPage]; ! itemIdSort itemidptr; ! ItemId lp; ! 
int nline, ! nstorage, ! nunused; ! int i; ! Size totallen; /* * It's worth the trouble to be more paranoid here than in most places, --- 430,444 ---- Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; ! int new_pd_upper, ! nline, ! nunused, ! i; ! union ! { ! char page[BLCKSZ]; ! double align; /* force workspace to be MAXALIGN'd */ ! } workspace; /* * It's worth the trouble to be more paranoid here than in most places, *************** PageRepairFragmentation(Page page) *** 508,563 **** pd_lower, pd_upper, pd_special))); /* ! * Run through the line pointer array and collect data about live items. */ nline = PageGetMaxOffsetNumber(page); ! itemidptr = itemidbase; ! nunused = totallen = 0; for (i = FirstOffsetNumber; i <= nline; i++) { ! lp = PageGetItemId(page, i); if (ItemIdIsUsed(lp)) { if (ItemIdHasStorage(lp)) { ! itemidptr->offsetindex = i - 1; ! itemidptr->itemoff = ItemIdGetOffset(lp); ! if (unlikely(itemidptr->itemoff < (int) pd_upper || ! itemidptr->itemoff >= (int) pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), ! errmsg("corrupted item pointer: %u", ! itemidptr->itemoff))); ! itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); ! totallen += itemidptr->alignedlen; ! itemidptr++; } } else { ! /* Unused entries should have lp_len = 0, but make sure */ ! ItemIdSetUnused(lp); nunused++; } } ! nstorage = itemidptr - itemidbase; ! if (nstorage == 0) ! { ! /* Page is completely empty, so just reset it quickly */ ! ((PageHeader) page)->pd_upper = pd_special; ! } ! else ! { ! /* Need to compact the page the hard way */ ! if (totallen > (Size) (pd_special - pd_lower)) ! ereport(ERROR, ! (errcode(ERRCODE_DATA_CORRUPTED), ! errmsg("corrupted item lengths: total %u, available space %u", ! (unsigned int) totallen, pd_special - pd_lower))); ! compactify_tuples(itemidbase, nstorage, page); ! 
} /* Set hint bit for PageAddItem */ if (nunused > 0) --- 458,526 ---- pd_lower, pd_upper, pd_special))); /* ! * We build updated copies of the line pointer array and tuple data area ! * in workspace.page, and then copy them back to the real page when done. ! * This ensures that if we error out partway through, we have not changed ! * the real page. It also lets us use memcpy rather than memmove for the ! * data transfers, which is faster on some machines. ! * ! * A useful side effect of this approach is that the tuples are re-sorted ! * so that their physical order matches their line pointer order, which ! * should improve locality of access in future scans of the page. */ nline = PageGetMaxOffsetNumber(page); ! new_pd_upper = pd_special; ! nunused = 0; for (i = FirstOffsetNumber; i <= nline; i++) { ! ItemId lp = PageGetItemId(page, i); ! ItemId newlp = PageGetItemId(workspace.page, i); ! if (ItemIdIsUsed(lp)) { + *newlp = *lp; if (ItemIdHasStorage(lp)) { ! int offset = ItemIdGetOffset(lp); ! int alignedlen = MAXALIGN(ItemIdGetLength(lp)); ! ! new_pd_upper -= alignedlen; ! newlp->lp_off = new_pd_upper; ! if (unlikely(offset < (int) pd_upper || ! (offset + alignedlen) > (int) pd_special || ! offset != MAXALIGN(offset) || ! new_pd_upper < (int) pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), ! errmsg("corrupted item pointer: offset = %u, length = %u", ! offset, alignedlen))); ! memcpy(workspace.page + new_pd_upper, ! (char *) page + offset, ! alignedlen); } } else { ! /* We can just zero out all the fields in *newlp */ ! ItemIdSetUnused(newlp); nunused++; } } ! /* ! * Okay, copy lower and upper workspace areas back to the real page. ! */ ! if (pd_lower > SizeOfPageHeaderData) ! memcpy((char *) page + SizeOfPageHeaderData, ! workspace.page + SizeOfPageHeaderData, ! pd_lower - SizeOfPageHeaderData); ! if (new_pd_upper < pd_special) ! memcpy((char *) page + new_pd_upper, ! workspace.page + new_pd_upper, ! pd_special - new_pd_upper); ! 
/* Page's pd_lower doesn't change, but pd_upper does */ ! ((PageHeader) page)->pd_upper = new_pd_upper; /* Set hint bit for PageAddItem */ if (nunused > 0) *************** PageIndexTupleDelete(Page page, OffsetNu *** 831,853 **** void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) { ! PageHeader phdr = (PageHeader) page; ! Offset pd_lower = phdr->pd_lower; ! Offset pd_upper = phdr->pd_upper; ! Offset pd_special = phdr->pd_special; ! itemIdSortData itemidbase[MaxIndexTuplesPerPage]; ! ItemIdData newitemids[MaxIndexTuplesPerPage]; ! itemIdSort itemidptr; ! ItemId lp; ! int nline, ! nused; ! Size totallen; ! Size size; ! unsigned offset; ! int nextitm; OffsetNumber offnum; ! ! Assert(nitems <= MaxIndexTuplesPerPage); /* * If there aren't very many items to delete, then retail --- 794,813 ---- void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) { ! Offset pd_lower = ((PageHeader) page)->pd_lower; ! Offset pd_upper = ((PageHeader) page)->pd_upper; ! Offset pd_special = ((PageHeader) page)->pd_special; ! int new_pd_upper, ! nline, ! nextnew, ! nextitm, ! lpalen; OffsetNumber offnum; ! union ! { ! char page[BLCKSZ]; ! double align; /* force workspace to be MAXALIGN'd */ ! } workspace; /* * If there aren't very many items to delete, then retail *************** PageIndexMultiDelete(Page page, OffsetNu *** 878,906 **** pd_lower, pd_upper, pd_special))); /* ! * Scan the item pointer array and build a list of just the ones we are ! * going to keep. Notice we do not modify the page yet, since we are ! * still validity-checking. */ nline = PageGetMaxOffsetNumber(page); ! itemidptr = itemidbase; ! totallen = 0; ! 
nused = 0; nextitm = 0; for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) { - lp = PageGetItemId(page, offnum); - Assert(ItemIdHasStorage(lp)); - size = ItemIdGetLength(lp); - offset = ItemIdGetOffset(lp); - if (offset < pd_upper || - (offset + size) > pd_special || - offset != MAXALIGN(offset)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted item pointer: offset = %u, length = %u", - offset, (unsigned int) size))); - if (nextitm < nitems && offnum == itemnos[nextitm]) { /* skip item to be deleted */ --- 838,853 ---- pd_lower, pd_upper, pd_special))); /* ! * As in PageRepairFragmentation, we build new copies of the line pointer ! * array and tuple data area in workspace.page, then transfer them back to ! * the real page. */ nline = PageGetMaxOffsetNumber(page); ! new_pd_upper = pd_special; ! nextnew = FirstOffsetNumber; nextitm = 0; for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) { if (nextitm < nitems && offnum == itemnos[nextitm]) { /* skip item to be deleted */ *************** PageIndexMultiDelete(Page page, OffsetNu *** 908,920 **** } else { ! itemidptr->offsetindex = nused; /* where it will go */ ! itemidptr->itemoff = offset; ! itemidptr->alignedlen = MAXALIGN(size); ! totallen += itemidptr->alignedlen; ! newitemids[nused] = *lp; ! itemidptr++; ! nused++; } } --- 855,884 ---- } else { ! ItemId lp = PageGetItemId(page, offnum); ! ItemId newlp; ! int offset; ! int alignedlen; ! ! Assert(ItemIdHasStorage(lp)); ! offset = ItemIdGetOffset(lp); ! alignedlen = MAXALIGN(ItemIdGetLength(lp)); ! new_pd_upper -= alignedlen; ! if (unlikely(offset < (int) pd_upper || ! (offset + alignedlen) > (int) pd_special || ! offset != MAXALIGN(offset) || ! new_pd_upper < (int) pd_lower)) ! ereport(ERROR, ! (errcode(ERRCODE_DATA_CORRUPTED), ! errmsg("corrupted item pointer: offset = %u, length = %u", ! offset, alignedlen))); ! memcpy(workspace.page + new_pd_upper, ! 
(char *) page + offset, ! alignedlen); ! newlp = PageGetItemId(workspace.page, nextnew); ! *newlp = *lp; ! newlp->lp_off = new_pd_upper; ! nextnew++; } } *************** PageIndexMultiDelete(Page page, OffsetNu *** 922,942 **** if (nextitm != nitems) elog(ERROR, "incorrect index offsets supplied"); - if (totallen > (Size) (pd_special - pd_lower)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted item lengths: total %u, available space %u", - (unsigned int) totallen, pd_special - pd_lower))); - /* ! * Looks good. Overwrite the line pointers with the copy, from which we've ! * removed all the unused items. */ ! memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData)); ! phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData); ! /* and compactify the tuple data */ ! compactify_tuples(itemidbase, nused, page); } --- 886,906 ---- if (nextitm != nitems) elog(ERROR, "incorrect index offsets supplied"); /* ! * Okay, copy lower and upper workspace areas back to the real page. */ ! lpalen = (nextnew - FirstOffsetNumber) * sizeof(ItemIdData); ! if (lpalen > 0) ! memcpy((char *) page + SizeOfPageHeaderData, ! workspace.page + SizeOfPageHeaderData, ! lpalen); ! if (new_pd_upper < pd_special) ! memcpy((char *) page + new_pd_upper, ! workspace.page + new_pd_upper, ! pd_special - new_pd_upper); ! ((PageHeader) page)->pd_lower = SizeOfPageHeaderData + lpalen; ! ((PageHeader) page)->pd_upper = new_pd_upper; }
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers