Here is the latest version of my patch that's revised according to my
discussions with Heikki and Simon:
Changes:
* uses LWLocks when accessing shared memory
* removes the "sync_seqscan_offset" feature
* uses the relfilenode as a key rather than relation OID
* fixes regression test failure
* uses a simple LRU (that stays in fixed-size shared memory) to track
the locations of other concurrent scans
For the LRU I used a doubly-linked list, which isn't strictly necessary.
However, we may decide to use a hash table for the lookup, in which case
the extra pointers will be useful.
Regards,
Jeff Davis
diff -rc postgresql-8.2.4/src/backend/access/heap/heapam.c postgresql-8.2.4-ss/src/backend/access/heap/heapam.c
*** postgresql-8.2.4/src/backend/access/heap/heapam.c 2007-02-04 12:00:49.000000000 -0800
--- postgresql-8.2.4-ss/src/backend/access/heap/heapam.c 2007-05-20 14:26:43.000000000 -0700
***************
*** 65,70 ****
--- 65,298 ----
* ----------------------------------------------------------------
*/
+ static BlockNumber ss_init(HeapScanDesc);
+ static int ss_store_hint(HeapScanDesc,BlockNumber);
+ static ss_hints_t *ss_hints_init();
+ static BlockNumber ss_hints_search(ss_hints_t*,RelFileNode,BlockNumber,bool);
+
+ bool Trace_sync_seqscan = false;
+ double sync_seqscan_threshold = DEFAULT_SYNC_SCAN_THRESHOLD;
+
+
+ /*
+ * ss_hints_init:
+ *
+ * Creates and initializes hint structure of size
+ * nelem. If the structure already exists,
+ * it will simply be returned.
+ *
+ * It is assumed that SYNC_SCAN_NELEM > 1.
+ *
+ * The hint structure is essentially a
+ * doubly-linked LRU with head and tail
+ * pointer, but designed to hold a fixed maximum
+ * number of elements in fixed-size shared memory.
+ */
+ static ss_hints_t *ss_hints_init()
+ {
+ int i;
+ int size;
+ int nelem = SYNC_SCAN_NELEM;
+ bool found;
+ ss_hints_t *hints;
+
+ Assert(nelem > 1);
+
+ size = sizeof(ss_hints_t) + sizeof(ss_lru_t) + \
+ (sizeof(ss_lru_item_t) + sizeof(ss_lru_item_t*)) * nelem;
+
+ hints = (ss_hints_t*)ShmemInitStruct("Sync Scan Hint List",
+ size,&found);
+
+ if(found)
+ return hints;
+
+ if(Trace_sync_seqscan)
+ elog(DEBUG2,"SYNC_SCAN: Created Hint List");
+
+ /*
+ * Because it needs to fit in one fixed-size shared memory chunk,
+ * the code below uses some ugly offset math and typecasts.
+ *
+ * Also, it maintains it's own rudimentary freelist to manage the
+ * memory it has.
+ */
+ hints->lru = (ss_lru_t*)((char*)hints + sizeof(ss_hints_t));
+ hints->lru->nelem = nelem;
+ hints->lru->head = NULL;
+ hints->lru->tail = NULL;
+ hints->lru->first = (ss_lru_item_t*)((char*)hints->lru + sizeof(ss_lru_t));
+ hints->lru->freelist = (ss_lru_item_t**)(hints->lru->first + nelem);
+ hints->lru->free = hints->lru->freelist;
+ for( i = nelem-1; i >= 0; i-- ) {
+ *hints->lru->free = hints->lru->first + i;
+ hints->lru->free++;
+ }
+ return hints;
+ }
+
+ /*
+ * ss_hints_search:
+ *
+ * This searches the hint structure for a hint with the given
+ * relfilenode.
+ *
+ * If "set" is true, it will set the location in the hint to
+ * the given location, and return that location. If not, it
+ * will return the pre-existing location without modification.
+ *
+ * If the hint is not found, it will be created
+ * at the head of the list with the given location, and that
+ * location will be returned.
+ *
+ */
+ static BlockNumber ss_hints_search(ss_hints_t *hints, RelFileNode relfilenode,
+ BlockNumber location, bool set)
+ {
+ ss_lru_item_t *item;
+ ss_lru_t *lru = hints->lru;
+
+ item = lru->head;
+
+ while(item != NULL) {
+ if(RelFileNodeEquals(item->hint.relfilenode,relfilenode)) {
+ if(set)
+ item->hint.location = location;
+ if(item == lru->head)
+ return lru->head->hint.location;
+
+ if(item == lru->tail)
+ lru->tail = item->prev;
+ item->prev->next = item->next;
+ if(item->next)
+ item->next->prev = item->prev;
+
+ item->prev = NULL;
+ item->next = lru->head;
+ lru->head->prev = item;
+ lru->head = item;
+
+ return lru->head->hint.location;
+ }
+ item = item->next;
+ }
+
+ // if list is full, remove tail
+ if(lru->free == lru->freelist) {
+ *lru->free = lru->tail;
+ lru->free++;
+ Assert(lru->tail->prev != NULL);
+ lru->tail->prev->next = NULL;
+ lru->tail = lru->tail->prev;
+ }
+
+ lru->free--;
+ item = *lru->free;
+ item->hint.relfilenode = relfilenode;
+ item->hint.location = location;
+ if(lru->head != NULL)
+ lru->head->prev = item;
+ item->next = lru->head;
+ item->prev = NULL;
+ lru->head = item;
+ if(item->next == NULL)
+ lru->tail = item;
+
+ return lru->head->hint.location;
+ }
+
+ /*
+ * ss_init:
+ *
+ * This function reads the Sync Scan hint structure
+ * (creating it if it doesn't already exist) to
+ * find a possible location for an already running
+ * sequential scan on this relation.
+ *
+ * By starting a sequential scan near the location
+ * of an already running scan, we improve the chance
+ * of finding pages in cache.
+ *
+ * This function assumes that scan->rs_nblocks is
+ * already properly set, and sets scan->rs_start_page
+ * to a value based on the hint found. Also, it sets
+ * scan->rs_hints to point to the hint structure.
+ */
+ static BlockNumber ss_init(HeapScanDesc scan)
+ {
+ int threshold = sync_seqscan_threshold * NBuffers;
+
+ /*
+ * If the table is not large enough, or sync_scan_threshold
+ * is disabled (negative), don't Sync Scan.
+ */
+ if(threshold < 0 || scan->rs_nblocks < threshold)
+ {
+ scan->rs_start_page = 0;
+ return 0;
+ }
+
+ LWLockAcquire(SyncScanLock,LW_EXCLUSIVE);
+ scan->rs_hints = ss_hints_init();
+ scan->rs_start_page = ss_hints_search(scan->rs_hints,scan->rs_rd->rd_node,0,false);
+ LWLockRelease(SyncScanLock);
+
+ /*
+ * If the hint is not a valid block number
+ * for this relation, start at 0.
+ *
+ * This can happen if, for instance, someone
+ * TRUNCATEd the table between when the hint
+ * was set and now.
+ */
+ if(scan->rs_start_page < 0 ||
+ scan->rs_start_page >= scan->rs_nblocks)
+ {
+ if(Trace_sync_seqscan)
+ elog(DEBUG2,"SYNC_SCAN: Hint %d out of range." \
+ " Relation has %d pages.",
+ scan->rs_start_page,scan->rs_nblocks);
+ scan->rs_start_page = 0;
+ return 0;
+ }
+
+ if(Trace_sync_seqscan)
+ elog(DEBUG2,"SYNC_SCAN: START: OID = %d; Location = %d; Size: %d",
+ RelationGetRelid(scan->rs_rd),
+ scan->rs_start_page,scan->rs_nblocks);
+
+ return 0;
+ }
+
+ /*
+ * ss_store_hint:
+ *
+ * Writes an entry in the Sync Scan hint structure
+ * of the form (relfilenode,blocknumber), overwriting
+ * any existing hint for the same relfilenode.
+ *
+ * It's possible the hint returned is invalid, so the
+ * caller should check to see whether it's a valid
+ * BlockNumber for the relation in question.
+ */
+ static int ss_store_hint(HeapScanDesc scan, BlockNumber location)
+ {
+ int threshold = sync_seqscan_threshold * NBuffers;
+
+ /*
+ * If the table is not large enough, or sync_scan_threshold
+ * is disabled (negative), don't Sync Scan.
+ */
+ if(threshold < 0 || scan->rs_nblocks < threshold)
+ return 0;
+
+ LWLockAcquire(SyncScanLock,LW_EXCLUSIVE);
+ ss_hints_search(scan->rs_hints,scan->rs_rd->rd_node,location,true);
+ LWLockRelease(SyncScanLock);
+
+ return 0;
+ }
+
/* ----------------
* initscan - scan code common to heap_beginscan and heap_rescan
* ----------------
***************
*** 81,86 ****
--- 309,319 ----
*/
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
+ /*
+ * Choose an good place to start the relation scan.
+ */
+ ss_init(scan);
+
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 201,206 ****
--- 434,440 ----
Snapshot snapshot = scan->rs_snapshot;
bool backward = ScanDirectionIsBackward(dir);
BlockNumber page;
+ BlockNumber prevpage;
Page dp;
int lines;
OffsetNumber lineoff;
***************
*** 223,229 ****
tuple->t_data = NULL;
return;
}
! page = 0; /* first page */
heapgetpage(scan, page);
lineoff = FirstOffsetNumber; /* first offnum */
scan->rs_inited = true;
--- 457,467 ----
tuple->t_data = NULL;
return;
}
! /*
! * start the scan at the location that we chose
! * in ss_init()
! */
! page = scan->rs_start_page;
heapgetpage(scan, page);
lineoff = FirstOffsetNumber; /* first offnum */
scan->rs_inited = true;
***************
*** 257,263 ****
tuple->t_data = NULL;
return;
}
! page = scan->rs_nblocks - 1; /* final page */
heapgetpage(scan, page);
}
else
--- 495,501 ----
tuple->t_data = NULL;
return;
}
! page = scan->rs_start_page;
heapgetpage(scan, page);
}
else
***************
*** 364,378 ****
}
/*
! * if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
/*
! * return NULL if we've exhausted all the pages
*/
! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
--- 602,643 ----
}
/*
! * If we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
+ *
+ * For the forward scan, we need to wrap around to the beginning
+ * of the relation file if we reach the end.
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ prevpage = page;
+ if(backward) {
+ if(page == 0)
+ page = scan->rs_nblocks;
+ page--;
+ }
+ else
+ page = (page + 1) % (scan->rs_nblocks);
+
+ if(Trace_sync_seqscan)
+ {
+ if (!(page%50000))
+ elog(DEBUG2,"page: %d",page);
+ else if (!(page%5000))
+ elog(DEBUG3,"page: %d",page);
+ }
+
+ if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ ss_store_hint(scan,page);
+
/*
! * Return NULL if we've exhausted all the pages.
! * For reverse scans, that means we've reached 0. For
! * forward scans, that means we've reached the page on
! * which we started.
*/
! if((ScanDirectionIsForward(dir) && page == scan->rs_start_page) ||
! (ScanDirectionIsBackward(dir) && prevpage == scan->rs_start_page))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
***************
*** 383,390 ****
return;
}
- page = backward ? (page - 1) : (page + 1);
-
heapgetpage(scan, page);
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
--- 648,653 ----
***************
*** 427,432 ****
--- 690,696 ----
HeapTuple tuple = &(scan->rs_ctup);
bool backward = ScanDirectionIsBackward(dir);
BlockNumber page;
+ BlockNumber prevpage;
Page dp;
int lines;
int lineindex;
***************
*** 450,456 ****
tuple->t_data = NULL;
return;
}
! page = 0; /* first page */
heapgetpage(scan, page);
lineindex = 0;
scan->rs_inited = true;
--- 714,724 ----
tuple->t_data = NULL;
return;
}
! /*
! * start the scan at the location that we chose
! * in ss_init()
! */
! page = scan->rs_start_page;
heapgetpage(scan, page);
lineindex = 0;
scan->rs_inited = true;
***************
*** 481,487 ****
tuple->t_data = NULL;
return;
}
! page = scan->rs_nblocks - 1; /* final page */
heapgetpage(scan, page);
}
else
--- 749,755 ----
tuple->t_data = NULL;
return;
}
! page = scan->rs_start_page;
heapgetpage(scan, page);
}
else
***************
*** 585,598 ****
}
/*
! * if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
/*
! * return NULL if we've exhausted all the pages
*/
! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
--- 853,892 ----
}
/*
! * If we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
+ *
+ * For the forward scan, we need to wrap around to the beginning
+ * of the relation file if we reach the end.
*/
+ prevpage = page;
+ if(backward) {
+ if(page == 0)
+ page = scan->rs_nblocks;
+ page--;
+ }
+ else
+ page = (page + 1) % (scan->rs_nblocks);
+
+ if(Trace_sync_seqscan)
+ {
+ if (!(page%50000))
+ elog(DEBUG2,"page: %d",page);
+ else if (!(page%5000))
+ elog(DEBUG3,"page: %d",page);
+ }
+
+ if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ ss_store_hint(scan,page);
/*
! * Return NULL if we've exhausted all the pages.
! * For reverse scans, that means we've reached 0. For
! * forward scans, that means we've reached the page on
! * which we started.
*/
! if((ScanDirectionIsForward(dir) && page == scan->rs_start_page) ||
! (ScanDirectionIsBackward(dir) && prevpage == scan->rs_start_page))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
***************
*** 603,609 ****
return;
}
- page = backward ? (page - 1) : (page + 1);
heapgetpage(scan, page);
dp = (Page) BufferGetPage(scan->rs_cbuf);
--- 897,902 ----
***************
*** 616,621 ****
--- 909,926 ----
}
}
+ /*
+ * SyncScanShmemSize:
+ *
+ * Called by CreateSharedMemoryAndSemaphores()
+ * to find out how much room the Sync Scan hint
+ * structure will need to occupy.
+ */
+ Size SyncScanShmemSize(void)
+ {
+ return sizeof(ss_hints_t) + sizeof(ss_lru_t) + \
+ (sizeof(ss_lru_item_t) + sizeof(ss_lru_item_t*)) * SYNC_SCAN_NELEM;
+ }
#if defined(DISABLE_COMPLEX_MACRO)
/*
diff -rc postgresql-8.2.4/src/backend/storage/ipc/ipci.c postgresql-8.2.4-ss/src/backend/storage/ipc/ipci.c
*** postgresql-8.2.4/src/backend/storage/ipc/ipci.c 2006-10-15 15:04:07.000000000 -0700
--- postgresql-8.2.4-ss/src/backend/storage/ipc/ipci.c 2007-05-17 08:17:59.000000000 -0700
***************
*** 19,24 ****
--- 19,25 ----
#include "access/nbtree.h"
#include "access/subtrans.h"
#include "access/twophase.h"
+ #include "access/heapam.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
***************
*** 110,115 ****
--- 111,117 ----
size = add_size(size, FreeSpaceShmemSize());
size = add_size(size, BgWriterShmemSize());
size = add_size(size, BTreeShmemSize());
+ size = add_size(size, SyncScanShmemSize());
#ifdef EXEC_BACKEND
size = add_size(size, ShmemBackendArraySize());
#endif
diff -rc postgresql-8.2.4/src/backend/utils/misc/guc.c postgresql-8.2.4-ss/src/backend/utils/misc/guc.c
*** postgresql-8.2.4/src/backend/utils/misc/guc.c 2006-11-29 06:50:07.000000000 -0800
--- postgresql-8.2.4-ss/src/backend/utils/misc/guc.c 2007-05-17 08:18:58.000000000 -0700
***************
*** 25,31 ****
#include <syslog.h>
#endif
!
#include "access/gin.h"
#include "access/twophase.h"
#include "access/xact.h"
--- 25,31 ----
#include <syslog.h>
#endif
! #include "access/heapam.h"
#include "access/gin.h"
#include "access/twophase.h"
#include "access/xact.h"
***************
*** 758,763 ****
--- 758,773 ----
false, NULL, NULL
},
+ {
+ {"trace_sync_seqscan", PGC_USERSET, DEVELOPER_OPTIONS,
+ gettext_noop("Generates debugging output for Synchronized Scans."),
+ NULL,
+ GUC_NOT_IN_SAMPLE
+ },
+ &Trace_sync_seqscan,
+ false, NULL, NULL
+ },
+
#ifdef LOCK_DEBUG
{
{"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS,
***************
*** 1723,1729 ****
DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS,
MAX_GEQO_SELECTION_BIAS, NULL, NULL
},
!
{
{"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES,
gettext_noop("Background writer percentage of LRU buffers to flush per round"),
--- 1733,1746 ----
DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS,
MAX_GEQO_SELECTION_BIAS, NULL, NULL
},
! {
! {"sync_seqscan_threshold", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN,
! gettext_noop("Minimum size of table before synchronized scanning takes effect, as a fraction of shared_buffers."),
! NULL
! },
! &sync_seqscan_threshold,
! DEFAULT_SYNC_SCAN_THRESHOLD, -1.0, 100.0, NULL, NULL
! },
{
{"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES,
gettext_noop("Background writer percentage of LRU buffers to flush per round"),
diff -rc postgresql-8.2.4/src/include/access/heapam.h postgresql-8.2.4-ss/src/include/access/heapam.h
*** postgresql-8.2.4/src/include/access/heapam.h 2006-11-05 14:42:10.000000000 -0800
--- postgresql-8.2.4-ss/src/include/access/heapam.h 2007-05-20 14:04:51.000000000 -0700
***************
*** 25,30 ****
--- 25,47 ----
#include "utils/rel.h"
#include "utils/tqual.h"
+ /*
+ * Size of the Sync Scan Hint list.
+ */
+ #define SYNC_SCAN_NELEM 100
+
+ /*
+ * Interval between reports of the location
+ * of the current scan, in pages.
+ */
+ #define SYNC_SCAN_REPORT_INTERVAL 16
+
+ #define DEFAULT_SYNC_SCAN_THRESHOLD 1.0
+
+ extern DLLIMPORT bool Trace_sync_seqscan;
+ extern DLLIMPORT double sync_seqscan_threshold;
+ extern Size SyncScanShmemSize(void);
+
/* ----------------
* fastgetattr
*
diff -rc postgresql-8.2.4/src/include/access/relscan.h postgresql-8.2.4-ss/src/include/access/relscan.h
*** postgresql-8.2.4/src/include/access/relscan.h 2006-10-03 17:30:07.000000000 -0700
--- postgresql-8.2.4-ss/src/include/access/relscan.h 2007-05-20 14:04:11.000000000 -0700
***************
*** 16,22 ****
--- 16,54 ----
#include "access/skey.h"
#include "storage/bufpage.h"
+ #include "storage/lwlock.h"
#include "utils/tqual.h"
+ #include "utils/dynahash.h"
+
+
+ /*
+ * These structures (beginning with ss_) are used for
+ * synchronized scans, see heapam.c.
+ */
+ typedef struct ss_hint_t {
+ RelFileNode relfilenode; /* The relfilenode that tags this hint entry */
+ BlockNumber location; /* The location in the relation */
+ } ss_hint_t;
+
+ typedef struct ss_lru_item_t {
+ struct ss_lru_item_t *prev;
+ struct ss_lru_item_t *next;
+ ss_hint_t hint;
+ } ss_lru_item_t;
+
+ typedef struct ss_lru_t {
+ ss_lru_item_t *head;
+ ss_lru_item_t *tail;
+ ss_lru_item_t *first;
+ ss_lru_item_t **freelist;
+ ss_lru_item_t **free;
+ int nelem;
+ } ss_lru_t;
+
+ typedef struct ss_hints_t {
+ ss_lru_t *lru;
+ } ss_hints_t;
+
typedef struct HeapScanDescData
***************
*** 33,38 ****
--- 65,72 ----
bool rs_inited; /* false = scan not init'd yet */
HeapTupleData rs_ctup; /* current tuple in scan, if any */
BlockNumber rs_cblock; /* current block # in scan, if any */
+ BlockNumber rs_start_page; /* page where this scan began */
+ ss_hints_t *rs_hints; /* pointer to scan hint */
Buffer rs_cbuf; /* current buffer in scan, if any */
/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
ItemPointerData rs_mctid; /* marked scan position, if any */
diff -rc postgresql-8.2.4/src/include/storage/lwlock.h postgresql-8.2.4-ss/src/include/storage/lwlock.h
*** postgresql-8.2.4/src/include/storage/lwlock.h 2006-10-15 15:04:07.000000000 -0700
--- postgresql-8.2.4-ss/src/include/storage/lwlock.h 2007-05-17 22:28:46.000000000 -0700
***************
*** 61,66 ****
--- 61,67 ----
TablespaceCreateLock,
BtreeVacuumLock,
AddinShmemInitLock,
+ SyncScanLock,
FirstBufMappingLock,
FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff -rc postgresql-8.2.4/src/include/utils/guc_tables.h postgresql-8.2.4-ss/src/include/utils/guc_tables.h
*** postgresql-8.2.4/src/include/utils/guc_tables.h 2006-10-03 14:11:55.000000000 -0700
--- postgresql-8.2.4-ss/src/include/utils/guc_tables.h 2007-05-17 08:17:59.000000000 -0700
***************
*** 56,61 ****
--- 56,62 ----
QUERY_TUNING_METHOD,
QUERY_TUNING_COST,
QUERY_TUNING_GEQO,
+ QUERY_TUNING_SYNC_SEQSCAN,
QUERY_TUNING_OTHER,
LOGGING,
LOGGING_WHERE,
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend