On Mon, 2007-05-21 at 16:03 +0200, Peter Eisentraut wrote: > Am Montag, 21. Mai 2007 00:01 schrieb Jeff Davis: > > Here is the latest version of my patch that's revised according to my > > discussions with Heikki and Simon: > > This patch was apparently done against 8.2.4, but it doesn't apply to CVS > head. >
Thanks. Here's a version that applies cleanly to CVS head. Regards, Jeff Davis
diff -rc pgsql/src/backend/access/heap/heapam.c pgsql-ss/src/backend/access/heap/heapam.c *** pgsql/src/backend/access/heap/heapam.c 2007-04-07 18:26:27.000000000 -0700 --- pgsql-ss/src/backend/access/heap/heapam.c 2007-05-21 07:49:17.000000000 -0700 *************** *** 67,72 **** --- 67,300 ---- * ---------------------------------------------------------------- */ + static BlockNumber ss_init(HeapScanDesc); + static int ss_store_hint(HeapScanDesc,BlockNumber); + static ss_hints_t *ss_hints_init(); + static BlockNumber ss_hints_search(ss_hints_t*,RelFileNode,BlockNumber,bool); + + bool Trace_sync_seqscan = false; + double sync_seqscan_threshold = DEFAULT_SYNC_SCAN_THRESHOLD; + + + /* + * ss_hints_init: + * + * Creates and initializes hint structure of size + * nelem. If the structure already exists, + * it will simply be returned. + * + * It is assumed that SYNC_SCAN_NELEM > 1. + * + * The hint structure is essentially a + * doubly-linked LRU with head and tail + * pointer, but designed to hold a fixed maximum + * number of elements in fixed-size shared memory. + */ + static ss_hints_t *ss_hints_init() + { + int i; + int size; + int nelem = SYNC_SCAN_NELEM; + bool found; + ss_hints_t *hints; + + Assert(nelem > 1); + + size = sizeof(ss_hints_t) + sizeof(ss_lru_t) + \ + (sizeof(ss_lru_item_t) + sizeof(ss_lru_item_t*)) * nelem; + + hints = (ss_hints_t*)ShmemInitStruct("Sync Scan Hint List", + size,&found); + + if(found) + return hints; + + if(Trace_sync_seqscan) + elog(DEBUG2,"SYNC_SCAN: Created Hint List"); + + /* + * Because it needs to fit in one fixed-size shared memory chunk, + * the code below uses some ugly offset math and typecasts. + * + * Also, it maintains it's own rudimentary freelist to manage the + * memory it has. + */ + hints->lru = (ss_lru_t*)((char*)hints + sizeof(ss_hints_t)); + hints->lru->nelem = nelem; + hints->lru->head = NULL; + hints->lru->tail = NULL; + hints->lru->first = (ss_lru_item_t*)((char*)hints->lru + sizeof(ss_lru_t)); + hints->lru->freelist = (ss_lru_item_t**)(hints->lru->first + nelem); + hints->lru->free = hints->lru->freelist; + for( i = nelem-1; i >= 0; i-- ) { + *hints->lru->free = hints->lru->first + i; + hints->lru->free++; + } + return hints; + } + + /* + * ss_hints_search: + * + * This searches the hint structure for a hint with the given + * relfilenode. + * + * If "set" is true, it will set the location in the hint to + * the given location, and return that location. If not, it + * will return the pre-existing location without modification. + * + * If the hint is not found, it will be created + * at the head of the list with the given location, and that + * location will be returned. + * + */ + static BlockNumber ss_hints_search(ss_hints_t *hints, RelFileNode relfilenode, + BlockNumber location, bool set) + { + ss_lru_item_t *item; + ss_lru_t *lru = hints->lru; + + item = lru->head; + + while(item != NULL) { + if(RelFileNodeEquals(item->hint.relfilenode,relfilenode)) { + if(set) + item->hint.location = location; + if(item == lru->head) + return lru->head->hint.location; + + if(item == lru->tail) + lru->tail = item->prev; + item->prev->next = item->next; + if(item->next) + item->next->prev = item->prev; + + item->prev = NULL; + item->next = lru->head; + lru->head->prev = item; + lru->head = item; + + return lru->head->hint.location; + } + item = item->next; + } + + // if list is full, remove tail + if(lru->free == lru->freelist) { + *lru->free = lru->tail; + lru->free++; + Assert(lru->tail->prev != NULL); + lru->tail->prev->next = NULL; + lru->tail = lru->tail->prev; + } + + lru->free--; + item = *lru->free; + item->hint.relfilenode = relfilenode; + item->hint.location = location; + if(lru->head != NULL) + lru->head->prev = item; + item->next = lru->head; + item->prev = NULL; + lru->head = item; + if(item->next == NULL) + lru->tail = item; + + return lru->head->hint.location; + } + + /* + * ss_init: + * + * This function reads the Sync Scan hint structure + * (creating it if it doesn't already exist) to + * find a possible location for an already running + * sequential scan on this relation. + * + * By starting a sequential scan near the location + * of an already running scan, we improve the chance + * of finding pages in cache. + * + * This function assumes that scan->rs_nblocks is + * already properly set, and sets scan->rs_start_page + * to a value based on the hint found. Also, it sets + * scan->rs_hints to point to the hint structure. + */ + static BlockNumber ss_init(HeapScanDesc scan) + { + int threshold = sync_seqscan_threshold * NBuffers; + + /* + * If the table is not large enough, or sync_scan_threshold + * is disabled (negative), don't Sync Scan. + */ + if(threshold < 0 || scan->rs_nblocks < threshold) + { + scan->rs_start_page = 0; + return 0; + } + + LWLockAcquire(SyncScanLock,LW_EXCLUSIVE); + scan->rs_hints = ss_hints_init(); + scan->rs_start_page = ss_hints_search(scan->rs_hints,scan->rs_rd->rd_node,0,false); + LWLockRelease(SyncScanLock); + + /* + * If the hint is not a valid block number + * for this relation, start at 0. + * + * This can happen if, for instance, someone + * TRUNCATEd the table between when the hint + * was set and now. + */ + if(scan->rs_start_page < 0 || + scan->rs_start_page >= scan->rs_nblocks) + { + if(Trace_sync_seqscan) + elog(DEBUG2,"SYNC_SCAN: Hint %d out of range." \ + " Relation has %d pages.", + scan->rs_start_page,scan->rs_nblocks); + scan->rs_start_page = 0; + return 0; + } + + if(Trace_sync_seqscan) + elog(DEBUG2,"SYNC_SCAN: START: OID = %d; Location = %d; Size: %d", + RelationGetRelid(scan->rs_rd), + scan->rs_start_page,scan->rs_nblocks); + + return 0; + } + + /* + * ss_store_hint: + * + * Writes an entry in the Sync Scan hint structure + * of the form (relfilenode,blocknumber), overwriting + * any existing hint for the same relfilenode. + * + * It's possible the hint returned is invalid, so the + * caller should check to see whether it's a valid + * BlockNumber for the relation in question. + */ + static int ss_store_hint(HeapScanDesc scan, BlockNumber location) + { + int threshold = sync_seqscan_threshold * NBuffers; + + /* + * If the table is not large enough, or sync_scan_threshold + * is disabled (negative), don't Sync Scan. + */ + if(threshold < 0 || scan->rs_nblocks < threshold) + return 0; + + LWLockAcquire(SyncScanLock,LW_EXCLUSIVE); + ss_hints_search(scan->rs_hints,scan->rs_rd->rd_node,location,true); + LWLockRelease(SyncScanLock); + + return 0; + } + /* ---------------- * initscan - scan code common to heap_beginscan and heap_rescan * ---------------- *************** *** 83,88 **** --- 311,321 ---- */ scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd); + /* + * Choose an good place to start the relation scan. + */ + ss_init(scan); + scan->rs_inited = false; scan->rs_ctup.t_data = NULL; ItemPointerSetInvalid(&scan->rs_ctup.t_self); *************** *** 203,208 **** --- 436,442 ---- Snapshot snapshot = scan->rs_snapshot; bool backward = ScanDirectionIsBackward(dir); BlockNumber page; + BlockNumber prevpage; Page dp; int lines; OffsetNumber lineoff; *************** *** 225,231 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; --- 459,469 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_init() ! */ ! page = scan->rs_start_page; heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; *************** *** 259,265 **** tuple->t_data = NULL; return; } ! page = scan->rs_nblocks - 1; /* final page */ heapgetpage(scan, page); } else --- 497,503 ---- tuple->t_data = NULL; return; } ! page = scan->rs_start_page; heapgetpage(scan, page); } else *************** *** 366,380 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 604,645 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. + * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + prevpage = page; + if(backward) { + if(page == 0) + page = scan->rs_nblocks; + page--; + } + else + page = (page + 1) % (scan->rs_nblocks); + + if(Trace_sync_seqscan) + { + if (!(page%50000)) + elog(DEBUG2,"page: %d",page); + else if (!(page%5000)) + elog(DEBUG3,"page: %d",page); + } + + if(! (page % SYNC_SCAN_REPORT_INTERVAL) ) + ss_store_hint(scan,page); + /* ! * Return NULL if we've exhausted all the pages. ! * For reverse scans, that means we've reached 0. For ! * forward scans, that means we've reached the page on ! * which we started. */ ! if((ScanDirectionIsForward(dir) && page == scan->rs_start_page) || ! (ScanDirectionIsBackward(dir) && prevpage == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 385,392 **** return; } - page = backward ? (page - 1) : (page + 1); - heapgetpage(scan, page); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); --- 650,655 ---- *************** *** 429,434 **** --- 692,698 ---- HeapTuple tuple = &(scan->rs_ctup); bool backward = ScanDirectionIsBackward(dir); BlockNumber page; + BlockNumber prevpage; Page dp; int lines; int lineindex; *************** *** 452,458 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; --- 716,726 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_init() ! */ ! page = scan->rs_start_page; heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; *************** *** 483,489 **** tuple->t_data = NULL; return; } ! page = scan->rs_nblocks - 1; /* final page */ heapgetpage(scan, page); } else --- 751,757 ---- tuple->t_data = NULL; return; } ! page = scan->rs_start_page; heapgetpage(scan, page); } else *************** *** 587,600 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 855,894 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. + * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. */ + prevpage = page; + if(backward) { + if(page == 0) + page = scan->rs_nblocks; + page--; + } + else + page = (page + 1) % (scan->rs_nblocks); + + if(Trace_sync_seqscan) + { + if (!(page%50000)) + elog(DEBUG2,"page: %d",page); + else if (!(page%5000)) + elog(DEBUG3,"page: %d",page); + } + + if(! (page % SYNC_SCAN_REPORT_INTERVAL) ) + ss_store_hint(scan,page); /* ! * Return NULL if we've exhausted all the pages. ! * For reverse scans, that means we've reached 0. For ! * forward scans, that means we've reached the page on ! * which we started. */ ! if((ScanDirectionIsForward(dir) && page == scan->rs_start_page) || ! (ScanDirectionIsBackward(dir) && prevpage == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 605,611 **** return; } - page = backward ? (page - 1) : (page + 1); heapgetpage(scan, page); dp = (Page) BufferGetPage(scan->rs_cbuf); --- 899,904 ---- *************** *** 618,623 **** --- 911,928 ---- } } + /* + * SyncScanShmemSize: + * + * Called by CreateSharedMemoryAndSemaphores() + * to find out how much room the Sync Scan hint + * structure will need to occupy. + */ + Size SyncScanShmemSize(void) + { + return sizeof(ss_hints_t) + sizeof(ss_lru_t) + \ + (sizeof(ss_lru_item_t) + sizeof(ss_lru_item_t*)) * SYNC_SCAN_NELEM; + } #if defined(DISABLE_COMPLEX_MACRO) /* Only in pgsql-ss/src/backend/access/heap: heapam.c.orig diff -rc pgsql/src/backend/storage/ipc/ipci.c pgsql-ss/src/backend/storage/ipc/ipci.c *** pgsql/src/backend/storage/ipc/ipci.c 2007-02-15 15:23:23.000000000 -0800 --- pgsql-ss/src/backend/storage/ipc/ipci.c 2007-05-21 07:49:17.000000000 -0700 *************** *** 19,24 **** --- 19,25 ---- #include "access/nbtree.h" #include "access/subtrans.h" #include "access/twophase.h" + #include "access/heapam.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/autovacuum.h" *************** *** 112,117 **** --- 113,119 ---- size = add_size(size, BgWriterShmemSize()); size = add_size(size, AutoVacuumShmemSize()); size = add_size(size, BTreeShmemSize()); + size = add_size(size, SyncScanShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif Only in pgsql-ss/src/backend/storage/ipc: ipci.c.orig diff -rc pgsql/src/backend/utils/misc/guc.c pgsql-ss/src/backend/utils/misc/guc.c *** pgsql/src/backend/utils/misc/guc.c 2007-05-08 09:33:51.000000000 -0700 --- pgsql-ss/src/backend/utils/misc/guc.c 2007-05-21 07:49:17.000000000 -0700 *************** *** 25,31 **** #include <syslog.h> #endif ! #include "access/gin.h" #include "access/transam.h" #include "access/twophase.h" --- 25,31 ---- #include <syslog.h> #endif ! #include "access/heapam.h" #include "access/gin.h" #include "access/transam.h" #include "access/twophase.h" *************** *** 783,788 **** --- 783,798 ---- false, NULL, NULL }, + { + {"trace_sync_seqscan", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Generates debugging output for Synchronized Scans."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &Trace_sync_seqscan, + false, NULL, NULL + }, + #ifdef LOCK_DEBUG { {"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS, *************** *** 1803,1809 **** DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS, MAX_GEQO_SELECTION_BIAS, NULL, NULL }, ! { {"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES, gettext_noop("Background writer percentage of LRU buffers to flush per round."), --- 1813,1826 ---- DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS, MAX_GEQO_SELECTION_BIAS, NULL, NULL }, ! { ! {"sync_seqscan_threshold", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN, ! gettext_noop("Minimum size of table before synchronized scanning takes effect, as a fraction of shared_buffers."), ! NULL ! }, ! &sync_seqscan_threshold, ! DEFAULT_SYNC_SCAN_THRESHOLD, -1.0, 100.0, NULL, NULL ! }, { {"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES, gettext_noop("Background writer percentage of LRU buffers to flush per round."), Only in pgsql-ss/src/backend/utils/misc: guc.c.orig diff -rc pgsql/src/include/access/heapam.h pgsql-ss/src/include/access/heapam.h *** pgsql/src/include/access/heapam.h 2007-04-07 18:26:33.000000000 -0700 --- pgsql-ss/src/include/access/heapam.h 2007-05-21 07:49:17.000000000 -0700 *************** *** 25,30 **** --- 25,47 ---- #include "utils/rel.h" #include "utils/tqual.h" + /* + * Size of the Sync Scan Hint list. + */ + #define SYNC_SCAN_NELEM 100 + + /* + * Interval between reports of the location + * of the current scan, in pages. + */ + #define SYNC_SCAN_REPORT_INTERVAL 16 + + #define DEFAULT_SYNC_SCAN_THRESHOLD 1.0 + + extern DLLIMPORT bool Trace_sync_seqscan; + extern DLLIMPORT double sync_seqscan_threshold; + extern Size SyncScanShmemSize(void); + /* ---------------- * fastgetattr * diff -rc pgsql/src/include/access/relscan.h pgsql-ss/src/include/access/relscan.h *** pgsql/src/include/access/relscan.h 2007-01-20 10:43:35.000000000 -0800 --- pgsql-ss/src/include/access/relscan.h 2007-05-21 07:49:17.000000000 -0700 *************** *** 16,22 **** --- 16,54 ---- #include "access/skey.h" #include "storage/bufpage.h" + #include "storage/lwlock.h" #include "utils/tqual.h" + #include "utils/dynahash.h" + + + /* + * These structures (beginning with ss_) are used for + * synchronized scans, see heapam.c. + */ + typedef struct ss_hint_t { + RelFileNode relfilenode; /* The relfilenode that tags this hint entry */ + BlockNumber location; /* The location in the relation */ + } ss_hint_t; + + typedef struct ss_lru_item_t { + struct ss_lru_item_t *prev; + struct ss_lru_item_t *next; + ss_hint_t hint; + } ss_lru_item_t; + + typedef struct ss_lru_t { + ss_lru_item_t *head; + ss_lru_item_t *tail; + ss_lru_item_t *first; + ss_lru_item_t **freelist; + ss_lru_item_t **free; + int nelem; + } ss_lru_t; + + typedef struct ss_hints_t { + ss_lru_t *lru; + } ss_hints_t; + typedef struct HeapScanDescData *************** *** 33,38 **** --- 65,72 ---- bool rs_inited; /* false = scan not init'd yet */ HeapTupleData rs_ctup; /* current tuple in scan, if any */ BlockNumber rs_cblock; /* current block # in scan, if any */ + BlockNumber rs_start_page; /* page where this scan began */ + ss_hints_t *rs_hints; /* pointer to scan hint */ Buffer rs_cbuf; /* current buffer in scan, if any */ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ ItemPointerData rs_mctid; /* marked scan position, if any */ diff -rc pgsql/src/include/storage/lwlock.h pgsql-ss/src/include/storage/lwlock.h *** pgsql/src/include/storage/lwlock.h 2007-04-16 11:30:04.000000000 -0700 --- pgsql-ss/src/include/storage/lwlock.h 2007-05-21 07:52:17.000000000 -0700 *************** *** 62,67 **** --- 62,68 ---- AddinShmemInitLock, AutovacuumLock, AutovacuumScheduleLock, + SyncScanLock, /* Individual lock IDs end here */ FirstBufMappingLock, FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS, Only in pgsql-ss/src/include/storage: lwlock.h.orig diff -rc pgsql/src/include/utils/guc_tables.h pgsql-ss/src/include/utils/guc_tables.h *** pgsql/src/include/utils/guc_tables.h 2007-04-21 13:02:41.000000000 -0700 --- pgsql-ss/src/include/utils/guc_tables.h 2007-05-21 07:49:17.000000000 -0700 *************** *** 56,61 **** --- 56,62 ---- QUERY_TUNING_METHOD, QUERY_TUNING_COST, QUERY_TUNING_GEQO, + QUERY_TUNING_SYNC_SEQSCAN, QUERY_TUNING_OTHER, LOGGING, LOGGING_WHERE,
---------------------------(end of broadcast)--------------------------- TIP 5: don't forget to increase your free space map settings