This is my latest revision of the Sync Scan patch, and it implements the
observability as discussed with Simon.
Changes:
* ss_report_loc() (ss_store_hint() in the attached patch) is called once
per hundred pages rather than once per page
* DEBUG messages are a little cleaner and easier to parse, for the sake
of analysis after the fact.
* DEBUG2 reports a sync scan starting, the relation size in pages, and
the location at which the scan starts.
* DEBUG2 reports the location of a scan every 50k pages, DEBUG3 every
5k pages (before it was 100k/10k at DEBUG3/DEBUG4, respectively).
Numbers are aligned along 5k boundaries to make analysis easier.
* GUCs:
* sync_seqscan_threshold: fraction of NBuffers for the threshold
* sync_seqscan_offset: fraction of NBuffers for the offset
* trace_sync_seqscan: will be used in final version of patch to
control DEBUG output
sync_seqscan_offset may be eliminated completely if it's not shown to be
useful enough in conjunction with Simon's patch. Sync Scans are still a
big win without sync_seqscan_offset.
sync_seqscan_threshold=<real> may be turned into sync_seqscan=<boolean>
with a fixed activation threshold (NBuffers/2 per Simon's suggestion).
The reason is that synchronized scans should activate at the same
threshold as Simon's scan_recycle_buffers feature. Should we make a
"#define BIG_SCAN_THRESHOLD NBuffers/2" to use for both sync_seqscan and
for scan_recycle_buffers?
Regards,
Jeff Davis
diff -cr postgresql-8.2.3/src/backend/access/heap/heapam.c postgresql-8.2.3-syncscan/src/backend/access/heap/heapam.c
*** postgresql-8.2.3/src/backend/access/heap/heapam.c 2007-02-04 12:00:49.000000000 -0800
--- postgresql-8.2.3-syncscan/src/backend/access/heap/heapam.c 2007-03-13 23:21:27.000000000 -0700
***************
*** 65,70 ****
--- 65,279 ----
* ----------------------------------------------------------------
*/
+ static BlockNumber ss_init(HeapScanDesc);
+ static int ss_store_hint(HeapScanDesc,BlockNumber);
+ static int ss_hash(HeapScanDesc);
+ bool Trace_sync_seqscan = false; /* GUC trace_sync_seqscan; NOTE(review): not read by ss_init()/ss_store_hint() as posted */
+ double sync_seqscan_threshold = DEFAULT_SYNC_SCAN_THRESHOLD; /* GUC: minimum relation size for sync scan, as a fraction of NBuffers */
+ double sync_seqscan_offset = DEFAULT_SYNC_SCAN_OFFSET; /* GUC: distance to start behind the hint, as a fraction of NBuffers */
+
+ /*
+  * ss_init:
+  *
+  * Choose the page at which a sequential scan of scan->rs_rd starts and
+  * store it in scan->rs_start_page.
+  *
+  * The Sync Scan Hint Table is looked up with ShmemInitStruct(); if it
+  * is reported as not found, it is zeroed here and the scan starts at
+  * page 0.  Otherwise, if the slot for this relation holds a usable
+  * hint (the location of an already running scan), the new scan starts
+  * near it to improve the chance of finding pages in cache.
+  *
+  * sync_seqscan_offset (a fraction of NBuffers) is subtracted from the
+  * hint, wrapping around the end of the relation, in order to pick up
+  * pages that are likely to already be in cache.
+  *
+  * This function assumes that scan->rs_nblocks is already properly set.
+  * On return:
+  *   scan->rs_start_page  is the page to start at (0 when no hint is used);
+  *   scan->rs_hint        points to this relation's slot in the hint
+  *                        table, or is NULL when the relation is below
+  *                        the threshold and no slot was looked up.
+  *
+  * The return value is always 0; the result is delivered through the
+  * scan descriptor.
+  */
+ static BlockNumber ss_init(HeapScanDesc scan)
+ {
+ 	ss_hint_t  *hint_table;
+ 	volatile ss_hint_t *slot;
+ 	Oid			hint_relid;
+ 	BlockNumber	hint_location;
+ 	bool		found;
+ 	int			threshold = sync_seqscan_threshold * NBuffers;
+ 	int			offset = sync_seqscan_offset * NBuffers;
+
+ 	/*
+ 	 * Defaults for every early exit below: start at page 0, and never
+ 	 * leave an indeterminate hint pointer behind.  ss_store_hint()
+ 	 * re-evaluates the (user-settable) threshold later, so it must be
+ 	 * able to tell that no slot was selected here.
+ 	 */
+ 	scan->rs_start_page = 0;
+ 	scan->rs_hint = NULL;
+
+ 	/*
+ 	 * If the table is not large compared to shared_buffers
+ 	 * (sync_seqscan_threshold * NBuffers), don't Sync Scan.
+ 	 */
+ 	if (scan->rs_nblocks < (BlockNumber) threshold)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Table too small to sync scan");
+ 		return 0;
+ 	}
+
+ 	hint_table = (ss_hint_t *) ShmemInitStruct("Sync Scan Hint Table",
+ 					SYNC_SCAN_TABLE_SIZE * sizeof(ss_hint_t), &found);
+
+ 	scan->rs_hint = &hint_table[ss_hash(scan)];
+
+ 	/*
+ 	 * If we just created the hint table for the first time,
+ 	 * initialize the table to zero and start the scan at page 0.
+ 	 */
+ 	if (!found)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Created Hint Table");
+ 		memset(hint_table, 0, sizeof(ss_hint_t) * SYNC_SCAN_TABLE_SIZE);
+ 		return 0;
+ 	}
+
+ 	/*
+ 	 * The slot is written by other backends without locking.  Read it
+ 	 * exactly once, so that the value validated below is the same
+ 	 * value that is used as the start page.
+ 	 */
+ 	slot = scan->rs_hint;
+ 	hint_relid = slot->relid;
+ 	hint_location = slot->location;
+
+ 	/*
+ 	 * If the hint's relid is 0, that means we have not previously
+ 	 * created a hint at this location in the table.
+ 	 */
+ 	if (hint_relid == 0)
+ 	{
+ 		elog(DEBUG2, "SYNC_SCAN: Hint empty");
+ 		return 0;
+ 	}
+
+ 	/*
+ 	 * If the relid doesn't match the one in the hint,
+ 	 * we have a hash collision.
+ 	 */
+ 	if (RelationGetRelid(scan->rs_rd) != hint_relid)
+ 	{
+ 		elog(DEBUG1,"SYNC_SCAN: Hash collision");
+ 		return 0;
+ 	}
+
+ 	/*
+ 	 * If the hint is not a valid block number for this relation,
+ 	 * start at 0.
+ 	 *
+ 	 * This can happen if, for instance, someone TRUNCATEd the table
+ 	 * between when the hint was set and now.  (Block numbers are
+ 	 * unsigned, so only the upper bound needs checking.)
+ 	 */
+ 	if (hint_location >= scan->rs_nblocks)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Hint %u out of range."
+ 			 " Relation has %u pages.",
+ 			 hint_location, scan->rs_nblocks);
+ 		return 0;
+ 	}
+
+ 	scan->rs_start_page = hint_location;
+
+ 	/*
+ 	 * By starting at offset earlier than the hint, it's likely that
+ 	 * all of the blocks will already be cached, and the scan will
+ 	 * quickly catch up to the head.
+ 	 *
+ 	 * offset is a positive value that will be subtracted from the
+ 	 * hint.  If it exceeds the relation size, start at the hint itself.
+ 	 */
+ 	if ((BlockNumber) offset > scan->rs_nblocks)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Relation smaller than start offset: %d",
+ 			 offset);
+ 		return 0;
+ 	}
+
+ 	/*
+ 	 * If subtracting the offset would bring the value to less than 0,
+ 	 * we circle backwards to the end of the file.
+ 	 */
+ 	if ((BlockNumber) offset > scan->rs_start_page)
+ 		scan->rs_start_page += scan->rs_nblocks;
+
+ 	scan->rs_start_page -= offset;
+
+ 	elog(DEBUG2,"SYNC_SCAN: START: OID = %u; Location = %u; Size: %u",
+ 		 RelationGetRelid(scan->rs_rd),
+ 		 scan->rs_start_page, scan->rs_nblocks);
+
+ 	return 0;
+ }
+
+ /*
+  * ss_store_hint:
+  *
+  * Writes an entry of the form (relid, blocknumber) into this scan's
+  * slot of the Sync Scan Hint Table.  This will overwrite any existing
+  * entry that may collide with this entry in the table.
+  *
+  * Nothing is stored when ss_init() selected no slot (scan->rs_hint is
+  * NULL), when the relation is below the threshold, or when the scan
+  * has progressed fewer than offset pages from scan->rs_start_page.
+  *
+  * No locking is performed here.  When this data is later read by
+  * ss_init(), sanity checking is performed to ensure we don't use an
+  * invalid relation block number.
+  *
+  * Always returns 0.
+  */
+ static int ss_store_hint(HeapScanDesc scan, BlockNumber location)
+ {
+ 	ss_hint_t	hint;
+ 	BlockNumber	progress;
+ 	int			threshold = sync_seqscan_threshold * NBuffers;
+ 	int			offset = sync_seqscan_offset * NBuffers;
+
+ 	/*
+ 	 * Print every 50k pages to DEBUG2
+ 	 * and every 5k pages to DEBUG3.
+ 	 */
+ 	if (!(location % 50000))
+ 		elog(DEBUG2,"page: %u",location);
+ 	else if (!(location % 5000))
+ 		elog(DEBUG3,"page: %u",location);
+
+ 	/*
+ 	 * No slot was selected for this scan, so there is nowhere to
+ 	 * store the hint.  This must be checked independently of the
+ 	 * threshold test below, because the threshold setting may have
+ 	 * changed since ss_init() ran.
+ 	 */
+ 	if (scan->rs_hint == NULL)
+ 		return 0;
+
+ 	/*
+ 	 * If the table is too small, don't bother
+ 	 * with Sync Scan.
+ 	 */
+ 	if (scan->rs_nblocks < (BlockNumber) threshold)
+ 		return 0;
+
+ 	/*
+ 	 * If this scan has been progressing for less than offset pages,
+ 	 * don't store the hint.  The scan wraps around the end of the
+ 	 * relation, so a location below the start page means one wrap
+ 	 * has already happened.
+ 	 */
+ 	if (location >= scan->rs_start_page)
+ 		progress = location - scan->rs_start_page;
+ 	else
+ 		progress = location + scan->rs_nblocks - scan->rs_start_page;
+
+ 	if (progress < (BlockNumber) offset)
+ 		return 0;
+
+ 	hint.relid = RelationGetRelid(scan->rs_rd);
+ 	hint.location = location;
+
+ 	*scan->rs_hint = hint;
+
+ 	return 0;
+ }
+
+ /*
+  * ss_hash:
+  *
+  * Maps the Oid of the scanned relation to a slot index in the
+  * Sync Scan Hint Table by taking it modulo SYNC_SCAN_TABLE_SIZE.
+  * Different relations can map to the same slot.
+  */
+ static int ss_hash(HeapScanDesc scan)
+ {
+ 	int			slot_index;
+
+ 	slot_index = RelationGetRelid(scan->rs_rd) % SYNC_SCAN_TABLE_SIZE;
+
+ 	return slot_index;
+ }
+
/* ----------------
* initscan - scan code common to heap_beginscan and heap_rescan
* ----------------
***************
*** 81,86 ****
--- 290,300 ----
*/
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
+ /*
+ * Choose a good place to start the relation scan.
+ */
+ ss_init(scan);
+
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 223,229 ****
tuple->t_data = NULL;
return;
}
! page = 0; /* first page */
heapgetpage(scan, page);
lineoff = FirstOffsetNumber; /* first offnum */
scan->rs_inited = true;
--- 437,447 ----
tuple->t_data = NULL;
return;
}
! /*
! * start the scan at the location that we chose
! * in ss_init()
! */
! page = scan->rs_start_page;
heapgetpage(scan, page);
lineoff = FirstOffsetNumber; /* first offnum */
scan->rs_inited = true;
***************
*** 364,378 ****
}
/*
! * if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
/*
! * return NULL if we've exhausted all the pages
*/
! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
--- 582,611 ----
}
/*
! * If we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
+ *
+ * For the forward scan, we need to wrap around to the beginning
+ * of the relation file if we reach the end.
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ if(backward)
+ page--;
+ else
+ page = (page + 1) % (scan->rs_nblocks);
+
+ if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ ss_store_hint(scan,page);
+
/*
! * Return NULL if we've exhausted all the pages.
! * For reverse scans, that means we've reached 0. For
! * forward scans, that means we've reached the page on
! * which we started.
*/
! if ((backward && (page == 0)) ||
! ((page%(scan->rs_nblocks)) == scan->rs_start_page))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
***************
*** 383,390 ****
return;
}
- page = backward ? (page - 1) : (page + 1);
-
heapgetpage(scan, page);
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
--- 616,621 ----
***************
*** 450,456 ****
tuple->t_data = NULL;
return;
}
! page = 0; /* first page */
heapgetpage(scan, page);
lineindex = 0;
scan->rs_inited = true;
--- 681,691 ----
tuple->t_data = NULL;
return;
}
! /*
! * start the scan at the location that we chose
! * in ss_init()
! */
! page = scan->rs_start_page;
heapgetpage(scan, page);
lineindex = 0;
scan->rs_inited = true;
***************
*** 585,598 ****
}
/*
! * if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
/*
! * return NULL if we've exhausted all the pages
*/
! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
--- 820,847 ----
}
/*
! * If we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
+ *
+ * For the forward scan, we need to wrap around to the beginning
+ * of the relation file if we reach the end.
*/
+ if(backward)
+ page--;
+ else
+ page = (page + 1) % (scan->rs_nblocks);
+
+ if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ ss_store_hint(scan,page);
/*
! * Return NULL if we've exhausted all the pages.
! * For reverse scans, that means we've reached 0. For
! * forward scans, that means we've reached the page on
! * which we started.
*/
! if ((backward && (page == 0)) ||
! ((page%(scan->rs_nblocks)) == scan->rs_start_page))
{
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
***************
*** 603,609 ****
return;
}
- page = backward ? (page - 1) : (page + 1);
heapgetpage(scan, page);
dp = (Page) BufferGetPage(scan->rs_cbuf);
--- 852,857 ----
***************
*** 616,621 ****
--- 864,880 ----
}
}
+ /*
+  * SyncScanShmemSize:
+  *
+  * Reports the number of bytes the Sync Scan Hint Table occupies:
+  * SYNC_SCAN_TABLE_SIZE entries of ss_hint_t.  Called by
+  * CreateSharedMemoryAndSemaphores() when sizing shared memory.
+  */
+ Size SyncScanShmemSize(void)
+ {
+ 	Size		table_bytes = sizeof(ss_hint_t);
+
+ 	table_bytes *= SYNC_SCAN_TABLE_SIZE;
+
+ 	return table_bytes;
+ }
#if defined(DISABLE_COMPLEX_MACRO)
/*
diff -cr postgresql-8.2.3/src/backend/storage/ipc/ipci.c postgresql-8.2.3-syncscan/src/backend/storage/ipc/ipci.c
*** postgresql-8.2.3/src/backend/storage/ipc/ipci.c 2006-10-15 15:04:07.000000000 -0700
--- postgresql-8.2.3-syncscan/src/backend/storage/ipc/ipci.c 2007-03-13 21:58:56.000000000 -0700
***************
*** 19,24 ****
--- 19,25 ----
#include "access/nbtree.h"
#include "access/subtrans.h"
#include "access/twophase.h"
+ #include "access/heapam.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
***************
*** 110,115 ****
--- 111,117 ----
size = add_size(size, FreeSpaceShmemSize());
size = add_size(size, BgWriterShmemSize());
size = add_size(size, BTreeShmemSize());
+ size = add_size(size, SyncScanShmemSize());
#ifdef EXEC_BACKEND
size = add_size(size, ShmemBackendArraySize());
#endif
diff -cr postgresql-8.2.3/src/backend/utils/misc/guc.c postgresql-8.2.3-syncscan/src/backend/utils/misc/guc.c
*** postgresql-8.2.3/src/backend/utils/misc/guc.c 2006-11-29 06:50:07.000000000 -0800
--- postgresql-8.2.3-syncscan/src/backend/utils/misc/guc.c 2007-03-13 23:23:31.000000000 -0700
***************
*** 25,31 ****
#include <syslog.h>
#endif
!
#include "access/gin.h"
#include "access/twophase.h"
#include "access/xact.h"
--- 25,31 ----
#include <syslog.h>
#endif
! #include "access/heapam.h"
#include "access/gin.h"
#include "access/twophase.h"
#include "access/xact.h"
***************
*** 758,763 ****
--- 758,773 ----
false, NULL, NULL
},
+ {
+ {"trace_sync_seqscan", PGC_USERSET, DEVELOPER_OPTIONS,
+ gettext_noop("Generates debugging output for Synchronized Scans."),
+ NULL,
+ GUC_NOT_IN_SAMPLE
+ },
+ &Trace_sync_seqscan,
+ false, NULL, NULL
+ },
+
#ifdef LOCK_DEBUG
{
{"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS,
***************
*** 1723,1728 ****
--- 1733,1754 ----
DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS,
MAX_GEQO_SELECTION_BIAS, NULL, NULL
},
+ {
+ {"sync_seqscan_threshold", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN,
+ gettext_noop("Minimum size of table before synchronized scanning takes effect, as a fraction of shared_buffers."),
+ NULL
+ },
+ &sync_seqscan_threshold,
+ DEFAULT_SYNC_SCAN_THRESHOLD, 0.0, 100.0, NULL, NULL
+ },
+ {
+ {"sync_seqscan_offset", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN,
+ gettext_noop("Start synchronized scans at this offset (as a fraction of shared_buffers) before other scans."),
+ NULL
+ },
+ &sync_seqscan_offset,
+ DEFAULT_SYNC_SCAN_OFFSET, 0.0, 100.0, NULL, NULL
+ },
{
{"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES,
diff -cr postgresql-8.2.3/src/include/access/heapam.h postgresql-8.2.3-syncscan/src/include/access/heapam.h
*** postgresql-8.2.3/src/include/access/heapam.h 2006-11-05 14:42:10.000000000 -0800
--- postgresql-8.2.3-syncscan/src/include/access/heapam.h 2007-03-13 23:22:11.000000000 -0700
***************
*** 25,30 ****
--- 25,49 ----
#include "utils/rel.h"
#include "utils/tqual.h"
+ /*
+ * Size of the Sync Scan Hint Table.
+ */
+ #define SYNC_SCAN_TABLE_SIZE 1000
+
+ /*
+ * Interval between reports of the location
+ * of the current scan, in pages.
+ */
+ #define SYNC_SCAN_REPORT_INTERVAL 100
+
+ #define DEFAULT_SYNC_SCAN_THRESHOLD 1.0 /* initial sync_seqscan_threshold, as a fraction of NBuffers */
+ #define DEFAULT_SYNC_SCAN_OFFSET 0.0 /* initial sync_seqscan_offset, as a fraction of NBuffers */
+
+ extern DLLIMPORT bool Trace_sync_seqscan;
+ extern DLLIMPORT double sync_seqscan_threshold;
+ extern DLLIMPORT double sync_seqscan_offset;
+ extern Size SyncScanShmemSize(void);
+
/* ----------------
* fastgetattr
*
diff -cr postgresql-8.2.3/src/include/access/relscan.h postgresql-8.2.3-syncscan/src/include/access/relscan.h
*** postgresql-8.2.3/src/include/access/relscan.h 2006-10-03 17:30:07.000000000 -0700
--- postgresql-8.2.3-syncscan/src/include/access/relscan.h 2007-03-13 21:58:56.000000000 -0700
***************
*** 19,24 ****
--- 19,33 ----
#include "utils/tqual.h"
+ /*
+  * Structure of an entry in the
+  * Sync Scan Hint Table.
+  *
+  * The table holds SYNC_SCAN_TABLE_SIZE of these; a relation's entry
+  * is selected by its Oid modulo the table size, so entries of
+  * different relations can overwrite each other.
+  */
+ typedef struct {
+ Oid relid; /* Oid of the relation this hint is for; 0 is treated as "no hint stored" */
+ BlockNumber location; /* Block number last stored for that relation by a running scan */
+ } ss_hint_t;
+
typedef struct HeapScanDescData
{
/* scan parameters */
***************
*** 33,38 ****
--- 42,49 ----
bool rs_inited; /* false = scan not init'd yet */
HeapTupleData rs_ctup; /* current tuple in scan, if any */
BlockNumber rs_cblock; /* current block # in scan, if any */
+ BlockNumber rs_start_page; /* page where this scan began */
+ ss_hint_t *rs_hint; /* pointer to scan hint */
Buffer rs_cbuf; /* current buffer in scan, if any */
/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
ItemPointerData rs_mctid; /* marked scan position, if any */
diff -cr postgresql-8.2.3/src/include/utils/guc_tables.h postgresql-8.2.3-syncscan/src/include/utils/guc_tables.h
*** postgresql-8.2.3/src/include/utils/guc_tables.h 2006-10-03 14:11:55.000000000 -0700
--- postgresql-8.2.3-syncscan/src/include/utils/guc_tables.h 2007-03-13 22:41:15.000000000 -0700
***************
*** 56,61 ****
--- 56,62 ----
QUERY_TUNING_METHOD,
QUERY_TUNING_COST,
QUERY_TUNING_GEQO,
+ QUERY_TUNING_SYNC_SEQSCAN,
QUERY_TUNING_OTHER,
LOGGING,
LOGGING_WHERE,
---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
subscribe-nomail command to [EMAIL PROTECTED] so that your
message can get through to the mailing list cleanly