Hi,
Please find attached a v2 of the patch. See below for changes.
On 02/09/2015 15:53, Andres Freund wrote:
>
> Hi,
>
> On 2015-07-18 12:17:39 +0200, Julien Rouhaud wrote:
>> I didn't know that the thread must exists on -hackers to be able to add
>> a commitfest entry, so I transfer the thread here.
>
> Please, in the future, also update the title of the thread to something
> fitting.
>
>> @@ -539,6 +541,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState
>> *estate, int eflags)
>> {
>> BitmapHeapScanState *scanstate;
>> Relation currentRelation;
>> +#ifdef USE_PREFETCH
>> + int new_io_concurrency;
>> +#endif
>>
>> /* check for unsupported flags */
>> Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
>> @@ -598,6 +603,25 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState
>> *estate, int eflags)
>> */
>> currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid,
>> eflags);
>>
>> +#ifdef USE_PREFETCH
>> + /* check if the effective_io_concurrency has been overloaded for the
>> + * tablespace storing the relation and compute the
>> target_prefetch_pages,
>> + * or just get the current target_prefetch_pages
>> + */
>> + new_io_concurrency = get_tablespace_io_concurrency(
>> + currentRelation->rd_rel->reltablespace);
>> +
>> +
>> + scanstate->target_prefetch_pages = target_prefetch_pages;
>> +
>> + if (new_io_concurrency != effective_io_concurrency)
>> + {
>> + double prefetch_pages;
>> + if (compute_io_concurrency(new_io_concurrency, &prefetch_pages))
>> + scanstate->target_prefetch_pages = rint(prefetch_pages);
>> + }
>> +#endif
>
> Maybe it's just me - but imo there should be as few USE_PREFETCH
> dependant places in the code as possible. It'll just be 0 when not
> supported, that's fine? Especially changing the size of externally
> visible structs depending on a configure detected ifdef seems wrong to
> me.
>
I removed these ifdefs, and the more problematic one in the struct.
>> +bool
>> +compute_io_concurrency(int io_concurrency, double *target_prefetch_pages)
>> +{
>> + double new_prefetch_pages = 0.0;
>> + int i;
>> +
>> + /* make sure the io_concurrency value is correct, it may have been
>> forced
>> + * with a pg_tablespace UPDATE
>> + */
>
> Nitpick: Wrong comment style (/* stands on its own line).
>
>> + if (io_concurrency > MAX_IO_CONCURRENCY)
>> + io_concurrency = MAX_IO_CONCURRENCY;
>> +
>> + /*----------
>> + * The user-visible GUC parameter is the number of drives (spindles),
>> + * which we need to translate to a number-of-pages-to-prefetch target.
>> + * The target value is stashed in *extra and then assigned to the actual
>> + * variable by assign_effective_io_concurrency.
>> + *
>> + * The expected number of prefetch pages needed to keep N drives busy
>> is:
>> + *
>> + * drives | I/O requests
>> + * -------+----------------
>> + * 1 | 1
>> + * 2 | 2/1 + 2/2 = 3
>> + * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
>> + * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
>> + * n | n * H(n)
>
> I know you just moved this code. But: I don't buy this formula. Like at
> all. Doesn't queuing and reordering entirely invalidate the logic here?
>
Changing the formula, or changing the GUC to a number of pages to
prefetch, is still being discussed, so no change here.
> Perhaps more relevantly: Imo nodeBitmapHeapscan.c is the wrong place for
> this. bufmgr.c maybe?
>
Moved to bufmgr.c
> You also didn't touch
> /*
> * How many buffers PrefetchBuffer callers should try to stay ahead of their
> * ReadBuffer calls by. This is maintained by the assign hook for
> * effective_io_concurrency. Zero means "never prefetch".
> */
> int target_prefetch_pages = 0;
> which surely doesn't make sense anymore after these changes.
>
> But do we even need that variable now?
I slightly updated the comment. If the table doesn't belong to a
tablespace with an overloaded effective_io_concurrency, keeping this
pre-computed target_prefetch_pages can save a few cycles on each
execution, so I think it's better to keep it.
>
>> diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
>> index dc167f9..57008fc 100644
>> --- a/src/include/utils/guc.h
>> +++ b/src/include/utils/guc.h
>> @@ -26,6 +26,9 @@
>> #define MAX_KILOBYTES (INT_MAX / 1024)
>> #endif
>>
>> +/* upper limit for effective_io_concurrency */
>> +#define MAX_IO_CONCURRENCY 1000
>> +
>> /*
>> * Automatic configuration file name for ALTER SYSTEM.
>> * This file will be used to store values of configuration parameters
>> @@ -256,6 +259,8 @@ extern int temp_file_limit;
>>
>> extern int num_temp_buffers;
>>
>> +extern int effective_io_concurrency;
>> +
>
> target_prefetch_pages is declared in bufmgr.h - that seems like a better
> place for these.
>
Moved to bufmgr.h
As said in a previous mail, I also fixed a problem where setting any
tablespace option other than effective_io_concurrency caused the regular
effective_io_concurrency setting to be ignored.
I also added the forgotten lock level (AccessExclusiveLock) for this
tablespace setting, which was leading to a failed assert during initdb.
Regards.
--
Julien Rouhaud
http://dalibo.com - http://dalibo.org
diff --git a/doc/src/sgml/ref/create_tablespace.sgml b/doc/src/sgml/ref/create_tablespace.sgml
index 5756c3e..cf08408 100644
--- a/doc/src/sgml/ref/create_tablespace.sgml
+++ b/doc/src/sgml/ref/create_tablespace.sgml
@@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable>
<listitem>
<para>
A tablespace parameter to be set or reset. Currently, the only
- available parameters are <varname>seq_page_cost</> and
- <varname>random_page_cost</>. Setting either value for a particular
- tablespace will override the planner's usual estimate of the cost of
- reading pages from tables in that tablespace, as established by
- the configuration parameters of the same name (see
- <xref linkend="guc-seq-page-cost">,
- <xref linkend="guc-random-page-cost">). This may be useful if one
- tablespace is located on a disk which is faster or slower than the
+ available parameters are <varname>seq_page_cost</>,
+ <varname>random_page_cost</> and <varname>effective_io_concurrency</>.
+ Setting either value for a particular tablespace will override the
+ planner's usual estimate of the cost of reading pages from tables in
+ that tablespace, as established by the configuration parameters of the
+ same name (see <xref linkend="guc-seq-page-cost">,
+ <xref linkend="guc-random-page-cost">,
+ <xref linkend="guc-effective-io-concurrency">). This may be useful if
+ one tablespace is located on a disk which is faster or slower than the
remainder of the I/O subsystem.
</para>
</listitem>
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 7479d40..d817eba 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -254,6 +254,19 @@ static relopt_int intRelOpts[] =
},
-1, 64, MAX_KILOBYTES
},
+ {
+ {
+ "effective_io_concurrency",
+ "Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
+ RELOPT_KIND_TABLESPACE,
+ AccessExclusiveLock
+ },
+#ifdef USE_PREFETCH
+ -1, 0, MAX_IO_CONCURRENCY
+#else
+ 0, 0, 0
+#endif
+ },
/* list terminator */
{{NULL}}
@@ -1438,7 +1451,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
int numoptions;
static const relopt_parse_elt tab[] = {
{"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
- {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}
+ {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
+ {"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}
};
options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 4597437..5af942e 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -44,6 +44,7 @@
#include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/rel.h"
+#include "utils/spccache.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"
@@ -111,7 +112,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
node->tbmres = tbmres = NULL;
#ifdef USE_PREFETCH
- if (target_prefetch_pages > 0)
+ if (node->target_prefetch_pages > 0)
{
node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
node->prefetch_pages = 0;
@@ -188,10 +189,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
* page/tuple, then to one after the second tuple is fetched, then
* it doubles as later pages are fetched.
*/
- if (node->prefetch_target >= target_prefetch_pages)
+ if (node->prefetch_target >= node->target_prefetch_pages)
/* don't increase any further */ ;
- else if (node->prefetch_target >= target_prefetch_pages / 2)
- node->prefetch_target = target_prefetch_pages;
+ else if (node->prefetch_target >= node->target_prefetch_pages / 2)
+ node->prefetch_target = node->target_prefetch_pages;
else if (node->prefetch_target > 0)
node->prefetch_target *= 2;
else
@@ -211,7 +212,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
* Try to prefetch at least a few pages even before we get to the
* second page if we don't stop reading after the first tuple.
*/
- if (node->prefetch_target < target_prefetch_pages)
+ if (node->prefetch_target < node->target_prefetch_pages)
node->prefetch_target++;
#endif /* USE_PREFETCH */
}
@@ -539,6 +540,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
{
BitmapHeapScanState *scanstate;
Relation currentRelation;
+ int new_io_concurrency;
/* check for unsupported flags */
Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -598,6 +600,25 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
*/
currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
+ /*
+ * check if the effective_io_concurrency has been overloaded for the
+ * tablespace storing the relation and compute the target_prefetch_pages,
+ * or just get the current target_prefetch_pages
+ */
+ new_io_concurrency = get_tablespace_io_concurrency(
+ currentRelation->rd_rel->reltablespace);
+
+
+ scanstate->target_prefetch_pages = target_prefetch_pages;
+
+ if (new_io_concurrency != effective_io_concurrency)
+ {
+ double prefetch_pages;
+
+ if (ComputeIoConcurrency(new_io_concurrency, &prefetch_pages))
+ scanstate->target_prefetch_pages = rint(prefetch_pages);
+ }
+
scanstate->ss.ss_currentRelation = currentRelation;
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cd3aaad..e3055c7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -84,7 +84,9 @@ bool track_io_timing = false;
/*
* How many buffers PrefetchBuffer callers should try to stay ahead of their
* ReadBuffer calls by. This is maintained by the assign hook for
- * effective_io_concurrency. Zero means "never prefetch".
+ * effective_io_concurrency. Zero means "never prefetch". This value only
+ * applies if the related tablespace doesn't have a specific
+ * effective_io_concurrency setting.
*/
int target_prefetch_pages = 0;
@@ -415,6 +417,67 @@ static void CheckForBufferLeaks(void);
static int rnode_comparator(const void *p1, const void *p2);
+
+/*
+ * ComputeIoConcurrency -- get the number of pages to prefetch for a given
+ * number of spindles.
+ */
+bool
+ComputeIoConcurrency(int io_concurrency, double *target_prefetch_pages)
+{
+ double new_prefetch_pages = 0.0;
+ int i;
+
+ /*
+ * Make sure the io_concurrency value is correct, it may have been forced
+ * with a pg_tablespace UPDATE.
+ */
+ if (io_concurrency > MAX_IO_CONCURRENCY)
+ io_concurrency = MAX_IO_CONCURRENCY;
+
+ /*----------
+ * The user-visible GUC parameter is the number of drives (spindles),
+ * which we need to translate to a number-of-pages-to-prefetch target.
+ * The target value is stashed in *extra and then assigned to the actual
+ * variable by assign_effective_io_concurrency.
+ *
+ * The expected number of prefetch pages needed to keep N drives busy is:
+ *
+ * drives | I/O requests
+ * -------+----------------
+ * 1 | 1
+ * 2 | 2/1 + 2/2 = 3
+ * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
+ * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
+ * n | n * H(n)
+ *
+ * This is called the "coupon collector problem" and H(n) is called the
+ * harmonic series. This could be approximated by n * ln(n), but for
+ * reasonable numbers of drives we might as well just compute the series.
+ *
+ * Alternatively we could set the target to the number of pages necessary
+ * so that the expected number of active spindles is some arbitrary
+ * percentage of the total. This sounds the same but is actually slightly
+ * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
+ * that desired fraction.
+ *
+ * Experimental results show that both of these formulas aren't aggressive
+ * enough, but we don't really have any better proposals.
+ *
+ * Note that if io_concurrency = 0 (disabled), we must set target = 0.
+ *----------
+ */
+
+
+ for (i = 1; i <= io_concurrency; i++)
+ new_prefetch_pages += (double) io_concurrency / (double) i;
+
+ *target_prefetch_pages = new_prefetch_pages;
+
+ /* This range check shouldn't fail, but let's be paranoid */
+ return (new_prefetch_pages > 0.0 && new_prefetch_pages < (double) INT_MAX);
+}
+
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
*
diff --git a/src/backend/utils/cache/spccache.c b/src/backend/utils/cache/spccache.c
index 1a0c884..679b108 100644
--- a/src/backend/utils/cache/spccache.c
+++ b/src/backend/utils/cache/spccache.c
@@ -23,6 +23,7 @@
#include "commands/tablespace.h"
#include "miscadmin.h"
#include "optimizer/cost.h"
+#include "storage/bufmgr.h"
#include "utils/catcache.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
@@ -198,3 +199,16 @@ get_tablespace_page_costs(Oid spcid,
*spc_seq_page_cost = spc->opts->seq_page_cost;
}
}
+
+int
+get_tablespace_io_concurrency(Oid spcid)
+{
+ TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+ Assert(spc != NULL);
+
+ if (!spc->opts || spc->opts->effective_io_concurrency < 0)
+ return effective_io_concurrency;
+ else
+ return spc->opts->effective_io_concurrency;
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b3dac51..df39010 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -438,6 +438,8 @@ int temp_file_limit = -1;
int num_temp_buffers = 1024;
+int effective_io_concurrency = 0;
+
char *cluster_name = "";
char *ConfigFileName;
char *HbaFileName;
@@ -490,7 +492,6 @@ static int wal_block_size;
static bool data_checksums;
static int wal_segment_size;
static bool integer_datetimes;
-static int effective_io_concurrency;
static bool assert_enabled;
/* should be static, but commands/variable.c needs to get at this */
@@ -2352,7 +2353,7 @@ static struct config_int ConfigureNamesInt[] =
},
&effective_io_concurrency,
#ifdef USE_PREFETCH
- 1, 0, 1000,
+ 1, 0, MAX_IO_CONCURRENCY,
#else
0, 0, 0,
#endif
@@ -9986,47 +9987,9 @@ static bool
check_effective_io_concurrency(int *newval, void **extra, GucSource source)
{
#ifdef USE_PREFETCH
- double new_prefetch_pages = 0.0;
- int i;
-
- /*----------
- * The user-visible GUC parameter is the number of drives (spindles),
- * which we need to translate to a number-of-pages-to-prefetch target.
- * The target value is stashed in *extra and then assigned to the actual
- * variable by assign_effective_io_concurrency.
- *
- * The expected number of prefetch pages needed to keep N drives busy is:
- *
- * drives | I/O requests
- * -------+----------------
- * 1 | 1
- * 2 | 2/1 + 2/2 = 3
- * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
- * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
- * n | n * H(n)
- *
- * This is called the "coupon collector problem" and H(n) is called the
- * harmonic series. This could be approximated by n * ln(n), but for
- * reasonable numbers of drives we might as well just compute the series.
- *
- * Alternatively we could set the target to the number of pages necessary
- * so that the expected number of active spindles is some arbitrary
- * percentage of the total. This sounds the same but is actually slightly
- * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
- * that desired fraction.
- *
- * Experimental results show that both of these formulas aren't aggressive
- * enough, but we don't really have any better proposals.
- *
- * Note that if *newval = 0 (disabled), we must set target = 0.
- *----------
- */
-
- for (i = 1; i <= *newval; i++)
- new_prefetch_pages += (double) *newval / (double) i;
+ double new_prefetch_pages;
- /* This range check shouldn't fail, but let's be paranoid */
- if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
+ if (ComputeIoConcurrency(*newval, &new_prefetch_pages))
{
int *myextra = (int *) guc_malloc(ERROR, sizeof(int));
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 9303f6a..2f9f8c0 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -1885,7 +1885,7 @@ psql_completion(const char *text, int start, int end)
pg_strcasecmp(prev_wd, "(") == 0)
{
static const char *const list_TABLESPACEOPTIONS[] =
- {"seq_page_cost", "random_page_cost", NULL};
+ {"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL};
COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS);
}
diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h
index 6b928a5..be9582a 100644
--- a/src/include/commands/tablespace.h
+++ b/src/include/commands/tablespace.h
@@ -39,6 +39,7 @@ typedef struct TableSpaceOpts
int32 vl_len_; /* varlena header (do not touch directly!) */
float8 random_page_cost;
float8 seq_page_cost;
+ int effective_io_concurrency;
} TableSpaceOpts;
extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 5796de8..fa0941d 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1416,15 +1416,16 @@ typedef struct BitmapIndexScanState
/* ----------------
* BitmapHeapScanState information
*
- * bitmapqualorig execution state for bitmapqualorig expressions
- * tbm bitmap obtained from child index scan(s)
- * tbmiterator iterator for scanning current pages
- * tbmres current-page data
- * exact_pages total number of exact pages retrieved
- * lossy_pages total number of lossy pages retrieved
- * prefetch_iterator iterator for prefetching ahead of current page
- * prefetch_pages # pages prefetch iterator is ahead of current
- * prefetch_target target prefetch distance
+ * bitmapqualorig execution state for bitmapqualorig expressions
+ * tbm bitmap obtained from child index scan(s)
+ * tbmiterator iterator for scanning current pages
+ * tbmres current-page data
+ * exact_pages total number of exact pages retrieved
+ * lossy_pages total number of lossy pages retrieved
+ * prefetch_iterator iterator for prefetching ahead of current page
+ * prefetch_pages # pages prefetch iterator is ahead of current
+ * prefetch_target target prefetch distance
+ * target_prefetch_pages may be overloaded by tablespace setting
* ----------------
*/
typedef struct BitmapHeapScanState
@@ -1439,6 +1440,7 @@ typedef struct BitmapHeapScanState
TBMIterator *prefetch_iterator;
int prefetch_pages;
int prefetch_target;
+ int target_prefetch_pages;
} BitmapHeapScanState;
/* ----------------
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ec0a254..9c9f141 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -58,11 +58,17 @@ extern int target_prefetch_pages;
/* in buf_init.c */
extern PGDLLIMPORT char *BufferBlocks;
+/* in guc.c */
+extern int effective_io_concurrency;
+
/* in localbuf.c */
extern PGDLLIMPORT int NLocBuffer;
extern PGDLLIMPORT Block *LocalBufferBlockPointers;
extern PGDLLIMPORT int32 *LocalRefCount;
+/* upper limit for effective_io_concurrency */
+#define MAX_IO_CONCURRENCY 1000
+
/* special block number for ReadBuffer() */
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
@@ -144,6 +150,7 @@ extern PGDLLIMPORT int32 *LocalRefCount;
/*
* prototypes for functions in bufmgr.c
*/
+extern bool ComputeIoConcurrency(int io_concurrency, double *target_prefetch_pages);
extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
BlockNumber blockNum);
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
diff --git a/src/include/utils/spccache.h b/src/include/utils/spccache.h
index bdd1c0f..e466f36 100644
--- a/src/include/utils/spccache.h
+++ b/src/include/utils/spccache.h
@@ -15,5 +15,6 @@
void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
float8 *spc_seq_page_cost);
+int get_tablespace_io_concurrency(Oid spcid);
#endif /* SPCCACHE_H */
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers