On 07/26/2015 06:01 PM, Fabien COELHO wrote:
Attached is very minor v5 update which does a rebase & completes the
cleanup of doing a full sort instead of a chunked sort.
Some thoughts on this:
* I think we should drop the "flush" part of this for now. It's not as
clearly beneficial as the sorting part, and adds a great deal more code
complexity. And it's orthogonal to the sorting patch, so we can deal
with it separately.
* Is it really necessary to parallelize the I/O among tablespaces? I can
see the point, but I wonder if it makes any difference in practice.
* Is there ever any harm in sorting the buffers? The GUC is useful for
benchmarking, but could we leave it out of the final patch?
* Do we need to worry about exceeding the 1 GB allocation limit in
AllocateCheckpointBufferIds? It's enough for 2 TB of shared_buffers.
That's a lot, but it's not totally crazy these days that someone might
do that. At the very least, we need to lower the maximum of
shared_buffers so that you can't hit that limit.
I ripped out the "flushing" part, keeping only the sorting. I refactored
the logic in BufferSync() a bit. There's now a separate function,
nextCheckpointBuffer(), that returns the next buffer ID from the sorted
list. The tablespace-parallelization behaviour in encapsulated there,
keeping the code in BufferSync() much simpler. See attached. Needs some
minor cleanup and commenting still before committing, and I haven't done
any testing besides a simple "make check".
- Heikki
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e900dcc..1cec243 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2454,6 +2454,28 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
+ <varlistentry id="guc-checkpoint-sort" xreflabel="checkpoint_sort">
+ <term><varname>checkpoint_sort</varname> (<type>bool</type>)
+ <indexterm>
+ <primary><varname>checkpoint_sort</> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Whether to sort buffers before writing them out to disk on checkpoint.
+ For HDD storage, this setting allows grouping together
+ neighboring pages written to disk, thus improving performance by
+ reducing random write activity.
+ This sorting should have limited performance effects on SSD backends
+ as such storage has good random write performance, but it may
+ help with wear-leveling, so it may be worth keeping anyway.
+ The default is <literal>on</>.
+ This parameter can only be set in the <filename>postgresql.conf</>
+ file or on the server command line.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning">
<term><varname>checkpoint_warning</varname> (<type>integer</type>)
<indexterm>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index e3941c9..f538698 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -546,6 +546,18 @@
</para>
<para>
+ When hard-disk drives (HDD) are used for permanent data storage,
+ <xref linkend="guc-checkpoint-sort"> allows pages to be sorted
+ so that neighboring pages on disk will be flushed together by
+ checkpoints, reducing the random write load and improving performance.
+ If solid-state drives (SSD) are used, sorting pages induces no benefit
+ as their random write I/O performance is good: this feature could then
+ be disabled by setting <varname>checkpoint_sort</> to <literal>off</>.
+ It is possible that sorting may help with SSD wear leveling, so it may
+ be kept on that account.
+ </para>
+
+ <para>
The number of WAL segment files in <filename>pg_xlog</> directory depends on
<varname>min_wal_size</>, <varname>max_wal_size</> and
the amount of WAL generated in previous checkpoint cycles. When old log
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 68e33eb..bee38ab 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7995,11 +7995,13 @@ LogCheckpointEnd(bool restartpoint)
sync_secs,
total_secs,
longest_secs,
+ sort_secs,
average_secs;
int write_usecs,
sync_usecs,
total_usecs,
longest_usecs,
+ sort_usecs,
average_usecs;
uint64 average_sync_time;
@@ -8030,6 +8032,10 @@ LogCheckpointEnd(bool restartpoint)
CheckpointStats.ckpt_end_t,
&total_secs, &total_usecs);
+ TimestampDifference(CheckpointStats.ckpt_sort_t,
+ CheckpointStats.ckpt_sort_end_t,
+ &sort_secs, &sort_usecs);
+
/*
* Timing values returned from CheckpointStats are in microseconds.
* Convert to the second plus microsecond form that TimestampDifference
@@ -8048,8 +8054,8 @@ LogCheckpointEnd(bool restartpoint)
elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
- "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
- "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+ "sort=%ld.%03d s, write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s;"
+ " sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
"distance=%d kB, estimate=%d kB",
restartpoint ? "restartpoint" : "checkpoint",
CheckpointStats.ckpt_bufs_written,
@@ -8057,6 +8063,7 @@ LogCheckpointEnd(bool restartpoint)
CheckpointStats.ckpt_segs_added,
CheckpointStats.ckpt_segs_removed,
CheckpointStats.ckpt_segs_recycled,
+ sort_secs, sort_usecs / 1000,
write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000,
total_secs, total_usecs / 1000,
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e4b25587..084bbfb 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -80,6 +80,7 @@ bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
double bgwriter_lru_multiplier = 2.0;
bool track_io_timing = false;
+bool checkpoint_sort = true;
/*
* How many buffers PrefetchBuffer callers should try to stay ahead of their
@@ -1562,6 +1563,101 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
}
/*
+ * Array of buffer ids of all buffers to checkpoint.
+ */
+static int *CheckpointBufferIds = NULL;
+
+/* Compare checkpoint buffers
+ */
+static int bufcmp(const int * pa, const int * pb)
+{
+ BufferDesc
+ *a = GetBufferDescriptor(*pa),
+ *b = GetBufferDescriptor(*pb);
+
+ /* tag: rnode, forkNum (different files), blockNum
+ * rnode: { spcNode (ignore: not really needed),
+ * dbNode (ignore: this is a directory), relNode }
+ * spcNode: table space oid, not that there are at least two
+ * (pg_global and pg_default).
+ */
+ /* compare relation */
+ if (a->tag.rnode.spcNode < b->tag.rnode.spcNode)
+ return -1;
+ else if (a->tag.rnode.spcNode > b->tag.rnode.spcNode)
+ return 1;
+ if (a->tag.rnode.relNode < b->tag.rnode.relNode)
+ return -1;
+ else if (a->tag.rnode.relNode > b->tag.rnode.relNode)
+ return 1;
+ /* same relation, compare fork */
+ else if (a->tag.forkNum < b->tag.forkNum)
+ return -1;
+ else if (a->tag.forkNum > b->tag.forkNum)
+ return 1;
+ /* same relation/fork, so same segmented "file", compare block number
+ * which are mapped on different segments depending on the number.
+ */
+ else if (a->tag.blockNum < b->tag.blockNum)
+ return -1;
+ else /* should not be the same block anyway... */
+ return 1;
+}
+
+static void
+AllocateCheckpointBufferIds(void)
+{
+ /* Safe worst case allocation, all buffers belong to the checkpoint...
+ * that is pretty unlikely.
+ */
+ CheckpointBufferIds = (int *) palloc(sizeof(int) * NBuffers);
+}
+
+/* Status of buffers to checkpoint for a particular tablespace,
+ * used internally in BufferSync.
+ * - spcNone: oid of the tablespace
+ * - num_to_write: number of pages counted for this tablespace
+ * - num_written: number of pages actually written out
+ * - index: scanning position in CheckpointBufferIds for this tablespace
+ */
+typedef struct TableSpaceCheckpointStatus {
+ int index;
+ int index_end;
+} TableSpaceCheckpointStatus;
+
+static int allocatedSpc = 0;
+static TableSpaceCheckpointStatus *spcStatus = NULL;
+static int numSpc;
+static int currSpc;
+
+static int
+nextCheckpointBuffer(void)
+{
+ int result;
+
+ if (numSpc == 0)
+ return -1;
+
+ currSpc = (currSpc + 1) % numSpc;
+
+ result = CheckpointBufferIds[spcStatus[currSpc].index];
+ spcStatus[currSpc].index++;
+
+ if (spcStatus[currSpc].index == spcStatus[currSpc].index_end)
+ {
+ if (currSpc < numSpc - 1)
+ {
+ TableSpaceCheckpointStatus tmp = spcStatus[currSpc];
+ spcStatus[currSpc] = spcStatus[numSpc - 1];
+ spcStatus[numSpc - 1] = tmp;
+ }
+ numSpc--;
+ }
+
+ return result;
+}
+
+/*
* BufferSync -- Write out all dirty buffers in the pool.
*
* This is called at checkpoint time to write out all dirty shared buffers.
@@ -1575,11 +1671,17 @@ static void
BufferSync(int flags)
{
int buf_id;
- int num_to_scan;
int num_to_write;
int num_written;
int mask = BM_DIRTY;
+ /*
+ * Lazy allocation: this function is called through the checkpointer,
+ * but also by initdb. Maybe the allocation could be moved to the callers.
+ */
+ if (CheckpointBufferIds == NULL)
+ AllocateCheckpointBufferIds();
+
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
@@ -1622,6 +1724,7 @@ BufferSync(int flags)
if ((bufHdr->flags & mask) == mask)
{
bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+ CheckpointBufferIds[num_to_write] = buf_id;
num_to_write++;
}
@@ -1633,18 +1736,97 @@ BufferSync(int flags)
TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
+ /* Build checkpoint tablespace buffer status & flush context arrays */
+
+ /*
+ * Sort buffer ids to help find sequential writes.
+ *
+ * Note: buffers are not locked in anyway, but that does not matter,
+ * this sorting is really advisory, if some buffer changes status during
+ * this pass it will be filtered out later. The only necessary property
+ * is that marked buffers do not move elsewhere. Also, qsort implementation
+ * should be resilient to occasional contradictions (cmp(a,b) != -cmp(b,a))
+ * because of these possible concurrent changes.
+ */
+ CheckpointStats.ckpt_sort_t = GetCurrentTimestamp();
+
+ if (checkpoint_sort && num_to_write > 1)
+ {
+ Oid lastspc;
+ Oid spc;
+ int i,
+ j;
+ volatile BufferDesc *bufHdr;
+
+ qsort(CheckpointBufferIds, num_to_write, sizeof(int),
+ (int(*)(const void *, const void *)) bufcmp);
+
+ if (allocatedSpc == 0)
+ {
+ allocatedSpc = 5;
+ spcStatus = (TableSpaceCheckpointStatus *)
+ palloc(sizeof(TableSpaceCheckpointStatus) * allocatedSpc);
+ }
+
+ bufHdr = GetBufferDescriptor(CheckpointBufferIds[0]);
+ spcStatus[0].index = 0;
+ lastspc = bufHdr->tag.rnode.spcNode;
+ j = 0;
+ for (i = 1; i < num_to_write; i++)
+ {
+ bufHdr = GetBufferDescriptor(CheckpointBufferIds[i]);
+
+ spc = bufHdr->tag.rnode.spcNode;
+ if (spc != lastspc && (bufHdr->flags & BM_CHECKPOINT_NEEDED) != 0)
+ {
+ if (allocatedSpc <= j)
+ {
+ allocatedSpc = j + 5;
+ spcStatus = (TableSpaceCheckpointStatus *)
+ repalloc(spcStatus, sizeof(TableSpaceCheckpointStatus) * allocatedSpc);
+ }
+
+ spcStatus[j].index_end = spcStatus[j + 1].index = i;
+ j++;
+ lastspc = spc;
+ }
+ }
+ spcStatus[j].index_end = num_to_write;
+ j++;
+ numSpc = j;
+ currSpc = 0;
+ }
+ else
+ {
+ if (allocatedSpc == 0)
+ {
+ allocatedSpc = 1;
+ spcStatus = (TableSpaceCheckpointStatus *)
+ palloc(sizeof(TableSpaceCheckpointStatus) * allocatedSpc);
+ }
+ spcStatus[0].index = 0;
+ spcStatus[0].index_end = num_to_write;
+ numSpc = 1;
+ currSpc = 0;
+ }
+
+ CheckpointStats.ckpt_sort_end_t = GetCurrentTimestamp();
+
/*
- * Loop over all buffers again, and write the ones (still) marked with
- * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point
- * since we might as well dump soon-to-be-recycled buffers first.
+ * Loop over buffers to write through CheckpointBufferIds,
+ * and write the ones (still) marked with BM_CHECKPOINT_NEEDED,
+ * with some round robin over table spaces so as to balance writes,
+ * so that buffer writes move forward roughly proportionally for each
+ * tablespace.
*
- * Note that we don't read the buffer alloc count here --- that should be
- * left untouched till the next BgBufferSync() call.
+ * Termination: if a tablespace is selected by the inner while loop
+ * (see argument there), its index is incremented and will eventually
+ * reach num_to_write, mark this table space scanning as done and
+ * decrement the number of active spaces, which will thus reach 0.
*/
- buf_id = StrategySyncStart(NULL, NULL);
- num_to_scan = NBuffers;
num_written = 0;
- while (num_to_scan-- > 0)
+
+ while ((buf_id = nextCheckpointBuffer()) != -1)
{
volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
@@ -1669,28 +1851,11 @@ BufferSync(int flags)
num_written++;
/*
- * We know there are at most num_to_write buffers with
- * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
- * num_written reaches num_to_write.
- *
- * Note that num_written doesn't include buffers written by
- * other backends, or by the bgwriter cleaning scan. That
- * means that the estimate of how much progress we've made is
- * conservative, and also that this test will often fail to
- * trigger. But it seems worth making anyway.
- */
- if (num_written >= num_to_write)
- break;
-
- /*
* Sleep to throttle our I/O rate.
*/
CheckpointWriteDelay(flags, (double) num_written / num_to_write);
}
}
-
- if (++buf_id >= NBuffers)
- buf_id = 0;
}
/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1b7b914..e07daca 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1013,6 +1013,17 @@ static struct config_bool ConfigureNamesBool[] =
false,
NULL, NULL, NULL
},
+
+ {
+ {"checkpoint_sort", PGC_SIGHUP, WAL_CHECKPOINTS,
+ gettext_noop("Whether disk-page buffers are sorted on checkpoints."),
+ NULL
+ },
+ &checkpoint_sort,
+ true,
+ NULL, NULL, NULL
+ },
+
{
{"log_connections", PGC_SU_BACKEND, LOGGING_WHAT,
gettext_noop("Logs each successful connection."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e5d275d..e84f380 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -201,6 +201,7 @@
#max_wal_size = 1GB
#min_wal_size = 80MB
#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0
+#checkpoint_sort = on # sort buffers on checkpoint
#checkpoint_warning = 30s # 0 disables
# - Archiving -
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 6dacee2..dbd4757 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -186,6 +186,8 @@ extern bool XLOG_DEBUG;
typedef struct CheckpointStatsData
{
TimestampTz ckpt_start_t; /* start of checkpoint */
+ TimestampTz ckpt_sort_t; /* start buffer sorting */
+ TimestampTz ckpt_sort_end_t; /* end of sorting */
TimestampTz ckpt_write_t; /* start of flushing buffers */
TimestampTz ckpt_sync_t; /* start of fsyncs */
TimestampTz ckpt_sync_end_t; /* end of fsyncs */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ec0a254..c228f39 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -54,6 +54,7 @@ extern int bgwriter_lru_maxpages;
extern double bgwriter_lru_multiplier;
extern bool track_io_timing;
extern int target_prefetch_pages;
+extern bool checkpoint_sort;
/* in buf_init.c */
extern PGDLLIMPORT char *BufferBlocks;
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers