On 07/26/2015 06:01 PM, Fabien COELHO wrote:
Attached is very minor v5 update which does a rebase & completes the
cleanup of doing a full sort instead of a chunked sort.

Some thoughts on this:

* I think we should drop the "flush" part of this for now. It's not as clearly beneficial as the sorting part, and adds a great deal more code complexity. And it's orthogonal to the sorting patch, so we can deal with it separately.

* Is it really necessary to parallelize the I/O among tablespaces? I can see the point, but I wonder if it makes any difference in practice.

* Is there ever any harm in sorting the buffers? The GUC is useful for benchmarking, but could we leave it out of the final patch?

* Do we need to worry about exceeding the 1 GB allocation limit in AllocateCheckpointBufferIds? It's enough for 2 TB of shared_buffers. That's a lot, but it's not totally crazy these days that someone might do that. At the very least, we need to lower the maximum of shared_buffers so that you can't hit that limit.

I ripped out the "flushing" part, keeping only the sorting. I refactored the logic in BufferSync() a bit. There's now a separate function, nextCheckpointBuffer(), that returns the next buffer ID from the sorted list. The tablespace-parallelization behaviour is encapsulated there, keeping the code in BufferSync() much simpler. See attached. Needs some minor cleanup and commenting still before committing, and I haven't done any testing besides a simple "make check".

- Heikki

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e900dcc..1cec243 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2454,6 +2454,28 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-checkpoint-sort" xreflabel="checkpoint_sort">
+      <term><varname>checkpoint_sort</varname> (<type>bool</type>)
+      <indexterm>
+       <primary><varname>checkpoint_sort</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Whether to sort buffers before writing them out to disk on checkpoint.
+        For HDD storage, this setting allows grouping together
+        neighboring pages written to disk, thus improving performance by
+        reducing random write activity.
+        This sorting should have limited performance effects on SSD backends
+        as such storage has good random write performance, but it may
+        help with wear-leveling, so it may be worth keeping anyway.
+        The default is <literal>on</>.
+        This parameter can only be set in the <filename>postgresql.conf</>
+        file or on the server command line.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning">
       <term><varname>checkpoint_warning</varname> (<type>integer</type>)
       <indexterm>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index e3941c9..f538698 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -546,6 +546,18 @@
   </para>
 
   <para>
+   When hard-disk drives (HDD) are used for data storage,
+   <xref linkend="guc-checkpoint-sort"> allows sorting pages
+   so that neighboring pages on disk will be flushed together by
+   checkpoints, reducing the random write load and improving performance.
+   If solid-state drives (SSD) are used, sorting pages induces no benefit
+   as their random write I/O performance is good: this feature could then
+   be disabled by setting <varname>checkpoint_sort</> to <literal>off</>.
+   It is possible that sorting may help with SSD wear leveling, so it may
+   be kept on that account.
+  </para>
+
+  <para>
    The number of WAL segment files in <filename>pg_xlog</> directory depends on
    <varname>min_wal_size</>, <varname>max_wal_size</> and
    the amount of WAL generated in previous checkpoint cycles. When old log
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 68e33eb..bee38ab 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7995,11 +7995,13 @@ LogCheckpointEnd(bool restartpoint)
 				sync_secs,
 				total_secs,
 				longest_secs,
+				sort_secs,
 				average_secs;
 	int			write_usecs,
 				sync_usecs,
 				total_usecs,
 				longest_usecs,
+				sort_usecs,
 				average_usecs;
 	uint64		average_sync_time;
 
@@ -8030,6 +8032,10 @@ LogCheckpointEnd(bool restartpoint)
 						CheckpointStats.ckpt_end_t,
 						&total_secs, &total_usecs);
 
+	TimestampDifference(CheckpointStats.ckpt_sort_t,
+						CheckpointStats.ckpt_sort_end_t,
+						&sort_secs, &sort_usecs);
+
 	/*
 	 * Timing values returned from CheckpointStats are in microseconds.
 	 * Convert to the second plus microsecond form that TimestampDifference
@@ -8048,8 +8054,8 @@ LogCheckpointEnd(bool restartpoint)
 
 	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
 		 "%d transaction log file(s) added, %d removed, %d recycled; "
-		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+		 "sort=%ld.%03d s, write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s;"
+		 " sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
 		 "distance=%d kB, estimate=%d kB",
 		 restartpoint ? "restartpoint" : "checkpoint",
 		 CheckpointStats.ckpt_bufs_written,
@@ -8057,6 +8063,7 @@ LogCheckpointEnd(bool restartpoint)
 		 CheckpointStats.ckpt_segs_added,
 		 CheckpointStats.ckpt_segs_removed,
 		 CheckpointStats.ckpt_segs_recycled,
+		 sort_secs, sort_usecs / 1000,
 		 write_secs, write_usecs / 1000,
 		 sync_secs, sync_usecs / 1000,
 		 total_secs, total_usecs / 1000,
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e4b25587..084bbfb 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -80,6 +80,7 @@ bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
 double		bgwriter_lru_multiplier = 2.0;
 bool		track_io_timing = false;
+bool		checkpoint_sort = true;
 
 /*
  * How many buffers PrefetchBuffer callers should try to stay ahead of their
@@ -1562,6 +1563,101 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 }
 
 /*
+ * Array of buffer ids of all buffers to checkpoint.
+ */
+static int *CheckpointBufferIds = NULL;
+
+/* qsort comparator: order checkpoint buffer ids by their buffer tag
+ */
+static int bufcmp(const int * pa, const int * pb)
+{
+	BufferDesc
+		*a = GetBufferDescriptor(*pa),
+		*b = GetBufferDescriptor(*pb);
+
+	/* Sort key: rnode, forkNum (different files), blockNum.
+	 * rnode: { spcNode (compared first so buffers group per tablespace),
+	 *   dbNode (ignore: this is a directory), relNode }
+	 * spcNode: table space oid, note that there are at least two
+	 * (pg_global and pg_default).
+	 */
+	/* compare relation */
+	if (a->tag.rnode.spcNode < b->tag.rnode.spcNode)
+		return -1;
+	else if (a->tag.rnode.spcNode > b->tag.rnode.spcNode)
+		return 1;
+	if (a->tag.rnode.relNode < b->tag.rnode.relNode)
+		return -1;
+	else if (a->tag.rnode.relNode > b->tag.rnode.relNode)
+		return 1;
+	/* same relation, compare fork */
+	else if (a->tag.forkNum < b->tag.forkNum)
+		return -1;
+	else if (a->tag.forkNum > b->tag.forkNum)
+		return 1;
+	/* same relation/fork, so same segmented "file", compare block number
+	 * which are mapped on different segments depending on the number.
+	 */
+	else if (a->tag.blockNum < b->tag.blockNum)
+		return -1;
+	else /* return 0 on equal keys: qsort requires a consistent ordering */
+		return (a->tag.blockNum > b->tag.blockNum) ? 1 : 0;
+}
+
+static void
+AllocateCheckpointBufferIds(void)
+{
+	/* Worst-case allocation: every buffer may belong to the checkpoint.
+	 * NOTE(review): at ~2 TB of shared_buffers this would exceed palloc's
+	 * 1 GB limit -- TODO confirm, and guard or raise that limit. */
+	CheckpointBufferIds = (int *) palloc(sizeof(int) * NBuffers);
+}
+
+/* Scan state of the buffers to checkpoint for one tablespace,
+ * used internally in BufferSync.
+ * - index: current position in CheckpointBufferIds for this tablespace
+ * - index_end: one past the last entry in CheckpointBufferIds that
+ *   belongs to this tablespace
+ * (The sorted CheckpointBufferIds array groups buffers by tablespace.)
+ */
+typedef struct TableSpaceCheckpointStatus {
+	int index;
+	int index_end;
+} TableSpaceCheckpointStatus;
+
+static int		allocatedSpc = 0;	/* allocated length of spcStatus */
+static TableSpaceCheckpointStatus *spcStatus = NULL;
+static int		numSpc;		/* tablespaces still being scanned */
+static int		currSpc;	/* tablespace served by the last call */
+/* Return next buffer id to write, round-robin over tablespaces; -1 when done. */
+static int
+nextCheckpointBuffer(void)
+{
+	int			result;
+
+	if (numSpc == 0)		/* no tablespace left to scan: done */
+		return -1;
+
+	currSpc = (currSpc + 1) % numSpc;	/* round-robin over remaining tablespaces */
+
+	result = CheckpointBufferIds[spcStatus[currSpc].index];
+	spcStatus[currSpc].index++;
+
+	if (spcStatus[currSpc].index == spcStatus[currSpc].index_end)	/* tablespace done */
+	{
+		if (currSpc < numSpc - 1)	/* swap finished slot out of the active range */
+		{
+			TableSpaceCheckpointStatus tmp = spcStatus[currSpc];
+			spcStatus[currSpc] = spcStatus[numSpc - 1];
+			spcStatus[numSpc - 1] = tmp;
+		}
+		numSpc--;
+	}
+
+	return result;
+}
+
+/*
  * BufferSync -- Write out all dirty buffers in the pool.
  *
  * This is called at checkpoint time to write out all dirty shared buffers.
@@ -1575,11 +1671,17 @@ static void
 BufferSync(int flags)
 {
 	int			buf_id;
-	int			num_to_scan;
 	int			num_to_write;
 	int			num_written;
 	int			mask = BM_DIRTY;
 
+	/*
+	 * Lazy allocation: this function is called through the checkpointer,
+	 * but also by initdb. Maybe the allocation could be moved to the callers.
+	 */
+	if (CheckpointBufferIds == NULL)
+		AllocateCheckpointBufferIds();
+
 	/* Make sure we can handle the pin inside SyncOneBuffer */
 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
@@ -1622,6 +1724,7 @@ BufferSync(int flags)
 		if ((bufHdr->flags & mask) == mask)
 		{
 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+			CheckpointBufferIds[num_to_write] = buf_id;
 			num_to_write++;
 		}
 
@@ -1633,18 +1736,97 @@ BufferSync(int flags)
 
 	TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
 
+	/* Build checkpoint tablespace buffer status & flush context arrays */
+
+	/*
+	 * Sort buffer ids to help find sequential writes.
+	 *
+	 * Note: buffers are not locked in any way, but that does not matter,
+	 * this sorting is really advisory, if some buffer changes status during
+	 * this pass it will be filtered out later.  The only necessary property
+	 * is that marked buffers do not move elsewhere.  Also, qsort implementation
+	 * should be resilient to occasional contradictions (cmp(a,b) != -cmp(b,a))
+	 * because of these possible concurrent changes.
+	 */
+	CheckpointStats.ckpt_sort_t = GetCurrentTimestamp();
+
+	if (checkpoint_sort && num_to_write > 1)
+	{
+		Oid			lastspc;
+		Oid			spc;
+		int			i,
+					j;
+		volatile BufferDesc *bufHdr;
+
+		qsort(CheckpointBufferIds, num_to_write, sizeof(int),
+				  (int(*)(const void *, const void *)) bufcmp);
+
+		if (allocatedSpc == 0)
+		{
+			allocatedSpc = 5;	/* initial guess at number of tablespaces */
+			spcStatus = (TableSpaceCheckpointStatus *)
+				palloc(sizeof(TableSpaceCheckpointStatus) * allocatedSpc);
+		}
+
+		bufHdr = GetBufferDescriptor(CheckpointBufferIds[0]);
+		spcStatus[0].index = 0;
+		lastspc = bufHdr->tag.rnode.spcNode;
+		j = 0;
+		for (i = 1; i < num_to_write; i++)
+		{
+			bufHdr = GetBufferDescriptor(CheckpointBufferIds[i]);
+
+			spc = bufHdr->tag.rnode.spcNode;
+			if (spc != lastspc && (bufHdr->flags & BM_CHECKPOINT_NEEDED) != 0)
+			{
+				if (allocatedSpc <= j + 1)	/* slots j and j+1 are written below */
+				{
+					allocatedSpc = j + 5;
+					spcStatus = (TableSpaceCheckpointStatus *)
+						repalloc(spcStatus, sizeof(TableSpaceCheckpointStatus) * allocatedSpc);
+				}
+
+				spcStatus[j].index_end = spcStatus[j + 1].index = i;
+				j++;
+				lastspc = spc;
+			}
+		}
+		spcStatus[j].index_end = num_to_write;
+		j++;
+		numSpc = j;
+		currSpc = 0;
+	}
+	else
+	{
+		if (allocatedSpc == 0)
+		{
+			allocatedSpc = 1;
+			spcStatus = (TableSpaceCheckpointStatus *)
+				palloc(sizeof(TableSpaceCheckpointStatus) * allocatedSpc);
+		}
+		spcStatus[0].index = 0;
+		spcStatus[0].index_end = num_to_write;
+		numSpc = 1;
+		currSpc = 0;
+	}
+
+	CheckpointStats.ckpt_sort_end_t = GetCurrentTimestamp();
+
 	/*
-	 * Loop over all buffers again, and write the ones (still) marked with
-	 * BM_CHECKPOINT_NEEDED.  In this loop, we start at the clock sweep point
-	 * since we might as well dump soon-to-be-recycled buffers first.
+	 * Loop over buffers to write through CheckpointBufferIds,
+	 * and write the ones (still) marked with BM_CHECKPOINT_NEEDED,
+	 * with some round robin over table spaces so as to balance writes,
+	 * so that buffer writes move forward roughly proportionally for each
+	 * tablespace.
 	 *
-	 * Note that we don't read the buffer alloc count here --- that should be
-	 * left untouched till the next BgBufferSync() call.
+	 * Termination: if a tablespace is selected by the inner while loop
+	 * (see argument there), its index is incremented and will eventually
+	 * reach num_to_write, mark this table space scanning as done and
+	 * decrement the number of active spaces, which will thus reach 0.
 	 */
-	buf_id = StrategySyncStart(NULL, NULL);
-	num_to_scan = NBuffers;
 	num_written = 0;
-	while (num_to_scan-- > 0)
+
+	while ((buf_id = nextCheckpointBuffer()) != -1)
 	{
 		volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 
@@ -1669,28 +1851,11 @@ BufferSync(int flags)
 				num_written++;
 
 				/*
-				 * We know there are at most num_to_write buffers with
-				 * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
-				 * num_written reaches num_to_write.
-				 *
-				 * Note that num_written doesn't include buffers written by
-				 * other backends, or by the bgwriter cleaning scan. That
-				 * means that the estimate of how much progress we've made is
-				 * conservative, and also that this test will often fail to
-				 * trigger.  But it seems worth making anyway.
-				 */
-				if (num_written >= num_to_write)
-					break;
-
-				/*
 				 * Sleep to throttle our I/O rate.
 				 */
 				CheckpointWriteDelay(flags, (double) num_written / num_to_write);
 			}
 		}
-
-		if (++buf_id >= NBuffers)
-			buf_id = 0;
 	}
 
 	/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 1b7b914..e07daca 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1013,6 +1013,17 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+
+	{
+		{"checkpoint_sort", PGC_SIGHUP, WAL_CHECKPOINTS,
+		 gettext_noop("Whether disk-page buffers are sorted on checkpoints."),
+		 NULL
+		},
+		&checkpoint_sort,
+		true,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"log_connections", PGC_SU_BACKEND, LOGGING_WHAT,
 			gettext_noop("Logs each successful connection."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e5d275d..e84f380 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -201,6 +201,7 @@
 #max_wal_size = 1GB
 #min_wal_size = 80MB
 #checkpoint_completion_target = 0.5	# checkpoint target duration, 0.0 - 1.0
+#checkpoint_sort = on			# sort buffers on checkpoint
 #checkpoint_warning = 30s		# 0 disables
 
 # - Archiving -
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 6dacee2..dbd4757 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -186,6 +186,8 @@ extern bool XLOG_DEBUG;
 typedef struct CheckpointStatsData
 {
 	TimestampTz ckpt_start_t;	/* start of checkpoint */
+	TimestampTz ckpt_sort_t;    /* start buffer sorting */
+	TimestampTz ckpt_sort_end_t;      /* end of sorting */
 	TimestampTz ckpt_write_t;	/* start of flushing buffers */
 	TimestampTz ckpt_sync_t;	/* start of fsyncs */
 	TimestampTz ckpt_sync_end_t;	/* end of fsyncs */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ec0a254..c228f39 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -54,6 +54,7 @@ extern int	bgwriter_lru_maxpages;
 extern double bgwriter_lru_multiplier;
 extern bool track_io_timing;
 extern int	target_prefetch_pages;
+extern bool checkpoint_sort;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to