On 20.07.2013 19:28, Greg Smith wrote:
On 7/20/13 4:48 AM, didier wrote:
With your tests did you try to write the hot buffers first? ie buffers
with a high refcount, either by sorting them on refcount or at least
sweeping the buffer list in reverse?

I never tried that version. After a few rounds of seeing that all
changes I tried were just rearranging the good and bad cases, I got
pretty bored with trying new changes in that same style.

It doesn't seem like we're getting anywhere with minor changes to the existing logic. The reason I brought up sorting the writes in the first place is that it allows you to fsync() each segment after it's written, rather than doing all the writes first, and then fsyncing all the relations.

Mitsumasa-san, since you have the test rig ready, could you try the attached patch please? It scans the buffer cache several times, writing out all the dirty buffers for segment A first, then fsyncs it, then all dirty buffers for segment B, and so on. The patch is ugly, but if it proves to be helpful, we can spend the time to clean it up.

- Heikki
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8079226..14149a9 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1210,10 +1210,15 @@ static void
 BufferSync(int flags)
 {
 	int			buf_id;
-	int			num_to_scan;
+	int			buf_id_start;
 	int			num_to_write;
 	int			num_written;
 	int			mask = BM_DIRTY;
+	bool		target_chosen = false;
+	RelFileNode	target_rnode = {0, 0, 0};
+	ForkNumber	target_forknum = 0;
+	int			target_segno = 0;
+	int			target_bufs = 0;
 
 	/* Make sure we can handle the pin inside SyncOneBuffer */
 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
@@ -1275,10 +1280,10 @@ BufferSync(int flags)
 	 * Note that we don't read the buffer alloc count here --- that should be
 	 * left untouched till the next BgBufferSync() call.
 	 */
-	buf_id = StrategySyncStart(NULL, NULL);
-	num_to_scan = NBuffers;
+	buf_id_start = 0;
 	num_written = 0;
-	while (num_to_scan-- > 0)
+	target_chosen = false;
+	for (buf_id = 0; buf_id < NBuffers; buf_id++)
 	{
 		volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
 
@@ -1294,7 +1299,12 @@ BufferSync(int flags)
 		 * write the buffer though we didn't need to.  It doesn't seem worth
 		 * guarding against this, though.
 		 */
-		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
+		/* the above reasoning applies to the target checks too */
+		if ((!target_chosen ||
+			 (RelFileNodeEquals(target_rnode, bufHdr->tag.rnode) &&
+			  target_forknum == bufHdr->tag.forkNum &&
+			  target_segno == bufHdr->tag.blockNum / RELSEG_SIZE)) &&
+			bufHdr->flags & BM_CHECKPOINT_NEEDED)
 		{
 			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
 			{
@@ -1320,11 +1330,34 @@ BufferSync(int flags)
 				 * Sleep to throttle our I/O rate.
 				 */
 				CheckpointWriteDelay(flags, (double) num_written / num_to_write);
+
+				/* Find other buffers belonging to this segment */
+				if (!target_chosen)
+				{
+					LockBufHdr(bufHdr);
+					target_rnode = bufHdr->tag.rnode;
+					target_forknum = bufHdr->tag.forkNum;
+					target_segno = bufHdr->tag.blockNum / RELSEG_SIZE;
+					UnlockBufHdr(bufHdr);
+					target_bufs = 0;
+					buf_id_start = buf_id;
+					target_chosen = true;
+				}
+				target_bufs++;
 			}
 		}
 
-		if (++buf_id >= NBuffers)
-			buf_id = 0;
+		if (buf_id == NBuffers - 1 && target_chosen)
+		{
+			if (log_checkpoints)
+				elog(LOG, "checkpoint sync: wrote %d buffers for target, syncing",
+					 target_bufs);
+
+			smgrsyncrel(target_rnode, target_forknum, target_segno);
+			target_chosen = false;
+			/* continue the scan where we left before we chose the target */
+			buf_id = buf_id_start;
+		}
 	}
 
 	/*
@@ -1825,10 +1858,11 @@ CheckPointBuffers(int flags)
 {
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
+	smgrsyncbegin();
 	BufferSync(flags);
 	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
-	smgrsync();
+	smgrsyncend();
 	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
 }
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e629181..4e4eb03 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -187,6 +187,7 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
 			 BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 		   MdfdVec *seg);
+static void mdsyncguts(PendingOperationEntry *entry, ForkNumber forknum, int segno);
 
 
 /*
@@ -235,7 +236,8 @@ SetForwardFsyncRequests(void)
 	/* Perform any pending fsyncs we may have queued up, then drop table */
 	if (pendingOpsTable)
 	{
-		mdsync();
+		mdsyncbegin();
+		mdsyncend();
 		hash_destroy(pendingOpsTable);
 	}
 	pendingOpsTable = NULL;
@@ -970,27 +972,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 	}
 }
 
+static bool mdsync_in_progress = false;
+/* Statistics on sync times (now file-scope so mdsyncguts() can update them) */
+static int	processed = 0;
+static	uint64		longest = 0;
+static	uint64		total_elapsed = 0;
+
 /*
  *	mdsync() -- Sync previous writes to stable storage.
  */
 void
-mdsync(void)
+mdsyncbegin(void)
 {
-	static bool mdsync_in_progress = false;
-
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
-	int			absorb_counter;
-
-	/* Statistics on sync times */
-	int			processed = 0;
-	instr_time	sync_start,
-				sync_end,
-				sync_diff;
-	uint64		elapsed;
-	uint64		longest = 0;
-	uint64		total_elapsed = 0;
-
 	/*
 	 * This is only called during checkpoints, and checkpoints should only
 	 * occur in processes that have created a pendingOpsTable.
@@ -1036,6 +1029,8 @@ mdsync(void)
 	if (mdsync_in_progress)
 	{
 		/* prior try failed, so update any stale cycle_ctr values */
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
 		hash_seq_init(&hstat, pendingOpsTable);
 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 		{
@@ -1048,6 +1043,151 @@ mdsync(void)
 
+	/* Reset per-checkpoint stats; flag detects failure before loop end */
+	mdsync_in_progress = true; processed = 0; longest = 0; total_elapsed = 0;
+}
+
+void
+mdsyncrel(RelFileNode rnode, ForkNumber forknum, int segno)
+{
+	PendingOperationEntry *entry;
+
+	/*
+	 * If fsync is off then we don't have to bother opening the
+	 * file at all.  (We delay checking until this point so that
+	 * changing fsync on the fly behaves sensibly.)
+	 */
+	if (!enableFsync)
+		return;
+
+	entry = hash_search(pendingOpsTable, &rnode, HASH_FIND, NULL);
+
+	if (entry)
+	{
+		if (bms_is_member(segno, entry->requests[forknum]))
+		{
+			mdsyncguts(entry, forknum, segno);
+			entry->requests[forknum] = bms_del_member(entry->requests[forknum], segno);
+		}
+	}
+}
+
+static void
+mdsyncguts(PendingOperationEntry *entry, ForkNumber forknum, int segno)
+{
+	int			failures;
+	instr_time	sync_start,
+				sync_end,
+				sync_diff;
+	uint64		elapsed;
+
+	/*
+	 * The fsync table could contain requests to fsync segments
+	 * that have been deleted (unlinked) by the time we get to
+	 * them. Rather than just hoping an ENOENT (or EACCES on
+	 * Windows) error can be ignored, what we do on error is
+	 * absorb pending requests and then retry.	Since mdunlink()
+	 * queues a "cancel" message before actually unlinking, the
+	 * fsync request is guaranteed to be marked canceled after the
+	 * absorb if it really was this case. DROP DATABASE likewise
+	 * has to tell us to forget fsync requests before it starts
+	 * deletions.
+	 */
+	for (failures = 0;; failures++) /* loop exits at "break" */
+	{
+		SMgrRelation reln;
+		MdfdVec    *seg;
+		char	   *path;
+		int			save_errno;
+
+		/*
+		 * Find or create an smgr hash entry for this relation.
+		 * This may seem a bit unclean -- md calling smgr?	But
+		 * it's really the best solution.  It ensures that the
+		 * open file reference isn't permanently leaked if we get
+		 * an error here. (You may say "but an unreferenced
+		 * SMgrRelation is still a leak!" Not really, because the
+		 * only case in which a checkpoint is done by a process
+		 * that isn't about to shut down is in the checkpointer,
+		 * and it will periodically do smgrcloseall(). This fact
+		 * justifies our not closing the reln in the success path
+		 * either, which is a good thing since in non-checkpointer
+		 * cases we couldn't safely do that.)
+		 */
+		reln = smgropen(entry->rnode, InvalidBackendId);
+
+		/* Attempt to open and fsync the target segment */
+		seg = _mdfd_getseg(reln, forknum,
+						   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
+						   false, EXTENSION_RETURN_NULL);
+
+		INSTR_TIME_SET_CURRENT(sync_start);
+
+		if (seg != NULL && FileSync(seg->mdfd_vfd) >= 0)
+		{
+			/* Success; update statistics about sync timing */
+			INSTR_TIME_SET_CURRENT(sync_end);
+			sync_diff = sync_end;
+			INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+			elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+			if (elapsed > longest)
+				longest = elapsed;
+			total_elapsed += elapsed;
+			processed++;
+			if (log_checkpoints)
+				elog(LOG, "checkpoint sync: number=%d file=%s time=%.3f msec",
+					 processed,
+					 FilePathName(seg->mdfd_vfd),
+					 (double) elapsed / 1000);
+
+			break;	/* out of retry loop */
+		}
+
+		/* Compute file name for use in message */
+		save_errno = errno;
+		path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
+		errno = save_errno;
+
+		/*
+		 * It is possible that the relation has been dropped or
+		 * truncated since the fsync request was entered.
+		 * Therefore, allow ENOENT, but only if we didn't fail
+		 * already on this file.  This applies both for
+		 * _mdfd_getseg() and for FileSync, since fd.c might have
+		 * closed the file behind our back.
+		 *
+		 * XXX is there any point in allowing more than one retry?
+		 * Don't see one at the moment, but easy to change the
+		 * test here if so.
+		 */
+		if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\": %m",
+							path)));
+		else
+			ereport(DEBUG1,
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\" but retrying: %m",
+							path)));
+		pfree(path);
+
+		/*
+		 * Absorb incoming requests and check to see if a cancel
+		 * arrived for this relation fork.
+		 */
+		AbsorbFsyncRequests();
+		/* NOTE(review): mdsyncend's local absorb_counter is not in scope here */
+
+		if (entry->canceled[forknum])
+			break;
+	}				/* end retry loop */
+}
+
+void
+mdsyncend(void)
+{
+	int			absorb_counter;
+	HASH_SEQ_STATUS hstat;
+	PendingOperationEntry *entry;
 
 	/* Now scan the hashtable for fsync requests to process */
 	absorb_counter = FSYNCS_PER_ABSORB;
@@ -1088,8 +1228,6 @@ mdsync(void)
 
 			while ((segno = bms_first_member(requests)) >= 0)
 			{
-				int			failures;
-
 				/*
 				 * If fsync is off then we don't have to bother opening the
 				 * file at all.  (We delay checking until this point so that
@@ -1111,109 +1249,7 @@ mdsync(void)
 					absorb_counter = FSYNCS_PER_ABSORB;
 				}
 
-				/*
-				 * The fsync table could contain requests to fsync segments
-				 * that have been deleted (unlinked) by the time we get to
-				 * them. Rather than just hoping an ENOENT (or EACCES on
-				 * Windows) error can be ignored, what we do on error is
-				 * absorb pending requests and then retry.	Since mdunlink()
-				 * queues a "cancel" message before actually unlinking, the
-				 * fsync request is guaranteed to be marked canceled after the
-				 * absorb if it really was this case. DROP DATABASE likewise
-				 * has to tell us to forget fsync requests before it starts
-				 * deletions.
-				 */
-				for (failures = 0;; failures++) /* loop exits at "break" */
-				{
-					SMgrRelation reln;
-					MdfdVec    *seg;
-					char	   *path;
-					int			save_errno;
-
-					/*
-					 * Find or create an smgr hash entry for this relation.
-					 * This may seem a bit unclean -- md calling smgr?	But
-					 * it's really the best solution.  It ensures that the
-					 * open file reference isn't permanently leaked if we get
-					 * an error here. (You may say "but an unreferenced
-					 * SMgrRelation is still a leak!" Not really, because the
-					 * only case in which a checkpoint is done by a process
-					 * that isn't about to shut down is in the checkpointer,
-					 * and it will periodically do smgrcloseall(). This fact
-					 * justifies our not closing the reln in the success path
-					 * either, which is a good thing since in non-checkpointer
-					 * cases we couldn't safely do that.)
-					 */
-					reln = smgropen(entry->rnode, InvalidBackendId);
-
-					/* Attempt to open and fsync the target segment */
-					seg = _mdfd_getseg(reln, forknum,
-							 (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
-									   false, EXTENSION_RETURN_NULL);
-
-					INSTR_TIME_SET_CURRENT(sync_start);
-
-					if (seg != NULL &&
-						FileSync(seg->mdfd_vfd) >= 0)
-					{
-						/* Success; update statistics about sync timing */
-						INSTR_TIME_SET_CURRENT(sync_end);
-						sync_diff = sync_end;
-						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
-						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
-						if (elapsed > longest)
-							longest = elapsed;
-						total_elapsed += elapsed;
-						processed++;
-						if (log_checkpoints)
-							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-								 processed,
-								 FilePathName(seg->mdfd_vfd),
-								 (double) elapsed / 1000);
-
-						break;	/* out of retry loop */
-					}
-
-					/* Compute file name for use in message */
-					save_errno = errno;
-					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
-					errno = save_errno;
-
-					/*
-					 * It is possible that the relation has been dropped or
-					 * truncated since the fsync request was entered.
-					 * Therefore, allow ENOENT, but only if we didn't fail
-					 * already on this file.  This applies both for
-					 * _mdfd_getseg() and for FileSync, since fd.c might have
-					 * closed the file behind our back.
-					 *
-					 * XXX is there any point in allowing more than one retry?
-					 * Don't see one at the moment, but easy to change the
-					 * test here if so.
-					 */
-					if (!FILE_POSSIBLY_DELETED(errno) ||
-						failures > 0)
-						ereport(ERROR,
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\": %m",
-										path)));
-					else
-						ereport(DEBUG1,
-								(errcode_for_file_access(),
-						errmsg("could not fsync file \"%s\" but retrying: %m",
-							   path)));
-					pfree(path);
-
-					/*
-					 * Absorb incoming requests and check to see if a cancel
-					 * arrived for this relation fork.
-					 */
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
-
-					if (entry->canceled[forknum])
-						break;
-				}				/* end retry loop */
+				mdsyncguts(entry, forknum, segno);
 			}
 			bms_free(requests);
 		}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index f7f1437..00b000d 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -58,7 +58,9 @@ typedef struct f_smgr
 											  BlockNumber nblocks);
 	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_pre_ckpt) (void);		/* may be NULL */
-	void		(*smgr_sync) (void);	/* may be NULL */
+	void		(*smgr_sync_begin) (void);	/* may be NULL */
+	void		(*smgr_sync_rel) (RelFileNode rnode, ForkNumber forknum, int segno);	/* may be NULL */
+	void		(*smgr_sync_end) (void);	/* may be NULL */
 	void		(*smgr_post_ckpt) (void);		/* may be NULL */
 } f_smgr;
 
@@ -67,7 +69,7 @@ static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
 		mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
-		mdpreckpt, mdsync, mdpostckpt
+	 mdpreckpt, mdsyncbegin, mdsyncrel, mdsyncend, mdpostckpt
 	}
 };
 
@@ -705,17 +707,47 @@ smgrpreckpt(void)
 }
 
 /*
- *	smgrsync() -- Sync files to disk during checkpoint.
+ *	smgrsyncbegin() -- Prepare to sync files to disk during checkpoint.
  */
 void
-smgrsync(void)
+smgrsyncbegin(void)
 {
 	int			i;
 
 	for (i = 0; i < NSmgr; i++)
 	{
-		if (smgrsw[i].smgr_sync)
-			(*(smgrsw[i].smgr_sync)) ();
+		if (smgrsw[i].smgr_sync_begin)
+			(*(smgrsw[i].smgr_sync_begin)) ();
+	}
+}
+
+/*
+ *	smgrsyncrel() -- Sync one relation segment to disk during checkpoint.
+ */
+void
+smgrsyncrel(RelFileNode rnode, ForkNumber forknum, int segno)
+{
+	int			i;
+
+	for (i = 0; i < NSmgr; i++)
+	{
+		if (smgrsw[i].smgr_sync_rel)
+			(*(smgrsw[i].smgr_sync_rel)) (rnode, forknum, segno);
+	}
+}
+
+/*
+ *	smgrsyncend() -- Sync any remaining files to disk, finishing the checkpoint sync.
+ */
+void
+smgrsyncend(void)
+{
+	int			i;
+
+	for (i = 0; i < NSmgr; i++)
+	{
+		if (smgrsw[i].smgr_sync_end)
+			(*(smgrsw[i].smgr_sync_end)) ();
 	}
 }
 
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 98b6f13..4725221 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -100,7 +100,9 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void smgrpreckpt(void);
-extern void smgrsync(void);
+extern void smgrsyncbegin(void);
+extern void smgrsyncrel(RelFileNode rnode, ForkNumber forknum, int segno);
+extern void smgrsyncend(void);
 extern void smgrpostckpt(void);
 extern void AtEOXact_SMgr(void);
 
@@ -126,7 +128,9 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber nblocks);
 extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void mdpreckpt(void);
-extern void mdsync(void);
+extern void mdsyncbegin(void);
+extern void mdsyncrel(RelFileNode rnode, ForkNumber forknum, int segno);
+extern void mdsyncend(void);
 extern void mdpostckpt(void);
 
 extern void SetForwardFsyncRequests(void);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to