Jeff Janes wrote:
I've attached a tiny patch to apply over yours, to deal with this and
with the case where no files are synced.

Thanks for that. That obvious error eluded me because in most of the patch update testing I was doing (on ext3), the longest sync was always about the same length as the total sync time.

Attached patch (in correct diff form this time!) collects up all changes. That includes elimination of a potential race condition if someone changes log_checkpoints while a long sync phase is executing. I don't know whether that can happen, and it obviously won't give accurate stats going back to the beginning of the checkpoint in that case, but it tries to defend against producing complete garbage if that value changes out from under it.

This is the first version of this patch I feel fairly good about; no open concerns left on my side. Jeff, since you're now the de-facto credited reviewer of this one by virtue of suggesting bug fixes, could you take this update out for a spin too?

Combining this instrumentation patch with the backend sync one already
committed, the net result under log_min_messages=debug1 is somewhat
undesirable in that I can now see the individual sync times for the
syncs done by the checkpoint writer, but I do not get times for the
syncs done by the backend (I only get informed of their existence).

On a properly working system, backend syncs shouldn't be happening. So if you see them, I think the question shouldn't be "how long are they taking?", it's "how do I get rid of them?" From that perspective, knowing of their existence is sufficient to suggest the necessary tuning changes, such as dropping bgwriter_delay.

When you get into a situation where they are showing up, a whole lot of them can happen in a very brief period; enough that I'd actually be concerned about the added timing overhead, something I normally don't care about very much. That's the main reason I didn't bother instrumenting those too--the situations where they happen are ones already running low on resources.

Big writes for things that can only be written out at checkpoint time, on the other hand, are unavoidable in the current design. Providing detail on them is going to be relevant unless there's a major refactoring of how those happen.

--
Greg Smith   2ndQuadrant US    g...@2ndquadrant.com   Baltimore, MD
PostgreSQL Training, Services and Support        www.2ndQuadrant.us


diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ae9ed8f..6adc139 100644
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
*************** LogCheckpointEnd(bool restartpoint)
*** 6955,6964 ****
  {
  	long		write_secs,
  				sync_secs,
! 				total_secs;
  	int			write_usecs,
  				sync_usecs,
! 				total_usecs;
  
  	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
  
--- 6955,6969 ----
  {
  	long		write_secs,
  				sync_secs,
! 				total_secs,
! 				longest_secs,
! 				average_secs;
  	int			write_usecs,
  				sync_usecs,
! 				total_usecs,
! 				longest_usecs,
! 				average_usecs;
! 	double		average_sync_time;
  
  	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
  
*************** LogCheckpointEnd(bool restartpoint)
*** 6974,6991 ****
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
  	if (restartpoint)
  		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
  	else
  		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
  			 "%d transaction log file(s) added, %d removed, %d recycled; "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 CheckpointStats.ckpt_segs_added,
--- 6979,7017 ----
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
+ 	/*
+ 	 * Timing values returned from CheckpointStats are in milliseconds.
+ 	 * Convert to the second plus microsecond form that TimestampDifference
+ 	 * returns for homogeneous printing.
+ 	 */
+ 	longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000);
+ 	longest_usecs = 1000 * (CheckpointStats.ckpt_longest_sync -
+ 		longest_secs * 1000);
+ 
+ 	average_sync_time = 0;
+ 	if (CheckpointStats.ckpt_sync_rels > 0) 
+ 		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
+ 			CheckpointStats.ckpt_sync_rels;
+ 	average_secs = (long) (average_sync_time / 1000);
+ 	average_usecs = 1000 * (average_sync_time - average_secs * 1000);
+ 
  	if (restartpoint)
  		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
! 			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000,
! 			 CheckpointStats.ckpt_sync_rels,
! 			 longest_secs, longest_usecs / 1000,
! 			 average_secs, average_usecs / 1000);
  	else
  		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
  			 "%d transaction log file(s) added, %d removed, %d recycled; "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
! 			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 CheckpointStats.ckpt_segs_added,
*************** LogCheckpointEnd(bool restartpoint)
*** 6993,6999 ****
  			 CheckpointStats.ckpt_segs_recycled,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
  }
  
  /*
--- 7019,7028 ----
  			 CheckpointStats.ckpt_segs_recycled,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000,
! 			 CheckpointStats.ckpt_sync_rels,
! 			 longest_secs, longest_usecs / 1000,
! 			 average_secs, average_usecs / 1000);
  }
  
  /*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 604db21..1f602a7 100644
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 20,25 ****
--- 20,26 ----
  
  #include "catalog/catalog.h"
  #include "miscadmin.h"
+ #include "portability/instr_time.h"
  #include "postmaster/bgwriter.h"
  #include "storage/fd.h"
  #include "storage/bufmgr.h"
*************** mdsync(void)
*** 927,932 ****
--- 928,940 ----
  	PendingOperationEntry *entry;
  	int			absorb_counter;
  
+ 	/* Statistics on sync times */
+ 	int processed = 0;
+ 	instr_time sync_start, sync_end, sync_diff; 
+ 	double elapsed;
+ 	double longest = 0;
+ 	double total_elapsed = 0;
+ 
  	/*
  	 * This is only called during checkpoints, and checkpoints should only
  	 * occur in processes that have created a pendingOpsTable.
*************** mdsync(void)
*** 1069,1077 ****
--- 1077,1107 ----
  				seg = _mdfd_getseg(reln, entry->tag.forknum,
  							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
  								   false, EXTENSION_RETURN_NULL);
+ 
+ 				if (log_checkpoints)
+ 					INSTR_TIME_SET_CURRENT(sync_start);
+ 				else
+ 					INSTR_TIME_SET_ZERO(sync_start);
+ 
  				if (seg != NULL &&
  					FileSync(seg->mdfd_vfd) >= 0)
+ 				{
+ 					if (log_checkpoints && (! INSTR_TIME_IS_ZERO(sync_start)))
+ 					{
+ 						INSTR_TIME_SET_CURRENT(sync_end);
+ 						sync_diff = sync_end;
+ 						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+ 						elapsed = (double) INSTR_TIME_GET_MICROSEC(sync_diff) / 1000.0;
+ 						if (elapsed > longest)
+ 							longest = elapsed;
+ 						total_elapsed += elapsed;
+ 						processed++;
+ 						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%f msec", 
+ 							processed, FilePathName(seg->mdfd_vfd), elapsed);
+ 					}
+ 
  					break;		/* success; break out of retry loop */
+ 				}
  
  				/*
  				 * XXX is there any point in allowing more than one retry?
*************** mdsync(void)
*** 1113,1118 ****
--- 1143,1153 ----
  			elog(ERROR, "pendingOpsTable corrupted");
  	}							/* end loop over hashtable entries */
  
+ 	/* Return sync performance metrics for report at checkpoint end */
+ 	CheckpointStats.ckpt_sync_rels = processed;
+ 	CheckpointStats.ckpt_longest_sync = longest;
+ 	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+ 
  	/* Flag successful completion of mdsync */
  	mdsync_in_progress = false;
  }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index fa7ae2a..d95f843 100644
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
*************** typedef struct CheckpointStatsData
*** 257,262 ****
--- 257,269 ----
  	int			ckpt_segs_added;	/* # of new xlog segments created */
  	int			ckpt_segs_removed;		/* # of xlog segments deleted */
  	int			ckpt_segs_recycled;		/* # of xlog segments recycled */
+ 
+ 	int			ckpt_sync_rels;		/* # of relations synced */
+ 	double		ckpt_longest_sync;	/* Longest sync for one relation */
+ 	double		ckpt_agg_sync_time;	/* The sum of all the individual sync
+ 									   times, which is not necessarily the
+ 									   same as the total elapsed time for
+ 									   the entire sync phase. */
  } CheckpointStatsData;
  
  extern CheckpointStatsData CheckpointStats;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to