Hi,

I have fixed this patch and am submitting it to CF4.

My previous patch had a significant bug: the offset passed to
posix_fadvise() was calculated incorrectly :-( However, it worked without
problems in pgbench, because pgbench transactions are always random access...

I also tested my patch with the DBT-2 benchmark. The results are as follows.

* Test server
  Server: HP Proliant DL360 G7
  CPU:    Xeon E5640 2.66GHz (1P/4C)
  Memory: 18GB(PC3-10600R-9)
  Disk:   146GB(15k)*4 RAID1+0
  RAID controller: P410i/256MB
  OS: RHEL 6.4(x86_64)
  FS: Ext4


* DBT-2 result(WH400, SESSION=100, ideal_score=5160)
Method     | score | average | 90%tile | Maximum
------------------------------------------------
plain      | 3589  | 9.751   | 33.680  | 87.8036
option=off | 3670  | 9.107   | 34.267  | 79.3773
option=on  | 4222  | 5.140   | 7.619   | 102.473

 "option" is "readahead_strategy" option, and "on" is my proposed.
 "average", "90%tile", and Maximum represent latency.
 Average latency is almost half that of plain (about 1.9 times faster)!


* Detailed results (uploading now; please wait about an hour...)
[plain]
http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/normal_20140109/HTML/index_thput.html
[option=off]
http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/readahead_off_20140109/HTML/index_thput.html
[option=on]
http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/readahead_on_20140109/HTML/index_thput.html

We can see periods of very slow latency in the test of my proposed method.
The active-transaction part is about 20%, and the slow-transaction part is about 80%.
It might be the Pareto principle at work in CHECKPOINT ;-)
# That's a joke.

I will test the performance of some join queries and the TPC-3 benchmark this week or next.

Regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
*** a/configure
--- b/configure
***************
*** 11303,11309 **** fi
  LIBS_including_readline="$LIBS"
  LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
  
! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
  do :
    as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
  ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
--- 11303,11309 ----
  LIBS_including_readline="$LIBS"
  LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
  
! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fadvise pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
  do :
    as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
  ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
*** a/contrib/pg_prewarm/pg_prewarm.c
--- b/contrib/pg_prewarm/pg_prewarm.c
***************
*** 179,185 **** pg_prewarm(PG_FUNCTION_ARGS)
  		 */
  		for (block = first_block; block <= last_block; ++block)
  		{
! 			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer);
  			++blocks_done;
  		}
  	}
--- 179,185 ----
  		 */
  		for (block = first_block; block <= last_block; ++block)
  		{
! 			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer, BAS_BULKREAD);
  			++blocks_done;
  		}
  	}
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1293,1298 **** include 'filename'
--- 1293,1322 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-readahead-strategy" xreflabel="readahead_strategy">
+       <term><varname>readahead_strategy</varname> (<type>boolean</type>)</term>
+       <indexterm>
+        <primary><varname>readahead_strategy</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         This parameter selects which readahead strategy is used. When it
+         is off (the default), readahead is left to the OS. When it is on,
+         readahead is controlled by Postgres.
+         In typical situations, the OS readahead strategy works well;
+         however, Postgres often knows a better readahead strategy before
+         it performs disk access. For example, the access pattern can be
+         predicted from the SQL being executed, because the Postgres planner
+         chooses an efficient access pattern, which may be either random or
+         sequential access. Using that knowledge can reduce disk I/O and make
+         more efficient use of the OS file cache, improving performance.
+         However, this optimization is not yet complete, so choose the
+         setting carefully for your workload. The default is off
+         (optimized by the OS), and the setting can be changed at any time.
+        </para>
+       </listitem>
+      </varlistentry> 
+ 
       </variablelist>
       </sect2>
  
*** a/src/backend/commands/tablecmds.c
--- b/src/backend/commands/tablecmds.c
***************
*** 9125,9131 **** copy_relation_data(SMgrRelation src, SMgrRelation dst,
  		/* If we got a cancel signal during the copy of the data, quit */
  		CHECK_FOR_INTERRUPTS();
  
! 		smgrread(src, forkNum, blkno, buf);
  
  		if (!PageIsVerified(page, blkno))
  			ereport(ERROR,
--- 9125,9131 ----
  		/* If we got a cancel signal during the copy of the data, quit */
  		CHECK_FOR_INTERRUPTS();
  
! 		smgrread(src, forkNum, blkno, buf, (char *) BAS_BULKREAD);
  
  		if (!PageIsVerified(page, blkno))
  			ereport(ERROR,
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 41,46 ****
--- 41,47 ----
  #include "pg_trace.h"
  #include "pgstat.h"
  #include "postmaster/bgwriter.h"
+ #include "storage/buf.h"
  #include "storage/buf_internals.h"
  #include "storage/bufmgr.h"
  #include "storage/ipc.h"
***************
*** 451,457 **** ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  			if (track_io_timing)
  				INSTR_TIME_SET_CURRENT(io_start);
  
! 			smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
  
  			if (track_io_timing)
  			{
--- 452,458 ----
  			if (track_io_timing)
  				INSTR_TIME_SET_CURRENT(io_start);
  
! 			smgrread(smgr, forkNum, blockNum, (char *) bufBlock, (char *) strategy);
  
  			if (track_io_timing)
  			{
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 73,80 ****
--- 73,82 ----
  #include "catalog/pg_tablespace.h"
  #include "common/relpath.h"
  #include "pgstat.h"
+ #include "storage/buf.h"
  #include "storage/fd.h"
  #include "storage/ipc.h"
+ #include "storage/bufmgr.h"
  #include "utils/guc.h"
  #include "utils/resowner_private.h"
  
***************
*** 123,129 **** int			max_files_per_process = 1000;
   * setting this variable, and so need not be tested separately.
   */
  int			max_safe_fds = 32;	/* default if not changed */
! 
  
  /* Debugging.... */
  
--- 125,131 ----
   * setting this variable, and so need not be tested separately.
   */
  int			max_safe_fds = 32;	/* default if not changed */
! bool			readahead_strategy = false ;
  
  /* Debugging.... */
  
***************
*** 383,388 **** pg_flush_data(int fd, off_t offset, off_t amount)
--- 385,405 ----
  	return 0;
  }
  
+ /*
+  * pg_fadvise --- pass a cache-usage hint for a file range to the OS
+  *
+  * Returns the result of posix_fadvise(fd, offset, amount, advise). Where the
+  * build lacks posix_fadvise or the POSIX_FADV_* constants, returns 0 (no-op).
+  */
+ int
+ pg_fadvise(int fd, off_t offset, off_t amount, int advise)
+ {
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) && defined(POSIX_FADV_RANDOM) && defined(POSIX_FADV_SEQUENTIAL)
+ 	return posix_fadvise(fd, offset, amount, advise);
+ #else
+ 	return 0;
+ #endif
+ }
  
  /*
   * fsync_fname -- fsync a file or directory, handling errors properly
***************
*** 1142,1147 **** OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
--- 1159,1201 ----
  }
  
  /*
+  * Controlling the OS file cache: forwards the hint to pg_fadvise() on VfdCache[file].fd
+  */
+ int
+ FileCacheAdvise(File file, off_t offset, off_t amount, int advise)
+ {
+ 	return pg_fadvise(VfdCache[file].fd, offset, amount, advise);
+ }
+ 
+ /*
+  * Select the OS readahead advice from the Postgres buffer-access hint. With readahead_strategy
+  * off, or with a non-NULL strategy, POSIX_FADV_NORMAL is advised; with a NULL strategy,
+  * POSIX_FADV_RANDOM is advised, which is intended to stop kernel readahead for that access.
+  */
+ int
+ BufferHintIOAdvise(File file, off_t offset, off_t amount, char *strategy)
+ {
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) && defined(POSIX_FADV_RANDOM)
+ 	if(!readahead_strategy)
+ 		return FileCacheAdvise(file, offset, amount, POSIX_FADV_NORMAL);
+ 
+ 	/* readahead optimization is enabled: choose the advice from the strategy hint */
+ 	if(strategy != NULL)
+ 	{
+ 		/* keep the normal readahead setting; we confirmed POSIX_FADV_NORMAL is faster than FADV_SEQUENTIAL on Linux. */
+ 		return FileCacheAdvise(file, offset, amount, POSIX_FADV_NORMAL);
+ 	}
+ 	else
+ 	{
+ 		/* don't use readahead in the kernel, so that the OS file cache is used more effectively */
+ 		return FileCacheAdvise(file, offset, amount, POSIX_FADV_RANDOM);
+ 	}
+ #else
+ 	return 0;
+ #endif
+ }
+ 
+ /*
   * close a file when done with it
   */
  void
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 653,659 **** mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
   */
  void
  mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer)
  {
  	off_t		seekpos;
  	int			nbytes;
--- 653,659 ----
   */
  void
  mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer, char *strategy)
  {
  	off_t		seekpos;
  	int			nbytes;
***************
*** 677,682 **** mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
--- 677,684 ----
  				 errmsg("could not seek to block %u in file \"%s\": %m",
  						blocknum, FilePathName(v->mdfd_vfd))));
  
+ 	/* Control buffered IO in OS by using posix_fadvise() */
+ 	BufferHintIOAdvise(v->mdfd_vfd, seekpos, BLCKSZ, strategy);
  	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
  
  	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
*** a/src/backend/storage/smgr/smgr.c
--- b/src/backend/storage/smgr/smgr.c
***************
*** 50,56 **** typedef struct f_smgr
  	void		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
  											  BlockNumber blocknum);
  	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
! 										  BlockNumber blocknum, char *buffer);
  	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
  						 BlockNumber blocknum, char *buffer, bool skipFsync);
  	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
--- 50,56 ----
  	void		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
  											  BlockNumber blocknum);
  	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
! 					  BlockNumber blocknum, char *buffer, char *strategy);
  	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
  						 BlockNumber blocknum, char *buffer, bool skipFsync);
  	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
***************
*** 588,596 **** smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
   */
  void
  smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 		 char *buffer)
  {
! 	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
  }
  
  /*
--- 588,596 ----
   */
  void
  smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 		 char *buffer, char *strategy)
  {
! 	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer, strategy);
  }
  
  /*
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 769,774 **** static struct config_bool ConfigureNamesBool[] =
--- 769,783 ----
  		NULL, NULL, NULL
  	},
  	{
+ 		{"readahead_strategy", PGC_USERSET, QUERY_TUNING_METHOD,
+ 			gettext_noop("off is optimized readahead by kernel, on is optimized by postgres."),
+ 			NULL
+ 		},
+ 		&readahead_strategy,
+ 		false,
+ 		NULL, NULL, NULL
+ 	},
+ 	{
  		{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
  			gettext_noop("Enables genetic query optimization."),
  			gettext_noop("This algorithm attempts to do planning without "
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 138,143 ****
--- 138,145 ----
  
  #temp_file_limit = -1			# limits per-session temp file space
  					# in kB, or -1 for no limit
+ #readahead_strategy = off		# off is optimized by OS,
+ 					# on is optimized by postgres
  
  # - Kernel Resource Usage -
  
*** a/src/include/storage/bufmgr.h
--- b/src/include/storage/bufmgr.h
***************
*** 50,55 **** extern int	bgwriter_lru_maxpages;
--- 50,56 ----
  extern double bgwriter_lru_multiplier;
  extern bool track_io_timing;
  extern int	target_prefetch_pages;
+ extern bool	readahead_strategy;
  
  /* in buf_init.c */
  extern PGDLLIMPORT char *BufferBlocks;
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 68,73 **** extern int	max_safe_fds;
--- 68,74 ----
  extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
  extern File OpenTemporaryFile(bool interXact);
  extern void FileClose(File file);
+ extern int	FileCacheAdvise(File file, off_t offset, off_t amount, int advise);
  extern int	FilePrefetch(File file, off_t offset, int amount);
  extern int	FileRead(File file, char *buffer, int amount);
  extern int	FileWrite(File file, char *buffer, int amount);
***************
*** 75,80 **** extern int	FileSync(File file);
--- 76,82 ----
  extern off_t FileSeek(File file, off_t offset, int whence);
  extern int	FileTruncate(File file, off_t offset);
  extern char *FilePathName(File file);
+ extern int	BufferHintIOAdvise(File file, off_t offset, off_t amount, char *strategy);
  
  /* Operations that allow use of regular stdio --- USE WITH CAUTION */
  extern FILE *AllocateFile(const char *name, const char *mode);
***************
*** 113,118 **** extern int	pg_fsync_no_writethrough(int fd);
--- 115,121 ----
  extern int	pg_fsync_writethrough(int fd);
  extern int	pg_fdatasync(int fd);
  extern int	pg_flush_data(int fd, off_t offset, off_t amount);
+ extern int	pg_fadvise(int fd, off_t offset, off_t amount, int advise);
  extern void fsync_fname(char *fname, bool isdir);
  
  /* Filename components for OpenTemporaryFile */
*** a/src/include/storage/smgr.h
--- b/src/include/storage/smgr.h
***************
*** 92,98 **** extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
  extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
  			 BlockNumber blocknum);
  extern void smgrread(SMgrRelation reln, ForkNumber forknum,
! 		 BlockNumber blocknum, char *buffer);
  extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
  		  BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
--- 92,98 ----
  extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
  			 BlockNumber blocknum);
  extern void smgrread(SMgrRelation reln, ForkNumber forknum,
! 			BlockNumber blocknum, char *buffer, char *strategy);
  extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
  		  BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
***************
*** 118,124 **** extern void mdextend(SMgrRelation reln, ForkNumber forknum,
  extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
  		   BlockNumber blocknum);
  extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer);
  extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
  		BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
--- 118,124 ----
  extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
  		   BlockNumber blocknum);
  extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer, char *strategy);
  extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
  		BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to