Hi, I have fixed this patch and submitted it in CF4.
My previous patch had a significant bug: a mistaken calculation of the offset passed to posix_fadvise() :-( However, it worked without problems in pgbench, because pgbench transactions are always random access... I have now tested my patch with the DBT-2 benchmark. The results are as follows. * Test server Server: HP Proliant DL360 G7 CPU: Xeon E5640 2.66GHz (1P/4C) Memory: 18GB(PC3-10600R-9) Disk: 146GB(15k)*4 RAID1+0 RAID controller: P410i/256MB OS: RHEL 6.4(x86_64) FS: Ext4 * DBT-2 result(WH400, SESSION=100, ideal_score=5160) Method | score | average | 90%tile | Maximum ------------------------------------------------ plain | 3589 | 9.751 | 33.680 | 87.8036 option=off | 3670 | 9.107 | 34.267 | 79.3773 option=on | 4222 | 5.140 | 7.619 | 102.473 "option" is the "readahead_strategy" option, and "on" is my proposed setting. "average", "90%tile", and "Maximum" are latencies. Average latency is 2 times better than plain! * Detailed results (uploading now; please allow about an hour...) [plain] http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/normal_20140109/HTML/index_thput.html [option=off] http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/readahead_off_20140109/HTML/index_thput.html [option=on] http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/readahead_on_20140109/HTML/index_thput.html We can see some very slow latencies in the test of my proposed method. About 20% of the transactions are active, while about 80% of the transactions are slow. It might be the Pareto principle at work during CHECKPOINT ;-) #That's a joke. I will test the performance of some join SQLs and the TPC-3 benchmark this week or next. Regards, -- Mitsumasa KONDO NTT Open Source Software Center
*** a/configure --- b/configure *************** *** 11303,11309 **** fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` ! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" --- 11303,11309 ---- LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` ! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fadvise pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" *** a/contrib/pg_prewarm/pg_prewarm.c --- b/contrib/pg_prewarm/pg_prewarm.c *************** *** 179,185 **** pg_prewarm(PG_FUNCTION_ARGS) */ for (block = first_block; block <= last_block; ++block) { ! smgrread(rel->rd_smgr, forkNumber, block, blockbuffer); ++blocks_done; } } --- 179,185 ---- */ for (block = first_block; block <= last_block; ++block) { ! smgrread(rel->rd_smgr, forkNumber, block, blockbuffer, BAS_BULKREAD); ++blocks_done; } } *** a/doc/src/sgml/config.sgml --- b/doc/src/sgml/config.sgml *************** *** 1293,1298 **** include 'filename' --- 1293,1322 ---- </listitem> </varlistentry> + <varlistentry id="guc-readahead-strategy" xreflabel="readahead_strategy"> + <term><varname>readahead_strategy</varname> (<type>integer</type>)</term> + <indexterm> + <primary><varname>readahead_strategy</>configuration parameter</primary> + </indexterm> + <listitem> + <para> + This feature is to select which readahead strategy is used. When we + set off(default), readahead strategy is optimized by OS. 
On the other + hands, when we set on, readahead strategy is optimized by Postgres. + In typicaly situations, OS readahead strategy will be good working, + however Postgres often knows better readahead strategy before + executing disk access. For example, we can easy to predict access + pattern when we input SQLs, because planner of postgres decides + efficient access pattern to read faster. And it might be random access + pattern or sequential access pattern. It will be less disk IO and more + efficient to use file cache in OS. It will be better performance. + However this optimization is not complete now, so it is necessary to + choose it carefully in considering situations. Default setting is off + that is optimized by OS, and whenever it can change it. + </para> + </listitem> + </varlistentry> + </variablelist> </sect2> *** a/src/backend/commands/tablecmds.c --- b/src/backend/commands/tablecmds.c *************** *** 9125,9131 **** copy_relation_data(SMgrRelation src, SMgrRelation dst, /* If we got a cancel signal during the copy of the data, quit */ CHECK_FOR_INTERRUPTS(); ! smgrread(src, forkNum, blkno, buf); if (!PageIsVerified(page, blkno)) ereport(ERROR, --- 9125,9131 ---- /* If we got a cancel signal during the copy of the data, quit */ CHECK_FOR_INTERRUPTS(); ! smgrread(src, forkNum, blkno, buf, (char *) BAS_BULKREAD); if (!PageIsVerified(page, blkno)) ereport(ERROR, *** a/src/backend/storage/buffer/bufmgr.c --- b/src/backend/storage/buffer/bufmgr.c *************** *** 41,46 **** --- 41,47 ---- #include "pg_trace.h" #include "pgstat.h" #include "postmaster/bgwriter.h" + #include "storage/buf.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/ipc.h" *************** *** 451,457 **** ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); ! 
smgrread(smgr, forkNum, blockNum, (char *) bufBlock); if (track_io_timing) { --- 452,458 ---- if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); ! smgrread(smgr, forkNum, blockNum, (char *) bufBlock, (char *) strategy); if (track_io_timing) { *** a/src/backend/storage/file/fd.c --- b/src/backend/storage/file/fd.c *************** *** 73,80 **** --- 73,82 ---- #include "catalog/pg_tablespace.h" #include "common/relpath.h" #include "pgstat.h" + #include "storage/buf.h" #include "storage/fd.h" #include "storage/ipc.h" + #include "storage/bufmgr.h" #include "utils/guc.h" #include "utils/resowner_private.h" *************** *** 123,129 **** int max_files_per_process = 1000; * setting this variable, and so need not be tested separately. */ int max_safe_fds = 32; /* default if not changed */ ! /* Debugging.... */ --- 125,131 ---- * setting this variable, and so need not be tested separately. */ int max_safe_fds = 32; /* default if not changed */ ! bool readahead_strategy = false ; /* Debugging.... */ *************** *** 383,388 **** pg_flush_data(int fd, off_t offset, off_t amount) --- 385,405 ---- return 0; } + /* + * pg_fadvise --- advise OS that the cache will need or not + * + * Not all platforms have posix_fadvise. If it does not support posix_fadvise, + * we do nothing about here. 
+ */ + int + pg_fadvise(int fd, off_t offset, off_t amount, int advise) + { + #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) && defined(POSIX_FADV_RANDOM) && defined(POSIX_FADV_SEQUENTIAL) + return posix_fadvise(fd, offset, amount, advise); + #else + return 0; + #endif + } /* * fsync_fname -- fsync a file or directory, handling errors properly *************** *** 1142,1147 **** OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) --- 1159,1201 ---- } /* + * Controling OS file cache using posix_fadvise() + */ + int + FileCacheAdvise(File file, off_t offset, off_t amount, int advise) + { + return pg_fadvise(VfdCache[file].fd, offset, amount, advise); + } + + /* + * Select OS readahead strategy using buffer hint in postgres. If we select POSIX_FADV_RANDOM, + * readahead isn't executed at all and file cache replace algorithm in OS will be more smart. + * Because it can calculate correct number of accesses which are hot data. + */ + int + BufferHintIOAdvise(File file, off_t offset, off_t amount, char *strategy) + { + #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) && defined(POSIX_FADV_RANDOM) + if(!readahead_strategy) + return FileCacheAdvise(file, offset, amount, POSIX_FADV_NORMAL); + + /* readahead optimization */ + if(strategy != NULL) + { + /* use normal readahead setting, we confirmed POSIX_FADV_NORMAL is faster than FADV_SEQUENTIAL in linux. */ + return FileCacheAdvise(file, offset, amount, POSIX_FADV_NORMAL); + } + else + { + /* don't use readahead in kernel, so we can more effectively use OS file cache */ + return FileCacheAdvise(file, offset, amount, POSIX_FADV_RANDOM); + } + #else + return 0; + #endif + } + + /* * close a file when done with it */ void *** a/src/backend/storage/smgr/md.c --- b/src/backend/storage/smgr/md.c *************** *** 653,659 **** mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) */ void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ! 
char *buffer) { off_t seekpos; int nbytes; --- 653,659 ---- */ void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ! char *buffer, char *strategy) { off_t seekpos; int nbytes; *************** *** 677,682 **** mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, --- 677,684 ---- errmsg("could not seek to block %u in file \"%s\": %m", blocknum, FilePathName(v->mdfd_vfd)))); + /* Control buffered IO in OS by using posix_fadvise() */ + BufferHintIOAdvise(v->mdfd_vfd, seekpos, BLCKSZ, strategy); nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ); TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, *** a/src/backend/storage/smgr/smgr.c --- b/src/backend/storage/smgr/smgr.c *************** *** 50,56 **** typedef struct f_smgr void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, ! BlockNumber blocknum, char *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); --- 50,56 ---- void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, ! BlockNumber blocknum, char *buffer, char *strategy); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); *************** *** 588,596 **** smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) */ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ! char *buffer) { ! (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer); } /* --- 588,596 ---- */ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ! char *buffer, char *strategy) { ! 
(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer, strategy); } /* *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 769,774 **** static struct config_bool ConfigureNamesBool[] = --- 769,783 ---- NULL, NULL, NULL }, { + {"readahead_strategy", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("off is optimized readahead by kernel, on is optimized by postgres."), + NULL + }, + &readahead_strategy, + false, + NULL, NULL, NULL + }, + { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, gettext_noop("Enables genetic query optimization."), gettext_noop("This algorithm attempts to do planning without " *** a/src/backend/utils/misc/postgresql.conf.sample --- b/src/backend/utils/misc/postgresql.conf.sample *************** *** 138,143 **** --- 138,145 ---- #temp_file_limit = -1 # limits per-session temp file space # in kB, or -1 for no limit + #readahead_strategy = off # off is optimized by OS, + # on is optimized by postgres # - Kernel Resource Usage - *** a/src/include/storage/bufmgr.h --- b/src/include/storage/bufmgr.h *************** *** 50,55 **** extern int bgwriter_lru_maxpages; --- 50,56 ---- extern double bgwriter_lru_multiplier; extern bool track_io_timing; extern int target_prefetch_pages; + extern bool readahead_strategy; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; *** a/src/include/storage/fd.h --- b/src/include/storage/fd.h *************** *** 68,73 **** extern int max_safe_fds; --- 68,74 ---- extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode); extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); + extern int FileCacheAdvise(File file, off_t offset, off_t amount, int advise); extern int FilePrefetch(File file, off_t offset, int amount); extern int FileRead(File file, char *buffer, int amount); extern int FileWrite(File file, char *buffer, int amount); *************** *** 75,80 **** extern int FileSync(File file); --- 76,82 ---- extern off_t 
FileSeek(File file, off_t offset, int whence); extern int FileTruncate(File file, off_t offset); extern char *FilePathName(File file); + extern int BufferHintIOAdvise(File file, off_t offset, off_t amount, char *strategy); /* Operations that allow use of regular stdio --- USE WITH CAUTION */ extern FILE *AllocateFile(const char *name, const char *mode); *************** *** 113,118 **** extern int pg_fsync_no_writethrough(int fd); --- 115,121 ---- extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); + extern int pg_fadvise(int fd, off_t offset, off_t amount, int advise); extern void fsync_fname(char *fname, bool isdir); /* Filename components for OpenTemporaryFile */ *** a/src/include/storage/smgr.h --- b/src/include/storage/smgr.h *************** *** 92,98 **** extern void smgrextend(SMgrRelation reln, ForkNumber forknum, extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, ! BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); --- 92,98 ---- extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, ! BlockNumber blocknum, char *buffer, char *strategy); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); *************** *** 118,124 **** extern void mdextend(SMgrRelation reln, ForkNumber forknum, extern void mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ! 
char *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); --- 118,124 ---- extern void mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ! char *buffer, char *strategy); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers