Hi,
I have fixed the patch and am submitting it for CF4.
My previous patch contained a significant bug: a miscalculation of the
offset passed to posix_fadvise() :-( It nevertheless ran without visible problems in pgbench,
because pgbench transactions are always random access...
I have also tested my patch with the DBT-2 benchmark. The results are as follows.
* Test server
Server: HP Proliant DL360 G7
CPU: Xeon E5640 2.66GHz (1P/4C)
Memory: 18GB(PC3-10600R-9)
Disk: 146GB(15k)*4 RAID1+0
RAID controller: P410i/256MB
OS: RHEL 6.4(x86_64)
FS: Ext4
* DBT-2 result(WH400, SESSION=100, ideal_score=5160)
Method | score | average | 90%tile | Maximum
------------------------------------------------
plain | 3589 | 9.751 | 33.680 | 87.8036
option=off | 3670 | 9.107 | 34.267 | 79.3773
option=on | 4222 | 5.140 | 7.619 | 102.473
"option" is the "readahead_strategy" option, and "on" is my proposed method.
"average", "90%tile", and "Maximum" are latency figures.
Average latency is two times better than plain!
* Detailed results (uploading now; please wait about an hour...)
[plain]
http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/normal_20140109/HTML/index_thput.html
[option=off]
http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/readahead_off_20140109/HTML/index_thput.html
[option=on]
http://pgstatsinfo.projects.pgfoundry.org/readahead_dbt2/readahead_on_20140109/HTML/index_thput.html
We can see a fraction of very slow latencies in the test of my proposed method.
Active transactions account for about 20%, while slow transactions account for about 80%.
It might be the Pareto principle at work during CHECKPOINT ;-)
(That's a joke.)
I will test the performance of some join SQLs and the TPC-3 benchmark this week or next.
Regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
*** a/configure
--- b/configure
***************
*** 11303,11309 **** fi
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
--- 11303,11309 ----
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fadvise pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
*** a/contrib/pg_prewarm/pg_prewarm.c
--- b/contrib/pg_prewarm/pg_prewarm.c
***************
*** 179,185 **** pg_prewarm(PG_FUNCTION_ARGS)
*/
for (block = first_block; block <= last_block; ++block)
{
! smgrread(rel->rd_smgr, forkNumber, block, blockbuffer);
++blocks_done;
}
}
--- 179,185 ----
*/
for (block = first_block; block <= last_block; ++block)
{
! smgrread(rel->rd_smgr, forkNumber, block, blockbuffer, BAS_BULKREAD);
++blocks_done;
}
}
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1293,1298 **** include 'filename'
--- 1293,1322 ----
</listitem>
</varlistentry>
+ <varlistentry id="guc-readahead-strategy" xreflabel="readahead_strategy">
+ <term><varname>readahead_strategy</varname> (<type>integer</type>)</term>
+ <indexterm>
+ <primary><varname>readahead_strategy</>configuration parameter</primary>
+ </indexterm>
+ <listitem>
+ <para>
+ This parameter selects which readahead strategy is used. When it is
+ set to off (the default), readahead is left to the OS; when it is
+ set to on, readahead is controlled by PostgreSQL. In typical
+ situations the OS readahead strategy works well; however, PostgreSQL
+ often knows the better readahead strategy before executing disk
+ access, because the planner has already decided on an efficient
+ access pattern for each query, which may be either random or
+ sequential. Using this knowledge can reduce disk I/O and make more
+ effective use of the OS file cache, leading to better performance.
+ However, this optimization is not yet complete, so the setting
+ should be chosen carefully for each workload. The default is off
+ (optimized by the OS), and the setting can be changed at any time.
+ </para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</sect2>
*** a/src/backend/commands/tablecmds.c
--- b/src/backend/commands/tablecmds.c
***************
*** 9125,9131 **** copy_relation_data(SMgrRelation src, SMgrRelation dst,
/* If we got a cancel signal during the copy of the data, quit */
CHECK_FOR_INTERRUPTS();
! smgrread(src, forkNum, blkno, buf);
if (!PageIsVerified(page, blkno))
ereport(ERROR,
--- 9125,9131 ----
/* If we got a cancel signal during the copy of the data, quit */
CHECK_FOR_INTERRUPTS();
! smgrread(src, forkNum, blkno, buf, (char *) BAS_BULKREAD);
if (!PageIsVerified(page, blkno))
ereport(ERROR,
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 41,46 ****
--- 41,47 ----
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+ #include "storage/buf.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
***************
*** 451,457 **** ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
! smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
if (track_io_timing)
{
--- 452,458 ----
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
! smgrread(smgr, forkNum, blockNum, (char *) bufBlock, (char *) strategy);
if (track_io_timing)
{
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 73,80 ****
--- 73,82 ----
#include "catalog/pg_tablespace.h"
#include "common/relpath.h"
#include "pgstat.h"
+ #include "storage/buf.h"
#include "storage/fd.h"
#include "storage/ipc.h"
+ #include "storage/bufmgr.h"
#include "utils/guc.h"
#include "utils/resowner_private.h"
***************
*** 123,129 **** int max_files_per_process = 1000;
* setting this variable, and so need not be tested separately.
*/
int max_safe_fds = 32; /* default if not changed */
!
/* Debugging.... */
--- 125,131 ----
* setting this variable, and so need not be tested separately.
*/
int max_safe_fds = 32; /* default if not changed */
! bool readahead_strategy = false ;
/* Debugging.... */
***************
*** 383,388 **** pg_flush_data(int fd, off_t offset, off_t amount)
--- 385,405 ----
return 0;
}
+ /*
+  * pg_fadvise --- pass access-pattern advice for a file range to the OS
+  *
+  * Thin wrapper around posix_fadvise().  On platforms that lack
+  * posix_fadvise() or the advice constants we actually use
+  * (POSIX_FADV_NORMAL and POSIX_FADV_RANDOM), this is a no-op that
+  * reports success.
+  *
+  * Returns 0 on success, else an error code as posix_fadvise() does.
+  */
+ int
+ pg_fadvise(int fd, off_t offset, off_t amount, int advise)
+ {
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_RANDOM)
+ 	return posix_fadvise(fd, offset, amount, advise);
+ #else
+ 	return 0;
+ #endif
+ }
/*
* fsync_fname -- fsync a file or directory, handling errors properly
***************
*** 1142,1147 **** OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
--- 1159,1201 ----
}
/*
+  * FileCacheAdvise --- apply posix_fadvise() advice to a virtual File
+  *
+  * Follows the same pattern as FilePrefetch(): make sure the vfd is
+  * actually open before touching VfdCache[file].fd, since the virtual
+  * file descriptor machinery may have closed the real fd.
+  */
+ int
+ FileCacheAdvise(File file, off_t offset, off_t amount, int advise)
+ {
+ 	int		returnCode;
+
+ 	Assert(FileIsValid(file));
+
+ 	returnCode = FileAccess(file);
+ 	if (returnCode < 0)
+ 		return returnCode;
+
+ 	return pg_fadvise(VfdCache[file].fd, offset, amount, advise);
+ }
+
+ /*
+  * BufferHintIOAdvise --- choose an OS readahead strategy from a buffer
+  * access strategy hint.
+  *
+  * When a buffer access strategy is supplied (bulk reads etc.), the
+  * access pattern is expected to be sequential, so keep the kernel's
+  * normal readahead (we measured POSIX_FADV_NORMAL as faster than
+  * POSIX_FADV_SEQUENTIAL on Linux).  With no strategy, access is
+  * expected to be random, so disable kernel readahead with
+  * POSIX_FADV_RANDOM; this avoids reading pages we will not use and
+  * lets the OS cache replacement track hot data more accurately.
+  *
+  * When the readahead_strategy GUC is off, we always request
+  * POSIX_FADV_NORMAL and leave the decision to the OS.
+  *
+  * NOTE(review): "strategy" is declared char * but callers pass either a
+  * BufferAccessStrategy pointer or a casted BufferAccessStrategyType
+  * enum value (e.g. (char *) BAS_BULKREAD); only its NULL-ness is used
+  * here, but those casts are type-unsafe -- consider declaring the
+  * parameter as BufferAccessStrategy instead.
+  */
+ int
+ BufferHintIOAdvise(File file, off_t offset, off_t amount, char *strategy)
+ {
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_RANDOM)
+ 	int		advise;
+
+ 	if (!readahead_strategy || strategy != NULL)
+ 		advise = POSIX_FADV_NORMAL;	/* OS-managed, or sequential hint */
+ 	else
+ 		advise = POSIX_FADV_RANDOM;	/* no strategy: expect random access */
+
+ 	return FileCacheAdvise(file, offset, amount, advise);
+ #else
+ 	return 0;
+ #endif
+ }
+
+ /*
* close a file when done with it
*/
void
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 653,659 **** mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
*/
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! char *buffer)
{
off_t seekpos;
int nbytes;
--- 653,659 ----
*/
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! char *buffer, char *strategy)
{
off_t seekpos;
int nbytes;
***************
*** 677,682 **** mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
--- 677,684 ----
errmsg("could not seek to block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
+ /* Control buffered IO in OS by using posix_fadvise() */
+ BufferHintIOAdvise(v->mdfd_vfd, seekpos, BLCKSZ, strategy);
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
*** a/src/backend/storage/smgr/smgr.c
--- b/src/backend/storage/smgr/smgr.c
***************
*** 50,56 **** typedef struct f_smgr
void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
! BlockNumber blocknum, char *buffer);
void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
--- 50,56 ----
void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
! BlockNumber blocknum, char *buffer, char *strategy);
void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
***************
*** 588,596 **** smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
*/
void
smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! char *buffer)
{
! (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
}
/*
--- 588,596 ----
*/
void
smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! char *buffer, char *strategy)
{
! (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer, strategy);
}
/*
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 769,774 **** static struct config_bool ConfigureNamesBool[] =
--- 769,783 ----
NULL, NULL, NULL
},
{
+ {"readahead_strategy", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("off is optimized readahead by kernel, on is optimized by postgres."),
+ NULL
+ },
+ &readahead_strategy,
+ false,
+ NULL, NULL, NULL
+ },
+ {
{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
gettext_noop("Enables genetic query optimization."),
gettext_noop("This algorithm attempts to do planning without "
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 138,143 ****
--- 138,145 ----
#temp_file_limit = -1 # limits per-session temp file space
# in kB, or -1 for no limit
+ #readahead_strategy = off # off is optimized by OS,
+ # on is optimized by postgres
# - Kernel Resource Usage -
*** a/src/include/storage/bufmgr.h
--- b/src/include/storage/bufmgr.h
***************
*** 50,55 **** extern int bgwriter_lru_maxpages;
--- 50,56 ----
extern double bgwriter_lru_multiplier;
extern bool track_io_timing;
extern int target_prefetch_pages;
+ extern bool readahead_strategy;
/* in buf_init.c */
extern PGDLLIMPORT char *BufferBlocks;
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 68,73 **** extern int max_safe_fds;
--- 68,74 ----
extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
extern File OpenTemporaryFile(bool interXact);
extern void FileClose(File file);
+ extern int FileCacheAdvise(File file, off_t offset, off_t amount, int advise);
extern int FilePrefetch(File file, off_t offset, int amount);
extern int FileRead(File file, char *buffer, int amount);
extern int FileWrite(File file, char *buffer, int amount);
***************
*** 75,80 **** extern int FileSync(File file);
--- 76,82 ----
extern off_t FileSeek(File file, off_t offset, int whence);
extern int FileTruncate(File file, off_t offset);
extern char *FilePathName(File file);
+ extern int BufferHintIOAdvise(File file, off_t offset, off_t amount, char *strategy);
/* Operations that allow use of regular stdio --- USE WITH CAUTION */
extern FILE *AllocateFile(const char *name, const char *mode);
***************
*** 113,118 **** extern int pg_fsync_no_writethrough(int fd);
--- 115,121 ----
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
+ extern int pg_fadvise(int fd, off_t offset, off_t amount, int advise);
extern void fsync_fname(char *fname, bool isdir);
/* Filename components for OpenTemporaryFile */
*** a/src/include/storage/smgr.h
--- b/src/include/storage/smgr.h
***************
*** 92,98 **** extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void smgrread(SMgrRelation reln, ForkNumber forknum,
! BlockNumber blocknum, char *buffer);
extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
--- 92,98 ----
extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void smgrread(SMgrRelation reln, ForkNumber forknum,
! BlockNumber blocknum, char *buffer, char *strategy);
extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
***************
*** 118,124 **** extern void mdextend(SMgrRelation reln, ForkNumber forknum,
extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! char *buffer);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
--- 118,124 ----
extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! char *buffer, char *strategy);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers