Hi,
Using sync_file_range(2) as wal_sync_method might speed up
the XLOG flush. So, I made a patch that introduces a new valid
value (sync_file_range) for wal_sync_method, and ran a
comparative performance measurement of fdatasync vs
sync_file_range using this patch. The patch is attached to this
mail. This is just for reference; I'm not planning to
submit the patch to the CommitFest now.
Environment:
- PowerEdge1850 (Xeon 2.8GHz, Mem 512MB)
- Fedora11
- PostgreSQL v8.4 with the patch
Measurement:
- pgbench -i -s64
- pgbench -c16 -t1000 -Mprepared * [20 times]
- postgresql.conf
checkpoint_segments = 64
- The above measurement was repeated 3 times
Result:
- The following values indicate throughput of pgbench (tps)
The first set
----------------
fdatasync sync_file_range
1 60.6 58.9
2 63.1 58.8
3 61.3 62.3
4 70.3 66.8
5 67.4 66.2
6 67.8 71.1
7 74.3 67.5
8 70.0 71.9
9 71.7 72.8
10 74.0 72.0
11 72.3 72.1
12 79.9 78.6
13 73.3 73.3
14 72.9 71.2
15 78.6 78.6
16 81.7 76.7
17 75.5 75.9
18 78.0 73.3
19 75.3 78.9
20 83.0 77.3
avg 72.5 71.2
The second set
---------------------
fdatasync sync_file_range
1 52.6 60.3
2 57.4 65.9
3 62.6 63.7
4 59.0 68.9
5 67.0 72.2
6 61.5 72.2
7 69.0 73.4
8 64.3 75.6
9 67.6 74.8
10 69.1 75.7
11 65.7 77.7
12 72.6 76.6
13 68.8 75.5
14 69.4 79.4
15 74.2 81.2
16 71.4 77.5
17 71.3 78.0
18 73.1 80.4
19 73.5 80.2
20 73.7 80.7
avg 67.2 74.5
The third set
-----------------
fdatasync sync_file_range
1 60.9 59.5
2 58.3 64.1
3 64.7 62.9
4 66.6 68.0
5 67.9 70.9
6 69.9 69.4
7 70.0 72.6
8 72.3 76.6
9 70.7 74.7
10 70.3 70.2
11 77.2 78.2
12 74.8 73.9
13 69.6 79.0
14 79.3 80.7
15 78.0 74.6
16 77.8 78.9
17 73.6 81.0
18 81.5 77.6
19 76.1 78.5
20 79.1 83.7
avg 71.9 73.8
According to the results, using sync_file_range instead of fdatasync
has little effect on the performance of postgres. This time I just used
sync_file_range with the following combination of flags:
SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
SYNC_FILE_RANGE_WAIT_AFTER
This may not be the best way to use it, so there might be room for improvement.
Regards,
--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center
Index: configure
===================================================================
RCS file: /projects/cvsroot/pgsql/configure,v
retrieving revision 1.644
diff -c -r1.644 configure
*** configure 27 Jun 2009 00:14:46 -0000 1.644
--- configure 30 Jun 2009 04:54:13 -0000
***************
*** 16587,16592 ****
--- 16587,16761 ----
fi
+ # sync_file_range() is a no-op on Solaris, so don't incur function overhead
+ # by calling it.
+ if test "$PORTNAME" != "solaris"; then
+
+ for ac_func in sync_file_range
+ do
+ as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ { echo "$as_me:$LINENO: checking for $ac_func" >&5
+ echo $ECHO_N "checking for $ac_func... $ECHO_C" >&6; }
+ if { as_var=$as_ac_var; eval "test \"\${$as_var+set}\" = set"; }; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+ else
+ cat >conftest.$ac_ext <<_ACEOF
+ /* confdefs.h. */
+ _ACEOF
+ cat confdefs.h >>conftest.$ac_ext
+ cat >>conftest.$ac_ext <<_ACEOF
+ /* end confdefs.h. */
+ /* Define $ac_func to an innocuous variant, in case <limits.h> declares $ac_func.
+ For example, HP-UX 11i <limits.h> declares gettimeofday. */
+ #define $ac_func innocuous_$ac_func
+
+ /* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char $ac_func (); below.
+ Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+ <limits.h> exists even on freestanding compilers. */
+
+ #ifdef __STDC__
+ # include <limits.h>
+ #else
+ # include <assert.h>
+ #endif
+
+ #undef $ac_func
+
+ /* Override any GCC internal prototype to avoid an error.
+ Use char because int might match the return type of a GCC
+ builtin and then its argument prototype would still apply. */
+ #ifdef __cplusplus
+ extern "C"
+ #endif
+ char $ac_func ();
+ /* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+ #if defined __stub_$ac_func || defined __stub___$ac_func
+ choke me
+ #endif
+
+ int
+ main ()
+ {
+ return $ac_func ();
+ ;
+ return 0;
+ }
+ _ACEOF
+ rm -f conftest.$ac_objext conftest$ac_exeext
+ if { (ac_try="$ac_link"
+ case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_link") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest$ac_exeext &&
+ $as_test_x conftest$ac_exeext; then
+ eval "$as_ac_var=yes"
+ else
+ echo "$as_me: failed program was:" >&5
+ sed 's/^/| /' conftest.$ac_ext >&5
+
+ eval "$as_ac_var=no"
+ fi
+
+ rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+ conftest$ac_exeext conftest.$ac_ext
+ fi
+ ac_res=`eval echo '${'$as_ac_var'}'`
+ { echo "$as_me:$LINENO: result: $ac_res" >&5
+ echo "${ECHO_T}$ac_res" >&6; }
+ if test `eval echo '${'$as_ac_var'}'` = yes; then
+ cat >>confdefs.h <<_ACEOF
+ #define `echo "HAVE_$ac_func" | $as_tr_cpp` 1
+ _ACEOF
+
+ fi
+ done
+
+ { echo "$as_me:$LINENO: checking whether sync_file_range is declared" >&5
+ echo $ECHO_N "checking whether sync_file_range is declared... $ECHO_C" >&6; }
+ if test "${ac_cv_have_decl_sync_file_range+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+ else
+ cat >conftest.$ac_ext <<_ACEOF
+ /* confdefs.h. */
+ _ACEOF
+ cat confdefs.h >>conftest.$ac_ext
+ cat >>conftest.$ac_ext <<_ACEOF
+ /* end confdefs.h. */
+ #define _GNU_SOURCE
+ #include <fcntl.h>
+
+ int
+ main ()
+ {
+ #ifndef sync_file_range
+ (void) sync_file_range;
+ #endif
+
+ ;
+ return 0;
+ }
+ _ACEOF
+ rm -f conftest.$ac_objext
+ if { (ac_try="$ac_compile"
+ case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+ (eval "$ac_compile") 2>conftest.er1
+ ac_status=$?
+ grep -v '^ *+' conftest.er1 >conftest.err
+ rm -f conftest.er1
+ cat conftest.err >&5
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then
+ ac_cv_have_decl_sync_file_range=yes
+ else
+ echo "$as_me: failed program was:" >&5
+ sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_cv_have_decl_sync_file_range=no
+ fi
+
+ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ fi
+ { echo "$as_me:$LINENO: result: $ac_cv_have_decl_sync_file_range" >&5
+ echo "${ECHO_T}$ac_cv_have_decl_sync_file_range" >&6; }
+ if test $ac_cv_have_decl_sync_file_range = yes; then
+
+ cat >>confdefs.h <<_ACEOF
+ #define HAVE_DECL_SYNC_FILE_RANGE 1
+ _ACEOF
+
+
+ else
+ cat >>confdefs.h <<_ACEOF
+ #define HAVE_DECL_SYNC_FILE_RANGE 0
+ _ACEOF
+
+
+ fi
+
+
+ fi
+
{ echo "$as_me:$LINENO: checking whether fdatasync is declared" >&5
echo $ECHO_N "checking whether fdatasync is declared... $ECHO_C" >&6; }
if test "${ac_cv_have_decl_fdatasync+set}" = set; then
Index: configure.in
===================================================================
RCS file: /projects/cvsroot/pgsql/configure.in,v
retrieving revision 1.602
diff -c -r1.602 configure.in
*** configure.in 27 Jun 2009 00:14:47 -0000 1.602
--- configure.in 30 Jun 2009 04:54:13 -0000
***************
*** 1151,1156 ****
--- 1151,1163 ----
AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>])
fi
+ # sync_file_range() is a no-op on Solaris, so don't incur function overhead
+ # by calling it.
+ if test "$PORTNAME" != "solaris"; then
+ AC_CHECK_FUNCS(sync_file_range)
+ AC_CHECK_DECLS(sync_file_range, [], [], [#include <fcntl.h>])
+ fi
+
AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])
AC_CHECK_DECLS([strlcat, strlcpy])
# This is probably only present on Darwin, but may as well check always
Index: doc/src/sgml/config.sgml
===================================================================
RCS file: /projects/cvsroot/pgsql/doc/src/sgml/config.sgml,v
retrieving revision 1.220
diff -c -r1.220 config.sgml
*** doc/src/sgml/config.sgml 17 Jun 2009 21:58:48 -0000 1.220
--- doc/src/sgml/config.sgml 30 Jun 2009 04:54:13 -0000
***************
*** 1406,1411 ****
--- 1406,1416 ----
<literal>open_sync</> (write WAL files with <function>open()</> option <symbol>O_SYNC</>)
</para>
</listitem>
+ <listitem>
+ <para>
+ <literal>sync_file_range</> (call <function>sync_file_range()</> at each commit)
+ </para>
+ </listitem>
</itemizedlist>
<para>
Not all of these choices are available on all platforms.
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.345
diff -c -r1.345 xlog.c
*** src/backend/access/transam/xlog.c 26 Jun 2009 20:29:04 -0000 1.345
--- src/backend/access/transam/xlog.c 30 Jun 2009 04:54:13 -0000
***************
*** 99,104 ****
--- 99,107 ----
#ifdef HAVE_FDATASYNC
{"fdatasync", SYNC_METHOD_FDATASYNC, false},
#endif
+ #ifdef HAVE_SYNC_FILE_RANGE
+ {"sync_file_range", SYNC_METHOD_SYNC_FILE_RANGE, false},
+ #endif
#ifdef OPEN_SYNC_FLAG
{"open_sync", SYNC_METHOD_OPEN, false},
#endif
***************
*** 501,507 ****
--- 504,514 ----
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogRecord *record);
#endif
+ #ifdef HAVE_SYNC_FILE_RANGE
+ static void issue_xlog_fsync(uint32 offset, Size nbytes);
+ #else
static void issue_xlog_fsync(void);
+ #endif
static void pg_start_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
XLogRecPtr *minRecoveryLoc);
***************
*** 1526,1531 ****
--- 1533,1542 ----
int npages;
int startidx;
uint32 startoffset;
+ #ifdef HAVE_SYNC_FILE_RANGE
+ uint32 fsyncoffset;
+ Size fsyncnbytes;
+ #endif
/* We should always be inside a critical section here */
Assert(CritSectionCount > 0);
***************
*** 1548,1553 ****
--- 1559,1570 ----
startidx = 0;
startoffset = 0;
+ /* Initialize info about sync of a file segment */
+ #ifdef HAVE_SYNC_FILE_RANGE
+ fsyncoffset = 0;
+ fsyncnbytes = 0;
+ #endif
+
/*
* Within the loop, curridx is the cache block index of the page to
* consider writing. We advance Write->curridx only after successfully
***************
*** 1656,1661 ****
--- 1673,1685 ----
openLogOff, (unsigned long) nbytes)));
}
+ /* Update state for sync */
+ #ifdef HAVE_SYNC_FILE_RANGE
+ if (fsyncnbytes == 0)
+ fsyncoffset = startoffset;
+ fsyncnbytes += nbytes;
+ #endif
+
/* Update state for write */
openLogOff += nbytes;
Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
***************
*** 1679,1685 ****
--- 1703,1714 ----
*/
if (finishing_seg || (xlog_switch && last_iteration))
{
+ #ifdef HAVE_SYNC_FILE_RANGE
+ issue_xlog_fsync(fsyncoffset, fsyncnbytes);
+ fsyncnbytes = 0;
+ #else
issue_xlog_fsync();
+ #endif
LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
if (XLogArchivingActive())
***************
*** 1743,1749 ****
--- 1772,1783 ----
openLogFile = XLogFileOpen(openLogId, openLogSeg);
openLogOff = 0;
}
+ #ifdef HAVE_SYNC_FILE_RANGE
+ issue_xlog_fsync(fsyncoffset, fsyncnbytes);
+ fsyncnbytes = 0;
+ #else
issue_xlog_fsync();
+ #endif
}
LogwrtResult.Flush = LogwrtResult.Write;
}
***************
*** 7107,7112 ****
--- 7141,7147 ----
case SYNC_METHOD_FSYNC:
case SYNC_METHOD_FSYNC_WRITETHROUGH:
case SYNC_METHOD_FDATASYNC:
+ case SYNC_METHOD_SYNC_FILE_RANGE:
return 0;
#ifdef OPEN_SYNC_FLAG
case SYNC_METHOD_OPEN:
***************
*** 7160,7166 ****
* Issue appropriate kind of fsync (if any) on the current XLOG output file
*/
static void
! issue_xlog_fsync(void)
{
switch (sync_method)
{
--- 7195,7205 ----
* Issue appropriate kind of fsync (if any) on the current XLOG output file
*/
static void
! #ifdef HAVE_SYNC_FILE_RANGE
! issue_xlog_fsync(uint32 offset, Size nbytes)
! #else
! issue_xlog_fsync()
! #endif
{
switch (sync_method)
{
***************
*** 7193,7198 ****
--- 7232,7246 ----
case SYNC_METHOD_OPEN_DSYNC:
/* write synced it already */
break;
+ #ifdef HAVE_SYNC_FILE_RANGE
+ case SYNC_METHOD_SYNC_FILE_RANGE:
+ if (pg_sync_file_range(openLogFile, offset, nbytes) != 0)
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not sync_file_range log file %u, segment %u: %m",
+ openLogId, openLogSeg)));
+ break;
+ #endif
default:
elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
break;
Index: src/backend/storage/file/fd.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/file/fd.c,v
retrieving revision 1.149
diff -c -r1.149 fd.c
*** src/backend/storage/file/fd.c 11 Jun 2009 14:49:01 -0000 1.149
--- src/backend/storage/file/fd.c 30 Jun 2009 04:54:13 -0000
***************
*** 44,49 ****
--- 44,52 ----
#include <sys/param.h>
#include <sys/stat.h>
#include <unistd.h>
+ #ifdef HAVE_SYNC_FILE_RANGE
+ #define _GNU_SOURCE
+ #endif
#include <fcntl.h>
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h> /* for getrlimit */
***************
*** 318,323 ****
--- 321,350 ----
}
/*
+ * pg_sync_file_range --- same as sync_file_range except does nothing if
+ * enableFsync is off
+ *
+ * Not all platforms have sync_file_range; treat as fsync if not available.
+ */
+ int
+ pg_sync_file_range(int fd, uint32 offset, Size nbytes)
+ {
+ if (enableFsync)
+ {
+ #ifdef HAVE_SYNC_FILE_RANGE
+ return sync_file_range(fd, (off64_t) offset, (off64_t) nbytes,
+ SYNC_FILE_RANGE_WAIT_BEFORE |
+ SYNC_FILE_RANGE_WRITE |
+ SYNC_FILE_RANGE_WAIT_AFTER);
+ #else
+ return fsync(fd);
+ #endif
+ }
+ else
+ return 0;
+ }
+
+ /*
* InitFileAccess --- initialize this module during backend startup
*
* This is called during either normal or standalone backend start.
Index: src/backend/utils/misc/postgresql.conf.sample
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.260
diff -c -r1.260 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample 23 Apr 2009 00:23:45 -0000 1.260
--- src/backend/utils/misc/postgresql.conf.sample 30 Jun 2009 04:54:13 -0000
***************
*** 156,161 ****
--- 156,162 ----
# fsync
# fsync_writethrough
# open_sync
+ # sync_file_range
#full_page_writes = on # recover from partial page writes
#wal_buffers = 64kB # min 32kB
# (change requires restart)
Index: src/include/pg_config.h.in
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/pg_config.h.in,v
retrieving revision 1.139
diff -c -r1.139 pg_config.h.in
*** src/include/pg_config.h.in 4 Apr 2009 21:55:50 -0000 1.139
--- src/include/pg_config.h.in 30 Jun 2009 04:54:13 -0000
***************
*** 114,119 ****
--- 114,123 ----
don't. */
#undef HAVE_DECL_STRLCPY
+ /* Define to 1 if you have the declaration of `sync_file_range', and to 0 if you
+ don't. */
+ #undef HAVE_DECL_SYNC_FILE_RANGE
+
/* Define to 1 if you have the declaration of `sys_siglist', and to 0 if you
don't. */
#undef HAVE_DECL_SYS_SIGLIST
***************
*** 508,513 ****
--- 512,520 ----
/* Define to 1 if you have the `symlink' function. */
#undef HAVE_SYMLINK
+ /* Define to 1 if you have the `sync_file_range' function. */
+ #undef HAVE_SYNC_FILE_RANGE
+
/* Define to 1 if you have the `sysconf' function. */
#undef HAVE_SYSCONF
Index: src/include/access/xlog.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/xlog.h,v
retrieving revision 1.93
diff -c -r1.93 xlog.h
*** src/include/access/xlog.h 26 Jun 2009 20:29:04 -0000 1.93
--- src/include/access/xlog.h 30 Jun 2009 04:54:13 -0000
***************
*** 91,96 ****
--- 91,97 ----
#define SYNC_METHOD_OPEN 2 /* for O_SYNC */
#define SYNC_METHOD_FSYNC_WRITETHROUGH 3
#define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */
+ #define SYNC_METHOD_SYNC_FILE_RANGE 5
extern int sync_method;
/*
Index: src/include/access/xlogdefs.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/xlogdefs.h,v
retrieving revision 1.23
diff -c -r1.23 xlogdefs.h
*** src/include/access/xlogdefs.h 1 Jan 2009 17:23:56 -0000 1.23
--- src/include/access/xlogdefs.h 30 Jun 2009 04:54:13 -0000
***************
*** 114,119 ****
--- 114,121 ----
#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN_DSYNC
#elif defined(HAVE_FDATASYNC)
#define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC
+ #elif defined(HAVE_SYNC_FILE_RANGE)
+ #define DEFAULT_SYNC_METHOD SYNC_METHOD_SYNC_FILE_RANGE
#elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
#define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC_WRITETHROUGH
#else
Index: src/include/storage/fd.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/fd.h,v
retrieving revision 1.64
diff -c -r1.64 fd.h
*** src/include/storage/fd.h 12 Jan 2009 05:10:45 -0000 1.64
--- src/include/storage/fd.h 30 Jun 2009 04:54:13 -0000
***************
*** 97,102 ****
--- 97,103 ----
extern int pg_fsync_no_writethrough(int fd);
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
+ extern int pg_sync_file_range(int fd, uint32 offset, Size nbytes);
/* Filename components for OpenTemporaryFile */
#define PG_TEMP_FILES_DIR "pgsql_tmp"
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers