Magnus Hagander wrote:
> 2010/2/18 Fujii Masao <[email protected]>:
>> On Thu, Feb 18, 2010 at 7:04 PM, Heikki Linnakangas
>> <[email protected]> wrote:
>>> Magnus Hagander wrote:
>>>> O_DIRECT helps us when we're not going to read the file again, because
>>>> we don't waste cache on it. If we are, which is the case here, it
>>>> should be really bad for performance, since we actually have to do a
>>>> physical read.
>>>>
>>>> Incidentally, that should also apply to general WAL when archive_mdoe
>>>> is on. Do we optimize for that?
>>> Hmm, no we don't. We do take that into account so that we refrain from
>>> issuing posix_fadvice(DONTNEED) if archive_mode is on, but we don't
>>> disable O_DIRECT. Maybe we should..
>> Since the performance of WAL write is more important than that of WAL
>> archiving in general, that optimization might offer little benefit.
>
> Well, it's going to make the process that reads the WAL cause actual
> physical I/O... That'll take a chunk out of your total available I/O,
> which is likely to push you to the limit of your I/O capacity much
> quicker.
Right, doesn't seem sensible, though it would be nice to see a benchmark
on that.
Here's a patch to disable O_DIRECT when archiving or streaming is
enabled. This is pretty hard to test, so any extra eyeballs would be nice..
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
? GNUmakefile
? a.patch
? b
? backend
? config.log
? config.status
? config.status.lineno
? configure.lineno
? gin-splay-1.patch
? gin-splay-2.patch
? gin-splay-3.patch
? include
? libpq
? log_unlogged_op_0115.patch
? md-1.c
? md-1.patch
? replication
? temp-file-resowner-2.patch
? contrib/pgbench/.deps
? contrib/pgbench/fsynctest
? contrib/pgbench/fsynctest.c
? contrib/pgbench/fsynctestfile
? contrib/pgbench/pgbench
? contrib/spi/.deps
? doc/src/sgml/HTML.index
? doc/src/sgml/bookindex.sgml
? doc/src/sgml/features-supported.sgml
? doc/src/sgml/features-unsupported.sgml
? doc/src/sgml/version.sgml
? src/Makefile.global
? src/backend/aaa.patch
? src/backend/postgres
? src/backend/access/common/.deps
? src/backend/access/gin/.deps
? src/backend/access/gist/.deps
? src/backend/access/hash/.deps
? src/backend/access/heap/.deps
? src/backend/access/index/.deps
? src/backend/access/nbtree/.deps
? src/backend/access/transam/.deps
? src/backend/bootstrap/.deps
? src/backend/catalog/.deps
? src/backend/commands/.deps
? src/backend/executor/.deps
? src/backend/foreign/.deps
? src/backend/foreign/dummy/.deps
? src/backend/foreign/postgresql/.deps
? src/backend/lib/.deps
? src/backend/libpq/.deps
? src/backend/main/.deps
? src/backend/nodes/.deps
? src/backend/optimizer/geqo/.deps
? src/backend/optimizer/path/.deps
? src/backend/optimizer/plan/.deps
? src/backend/optimizer/prep/.deps
? src/backend/optimizer/util/.deps
? src/backend/parser/.deps
? src/backend/po/af.mo
? src/backend/po/cs.mo
? src/backend/po/hr.mo
? src/backend/po/hu.mo
? src/backend/po/it.mo
? src/backend/po/ko.mo
? src/backend/po/nb.mo
? src/backend/po/nl.mo
? src/backend/po/pl.mo
? src/backend/po/ro.mo
? src/backend/po/ru.mo
? src/backend/po/sk.mo
? src/backend/po/sl.mo
? src/backend/po/sv.mo
? src/backend/po/zh_CN.mo
? src/backend/po/zh_TW.mo
? src/backend/port/.deps
? src/backend/postmaster/.deps
? src/backend/regex/.deps
? src/backend/replication/.deps
? src/backend/replication/libpqwalreceiver/.deps
? src/backend/rewrite/.deps
? src/backend/snowball/.deps
? src/backend/snowball/snowball_create.sql
? src/backend/storage/buffer/.deps
? src/backend/storage/file/.deps
? src/backend/storage/freespace/.deps
? src/backend/storage/ipc/.deps
? src/backend/storage/large_object/.deps
? src/backend/storage/lmgr/.deps
? src/backend/storage/page/.deps
? src/backend/storage/smgr/.deps
? src/backend/tcop/.deps
? src/backend/tsearch/.deps
? src/backend/utils/.deps
? src/backend/utils/probes.h
? src/backend/utils/adt/.deps
? src/backend/utils/cache/.deps
? src/backend/utils/error/.deps
? src/backend/utils/fmgr/.deps
? src/backend/utils/hash/.deps
? src/backend/utils/init/.deps
? src/backend/utils/mb/.deps
? src/backend/utils/mb/Unicode/BIG5.TXT
? src/backend/utils/mb/Unicode/CP950.TXT
? src/backend/utils/mb/conversion_procs/conversion_create.sql
? src/backend/utils/mb/conversion_procs/ascii_and_mic/.deps
? src/backend/utils/mb/conversion_procs/cyrillic_and_mic/.deps
? src/backend/utils/mb/conversion_procs/euc2004_sjis2004/.deps
? src/backend/utils/mb/conversion_procs/euc_cn_and_mic/.deps
? src/backend/utils/mb/conversion_procs/euc_jis_2004_and_shift_jis_2004/.deps
? src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/.deps
? src/backend/utils/mb/conversion_procs/euc_kr_and_mic/.deps
? src/backend/utils/mb/conversion_procs/euc_tw_and_big5/.deps
? src/backend/utils/mb/conversion_procs/latin2_and_win1250/.deps
? src/backend/utils/mb/conversion_procs/latin_and_mic/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_ascii/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_big5/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_jis_2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_gb18030/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_gbk/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_iso8859/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_johab/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_shift_jis_2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_sjis/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_uhc/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_win/.deps
? src/backend/utils/misc/.deps
? src/backend/utils/mmgr/.deps
? src/backend/utils/resowner/.deps
? src/backend/utils/sort/.deps
? src/backend/utils/time/.deps
? src/bin/initdb/.deps
? src/bin/initdb/initdb
? src/bin/initdb/po/ko.mo
? src/bin/initdb/po/pl.mo
? src/bin/initdb/po/ro.mo
? src/bin/initdb/po/sk.mo
? src/bin/initdb/po/sl.mo
? src/bin/initdb/po/zh_CN.mo
? src/bin/initdb/po/zh_TW.mo
? src/bin/pg_config/.deps
? src/bin/pg_config/pg_config
? src/bin/pg_config/po/cs.mo
? src/bin/pg_config/po/pl.mo
? src/bin/pg_config/po/sl.mo
? src/bin/pg_config/po/zh_CN.mo
? src/bin/pg_config/po/zh_TW.mo
? src/bin/pg_controldata/.deps
? src/bin/pg_controldata/pg_controldata
? src/bin/pg_controldata/po/cs.mo
? src/bin/pg_controldata/po/fa.mo
? src/bin/pg_controldata/po/hu.mo
? src/bin/pg_controldata/po/nb.mo
? src/bin/pg_controldata/po/pl.mo
? src/bin/pg_controldata/po/ro.mo
? src/bin/pg_controldata/po/ru.mo
? src/bin/pg_controldata/po/sk.mo
? src/bin/pg_controldata/po/sl.mo
? src/bin/pg_controldata/po/zh_CN.mo
? src/bin/pg_controldata/po/zh_TW.mo
? src/bin/pg_ctl/.deps
? src/bin/pg_ctl/pg_ctl
? src/bin/pg_ctl/po/cs.mo
? src/bin/pg_ctl/po/ro.mo
? src/bin/pg_ctl/po/sk.mo
? src/bin/pg_ctl/po/sl.mo
? src/bin/pg_ctl/po/zh_CN.mo
? src/bin/pg_ctl/po/zh_TW.mo
? src/bin/pg_dump/.deps
? src/bin/pg_dump/pg_dump
? src/bin/pg_dump/pg_dumpall
? src/bin/pg_dump/pg_restore
? src/bin/pg_dump/po/cs.mo
? src/bin/pg_dump/po/ko.mo
? src/bin/pg_dump/po/nb.mo
? src/bin/pg_dump/po/ro.mo
? src/bin/pg_dump/po/ru.mo
? src/bin/pg_dump/po/sk.mo
? src/bin/pg_dump/po/sl.mo
? src/bin/pg_dump/po/zh_CN.mo
? src/bin/pg_dump/po/zh_TW.mo
? src/bin/pg_resetxlog/.deps
? src/bin/pg_resetxlog/pg_resetxlog
? src/bin/pg_resetxlog/po/cs.mo
? src/bin/pg_resetxlog/po/hu.mo
? src/bin/pg_resetxlog/po/nb.mo
? src/bin/pg_resetxlog/po/sk.mo
? src/bin/pg_resetxlog/po/sl.mo
? src/bin/pg_resetxlog/po/zh_CN.mo
? src/bin/pg_resetxlog/po/zh_TW.mo
? src/bin/psql/.deps
? src/bin/psql/psql
? src/bin/psql/po/fa.mo
? src/bin/psql/po/hu.mo
? src/bin/psql/po/it.mo
? src/bin/psql/po/ko.mo
? src/bin/psql/po/nb.mo
? src/bin/psql/po/ro.mo
? src/bin/psql/po/ru.mo
? src/bin/psql/po/sk.mo
? src/bin/psql/po/sl.mo
? src/bin/psql/po/zh_CN.mo
? src/bin/psql/po/zh_TW.mo
? src/bin/scripts/.deps
? src/bin/scripts/clusterdb
? src/bin/scripts/createdb
? src/bin/scripts/createlang
? src/bin/scripts/createuser
? src/bin/scripts/dropdb
? src/bin/scripts/droplang
? src/bin/scripts/dropuser
? src/bin/scripts/foo
? src/bin/scripts/reindexdb
? src/bin/scripts/vacuumdb
? src/bin/scripts/po/ru.mo
? src/bin/scripts/po/sk.mo
? src/bin/scripts/po/sl.mo
? src/bin/scripts/po/zh_CN.mo
? src/bin/scripts/po/zh_TW.mo
? src/include/pg_config.h
? src/include/stamp-h
? src/interfaces/ecpg/compatlib/.deps
? src/interfaces/ecpg/compatlib/exports.list
? src/interfaces/ecpg/compatlib/libecpg_compat.so.3.1
? src/interfaces/ecpg/compatlib/libecpg_compat.so.3.2
? src/interfaces/ecpg/ecpglib/.deps
? src/interfaces/ecpg/ecpglib/exports.list
? src/interfaces/ecpg/ecpglib/libecpg.so.6.1
? src/interfaces/ecpg/ecpglib/libecpg.so.6.2
? src/interfaces/ecpg/include/ecpg_config.h
? src/interfaces/ecpg/include/stamp-h
? src/interfaces/ecpg/pgtypeslib/.deps
? src/interfaces/ecpg/pgtypeslib/exports.list
? src/interfaces/ecpg/pgtypeslib/libpgtypes.so.3.1
? src/interfaces/ecpg/pgtypeslib/libpgtypes.so.3.2
? src/interfaces/ecpg/preproc/.deps
? src/interfaces/ecpg/preproc/ecpg
? src/interfaces/libpq/.deps
? src/interfaces/libpq/exports.list
? src/interfaces/libpq/libpq.so.5.3
? src/interfaces/libpq/po/af.mo
? src/interfaces/libpq/po/hr.mo
? src/interfaces/libpq/po/nb.mo
? src/interfaces/libpq/po/pl.mo
? src/interfaces/libpq/po/sk.mo
? src/interfaces/libpq/po/sl.mo
? src/interfaces/libpq/po/zh_CN.mo
? src/interfaces/libpq/po/zh_TW.mo
? src/pl/plpgsql/src/.deps
? src/pl/plpgsql/src/pl_scan.c
? src/pl/plpgsql/src/po/tr.mo
? src/port/.deps
? src/port/pg_config_paths.h
? src/test/regress/.deps
? src/test/regress/log
? src/test/regress/pg_regress
? src/test/regress/results
? src/test/regress/testtablespace
? src/test/regress/expected/constraints.out
? src/test/regress/expected/copy.out
? src/test/regress/expected/create_function_1.out
? src/test/regress/expected/create_function_2.out
? src/test/regress/expected/largeobject.out
? src/test/regress/expected/largeobject_1.out
? src/test/regress/expected/misc.out
? src/test/regress/expected/tablespace.out
? src/test/regress/sql/constraints.sql
? src/test/regress/sql/copy.sql
? src/test/regress/sql/create_function_1.sql
? src/test/regress/sql/create_function_2.sql
? src/test/regress/sql/largeobject.sql
? src/test/regress/sql/misc.sql
? src/test/regress/sql/tablespace.sql
? src/timezone/.deps
? src/timezone/zic
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.375
diff -u -r1.375 xlog.c
--- src/backend/access/transam/xlog.c 17 Feb 2010 04:19:39 -0000 1.375
+++ src/backend/access/transam/xlog.c 18 Feb 2010 12:40:05 -0000
@@ -2686,13 +2686,10 @@
* WAL segment files will not be re-read in normal operation, so we advise
* the OS to release any cached pages. But do not do so if WAL archiving
* or streaming is active, because archiver and walsender process could use
- * the cache to read the WAL segment. Also, don't bother with it if we
- * are using O_DIRECT, since the kernel is presumably not caching in that
- * case.
+ * the cache to read the WAL segment.
*/
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
- if (!XLogIsNeeded() &&
- (get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
+ if (!XLogIsNeeded())
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif
@@ -7652,10 +7649,29 @@
static int
get_sync_bit(int method)
{
+ int o_direct_flag = 0;
+
/* If fsync is disabled, never open in sync mode */
if (!enableFsync)
return 0;
+ /*
+ * Optimize writes by bypassing kernel cache with O_DIRECT when using
+ * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
+ * disabled, otherwise the archive command or walsender process will
+ * read the WAL soon after writing it, which is guaranteed to cause a
+ * physical read if we bypassed the kernel cache. We also skip the
+ * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the
+ * same reason.
+ *
+ * Never use O_DIRECT in walsender process for similar reasons; the WAL
+ * written by walreceiver is normally read by the startup process soon
+ * after its written. Also, walreceiver performs unaligned writes, which
+ * don't work with O_DIRECT, so it is required for correctness too.
+ */
+ if (!XLogIsNeeded() && !am_walreceiver)
+ o_direct_flag = PG_O_DIRECT;
+
switch (method)
{
/*
@@ -7670,11 +7686,11 @@
return 0;
#ifdef OPEN_SYNC_FLAG
case SYNC_METHOD_OPEN:
- return OPEN_SYNC_FLAG;
+ return OPEN_SYNC_FLAG | o_direct_flag;
#endif
#ifdef OPEN_DATASYNC_FLAG
case SYNC_METHOD_OPEN_DSYNC:
- return OPEN_DATASYNC_FLAG;
+ return OPEN_DATASYNC_FLAG | o_direct_flag;
#endif
default:
/* can't happen (unless we are out of sync with option array) */
Index: src/backend/replication/walreceiver.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/replication/walreceiver.c,v
retrieving revision 1.4
diff -u -r1.4 walreceiver.c
--- src/backend/replication/walreceiver.c 17 Feb 2010 04:19:39 -0000 1.4
+++ src/backend/replication/walreceiver.c 18 Feb 2010 12:40:05 -0000
@@ -50,6 +50,9 @@
#include "utils/ps_status.h"
#include "utils/resowner.h"
+/* Global variable to indicate if this process is a walreceiver process */
+bool am_walreceiver;
+
/* libpqreceiver hooks to these when loaded */
walrcv_connect_type walrcv_connect = NULL;
walrcv_receive_type walrcv_receive = NULL;
@@ -158,6 +161,8 @@
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = WalRcv;
+ am_walreceiver = true;
+
/*
* WalRcv should be set up already (if we are a backend, we inherit
* this by fork() or EXEC_BACKEND mechanism from the postmaster).
@@ -424,16 +429,18 @@
bool use_existent;
/*
- * XLOG segment files will be re-read in recovery operation soon,
- * so we don't need to advise the OS to release any cache page.
+ * fsync() and close current file before we switch to next one.
+ * We would otherwise have to reopen this file to fsync it later
*/
if (recvFile >= 0)
{
+ XLogWalRcvFlush();
+
/*
- * fsync() before we switch to next file. We would otherwise
- * have to reopen this file to fsync it later
+ * XLOG segment files will be re-read by recovery in startup
+ * process soon, so we don't advise the OS to release cache
+ * pages associated with the file like XLogFileClose() does.
*/
- XLogWalRcvFlush();
if (close(recvFile) != 0)
ereport(PANIC,
(errcode_for_file_access(),
@@ -445,8 +452,7 @@
/* Create/use new log file */
XLByteToSeg(recptr, recvId, recvSeg);
use_existent = true;
- recvFile = XLogFileInit(recvId, recvSeg,
- &use_existent, true);
+ recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true);
recvOff = 0;
}
Index: src/include/access/xlogdefs.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/access/xlogdefs.h,v
retrieving revision 1.25
diff -u -r1.25 xlogdefs.h
--- src/include/access/xlogdefs.h 15 Jan 2010 09:19:06 -0000 1.25
+++ src/include/access/xlogdefs.h 18 Feb 2010 12:40:05 -0000
@@ -106,23 +106,20 @@
* configure determined whether fdatasync() is.
*/
#if defined(O_SYNC)
-#define BARE_OPEN_SYNC_FLAG O_SYNC
+#define OPEN_SYNC_FLAG O_SYNC
#elif defined(O_FSYNC)
-#define BARE_OPEN_SYNC_FLAG O_FSYNC
-#endif
-#ifdef BARE_OPEN_SYNC_FLAG
-#define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
+#define OPEN_SYNC_FLAG O_FSYNC
#endif
#if defined(O_DSYNC)
#if defined(OPEN_SYNC_FLAG)
/* O_DSYNC is distinct? */
-#if O_DSYNC != BARE_OPEN_SYNC_FLAG
-#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
+#if O_DSYNC != OPEN_SYNC_FLAG
+#define OPEN_DATASYNC_FLAG O_DSYNC
#endif
#else /* !defined(OPEN_SYNC_FLAG) */
/* Win32 only has O_DSYNC */
-#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
+#define OPEN_DATASYNC_FLAG O_DSYNC
#endif
#endif
Index: src/include/replication/walreceiver.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/replication/walreceiver.h,v
retrieving revision 1.6
diff -u -r1.6 walreceiver.h
--- src/include/replication/walreceiver.h 3 Feb 2010 09:47:19 -0000 1.6
+++ src/include/replication/walreceiver.h 18 Feb 2010 12:40:05 -0000
@@ -15,6 +15,8 @@
#include "access/xlogdefs.h"
#include "storage/spin.h"
+extern bool am_walreceiver;
+
/*
* MAXCONNINFO: maximum size of a connection string.
*
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers