Hi,
I have created a patch that drops duplicate buffers from the OS file cache
using a usage_count algorithm. I have been developing this patch since last
summer. This feature seems to be a hot topic of discussion, so I am submitting
it earlier than I had planned.
Buffers with a high usage_count are hard to evict from shared_buffers.
However, these buffers are not needed in the OS file cache, because postgres
does not read them from there (postgres accesses them in shared_buffers).
So I created an algorithm that drops file cache pages whose buffers have a high
usage_count in shared_buffers and which are in a clean state in the OS. If a
file cache page is clean in the OS, executing posix_fadvise() with
POSIX_FADV_DONTNEED simply frees it from the file cache without writing to the
physical disk. This algorithm will solve the double-buffering problem and
allow memory to be used more efficiently.
I am testing DBT-2 benchmark now...
Regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1107,1112 **** include 'filename'
--- 1107,1130 ----
</listitem>
</varlistentry>
+ <varlistentry id="guc-drop-duplicate-buffers" xreflabel="drop_duplicate_buffers">
+ <term><varname>drop_duplicate_buffers</varname> (<type>integer</type>)</term>
+ <indexterm>
+ <primary><varname>drop_duplicate_buffers</> configuration parameter</primary>
+ </indexterm>
+ <listitem>
+ <para>
+ Sets the minimum usage count that a page in shared_buffers must reach before
+ its duplicate copy is dropped from the OS file cache. When you use this
+ parameter, set a large shared_buffers (about 50% or higher) and set this
+ parameter to 4 or 5. It needs to be set carefully: if you use it with a small
+ shared_buffers, transaction performance will decline. This parameter requires
+ the posix_fadvise() system call; on systems without it, it has no effect.
+ </para>
+
+ </listitem>
+ </varlistentry>
+
<varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
<term><varname>temp_buffers</varname> (<type>integer</type>)</term>
<indexterm>
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 69,74 ****
--- 69,75 ----
/* GUC variables */
bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
+ int DropDuplicateBuffers = -1;
double bgwriter_lru_multiplier = 2.0;
bool track_io_timing = false;
***************
*** 111,116 **** static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 112,118 ----
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
static int rnode_comparator(const void *p1, const void *p2);
+ static void DropDuplicateOSCache(volatile BufferDesc *bufHdr);
/*
***************
*** 835,841 **** BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* 1 so that the buffer can survive one clock-sweep pass.)
*/
buf->tag = newTag;
! buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
else
--- 837,843 ----
* 1 so that the buffer can survive one clock-sweep pass.)
*/
buf->tag = newTag;
! buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT | BM_FADVED);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
else
***************
*** 1101,1107 **** PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
--- 1103,1120 ----
if (strategy == NULL)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
+ {
buf->usage_count++;
+
+ /*
+ * If the buffer is clean, we can drop its duplicate copy from the
+ * OS file cache without any physical disk write, which lets total
+ * memory be used more efficiently. This needs to be done only
+ * once per buffer.
+ */
+ if (!(buf->flags & BM_FADVED) && !(buf->flags & BM_JUST_DIRTIED))
+ DropDuplicateOSCache(buf);
+ }
}
else
{
***************
*** 1953,1958 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
--- 1966,1974 ----
*/
recptr = BufferGetLSN(buf);
+ /* mark that this buffer's copy in the OS file cache is no longer clean */
+ buf->flags |= BM_FADVED;
+
/* To check if block content changes while flushing. - vadim 01/17/97 */
buf->flags &= ~BM_JUST_DIRTIED;
UnlockBufHdr(buf);
***************
*** 2101,2106 **** BufferGetLSNAtomic(Buffer buffer)
--- 2117,2144 ----
return lsn;
}
+ /*
+ * Drop duplicate OS cache which is in OS and shared_buffers. This purpose
+ * is for more efficient file cache space and dirty file cache management.
+ */
+ static void
+ DropDuplicateOSCache(volatile BufferDesc *bufHdr)
+ {
+ SMgrRelation reln;
+ MdfdVec *v;
+ off_t seekpos;
+
+ /* Drop OS cache which is higher usage_count in shared_buffer */
+ if(DropDuplicateBuffers != -1 && bufHdr->usage_count >= DropDuplicateBuffers)
+ {
+ reln = smgropen(bufHdr->tag.rnode, InvalidBackendId);
+ v = _mdfd_getseg(reln, bufHdr->tag.forkNum, bufHdr->tag.blockNum, false, 0);
+ seekpos = (off_t) BLCKSZ *(bufHdr->tag.blockNum % ((BlockNumber) RELSEG_SIZE));
+ FileCacheAdvise(v->mdfd_vfd, seekpos, (off_t) BLCKSZ, POSIX_FADV_DONTNEED);
+ bufHdr->flags |= BM_FADVED;
+ }
+ }
+
/* ---------------------------------------------------------------------
* DropRelFileNodeBuffers
*
***************
*** 2414,2419 **** FlushRelationBuffers(Relation rel)
--- 2452,2459 ----
error_context_stack = &errcallback;
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
+ bufHdr->flags |= BM_FADVED;
smgrwrite(rel->rd_smgr,
bufHdr->tag.forkNum,
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
***************
*** 204,209 **** LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
--- 204,211 ----
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+ bufHdr->flags |= BM_FADVED;
+
/* And write... */
smgrwrite(oreln,
bufHdr->tag.forkNum,
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 383,388 **** pg_flush_data(int fd, off_t offset, off_t amount)
--- 383,403 ----
return 0;
}
+ /*
+  * pg_fadvise --- pass a cache-usage hint for a file range to the OS
+  *
+  * Thin wrapper around posix_fadvise().  Where posix_fadvise() (or
+  * POSIX_FADV_DONTNEED) is unavailable, no call is made and 0 is returned.
+  */
+ int
+ pg_fadvise(int fd, off_t offset, off_t amount, int advise)
+ {
+ 	int			result = 0;
+
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ 	result = posix_fadvise(fd, offset, amount, advise);
+ #endif
+
+ 	return result;
+ }
/*
* fsync_fname -- fsync a file or directory, handling errors properly
***************
*** 1142,1147 **** OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
--- 1157,1171 ----
}
/*
+ * Control the OS file cache for a virtual file using posix_fadvise()
+ *
+ * Returns the result of pg_fadvise(), or a negative value if the
+ * underlying file could not be (re)opened.
+ */
+ int
+ FileCacheAdvise(File file, off_t offset, off_t amount, int advise)
+ {
+ 	int			returnCode;
+
+ 	/*
+ 	 * Make sure the virtual fd is actually open before handing its kernel
+ 	 * descriptor to posix_fadvise(); otherwise the stored fd may be stale.
+ 	 */
+ 	returnCode = FileAccess(file);
+ 	if (returnCode < 0)
+ 		return returnCode;
+
+ 	return pg_fadvise(VfdCache[file].fd, offset, amount, advise);
+ }
+
+ /*
* close a file when done with it
*/
void
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 109,121 ****
* All MdfdVec objects are palloc'd in the MdCxt memory context.
*/
- typedef struct _MdfdVec
- {
- File mdfd_vfd; /* fd number in fd.c's pool */
- BlockNumber mdfd_segno; /* segment number, from 0 */
- struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
- } MdfdVec;
-
static MemoryContext MdCxt; /* context for all md.c allocations */
--- 109,114 ----
***************
*** 163,175 **** static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
- typedef enum /* behavior for mdopen & _mdfd_getseg */
- {
- EXTENSION_FAIL, /* ereport if segment not present */
- EXTENSION_RETURN_NULL, /* return NULL if not present */
- EXTENSION_CREATE /* create new segments as needed */
- } ExtensionBehavior;
-
/* local routines */
static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
bool isRedo);
--- 156,161 ----
***************
*** 183,190 **** static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
BlockNumber segno);
static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
BlockNumber segno, int oflags);
- static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
- BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
--- 169,174 ----
***************
*** 1701,1707 **** _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
* segment, according to "behavior". Note: skipFsync is only used in the
* EXTENSION_CREATE case.
*/
! static MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
bool skipFsync, ExtensionBehavior behavior)
{
--- 1685,1691 ----
* segment, according to "behavior". Note: skipFsync is only used in the
* EXTENSION_CREATE case.
*/
! extern MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
bool skipFsync, ExtensionBehavior behavior)
{
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 61,66 ****
--- 61,67 ----
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
+ #include "storage/buf_internals.h"
#include "storage/dsm_impl.h"
#include "storage/standby.h"
#include "storage/fd.h"
***************
*** 2056,2061 **** static struct config_int ConfigureNamesInt[] =
--- 2057,2072 ----
},
{
+ {"drop_duplicate_buffers", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ gettext_noop("drop duplicate buffers in OS file cache"),
+ NULL
+ },
+ &DropDuplicateBuffers,
+ -1, -1, BM_MAX_USAGE_COUNT,
+ NULL, NULL, NULL
+ },
+
+ {
{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
NULL,
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 115,120 ****
--- 115,122 ----
#shared_buffers = 32MB # min 128kB
# (change requires restart)
+ #drop_duplicate_buffers = -1 # min usage_count in shared_buffers to drop OS cache
+ # -1 disables
#temp_buffers = 8MB # min 800kB
#max_prepared_transactions = 0 # zero disables the feature
# (change requires restart)
*** a/src/include/postmaster/bgwriter.h
--- b/src/include/postmaster/bgwriter.h
***************
*** 23,28 ****
--- 23,29 ----
extern int BgWriterDelay;
extern int CheckPointTimeout;
extern int CheckPointWarning;
+ extern int DropDuplicateBuffers;
extern double CheckPointCompletionTarget;
extern void BackgroundWriterMain(void) __attribute__((noreturn));
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 40,45 ****
--- 40,46 ----
#define BM_CHECKPOINT_NEEDED (1 << 7) /* must write for checkpoint */
#define BM_PERMANENT (1 << 8) /* permanent relation (not
* unlogged) */
+ #define BM_FADVED (1 << 9) /* OS file cache copy dropped or dirty; skip fadvise */
typedef bits16 BufFlags;
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 68,73 **** extern int max_safe_fds;
--- 68,74 ----
extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
extern File OpenTemporaryFile(bool interXact);
extern void FileClose(File file);
+ extern int FileCacheAdvise(File file, off_t offset, off_t amount, int advise);
extern int FilePrefetch(File file, off_t offset, int amount);
extern int FileRead(File file, char *buffer, int amount);
extern int FileWrite(File file, char *buffer, int amount);
***************
*** 113,118 **** extern int pg_fsync_no_writethrough(int fd);
--- 114,120 ----
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
extern int pg_flush_data(int fd, off_t offset, off_t amount);
+ extern int pg_fadvise(int fd, off_t offset, off_t amount, int advise);
extern void fsync_fname(char *fname, bool isdir);
/* Filename components for OpenTemporaryFile */
*** a/src/include/storage/smgr.h
--- b/src/include/storage/smgr.h
***************
*** 17,23 ****
#include "fmgr.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
!
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
--- 17,23 ----
#include "fmgr.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
! #include "storage/fd.h"
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
***************
*** 76,81 **** typedef SMgrRelationData *SMgrRelation;
--- 76,96 ----
#define SmgrIsTemp(smgr) \
RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
+ /* For more detail about MdfdVec, see src/backend/storage/smgr/md.c */
+ typedef struct _MdfdVec
+ {
+ File mdfd_vfd; /* fd number in fd.c's pool */
+ BlockNumber mdfd_segno; /* segment number, from 0 */
+ struct _MdfdVec *mdfd_chain; /* next segment, or NULL */
+ } MdfdVec;
+
+ typedef enum /* behavior for mdopen & _mdfd_getseg */
+ {
+ EXTENSION_FAIL, /* ereport if segment not present */
+ EXTENSION_RETURN_NULL, /* return NULL if not present */
+ EXTENSION_CREATE /* create new segments as needed */
+ } ExtensionBehavior;
+
extern void smgrinit(void);
extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
***************
*** 128,133 **** extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
--- 143,150 ----
extern void mdpreckpt(void);
extern void mdsync(void);
extern void mdpostckpt(void);
+ extern MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
+ BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
extern void SetForwardFsyncRequests(void);
extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
*** a/src/include/utils/guc_tables.h
--- b/src/include/utils/guc_tables.h
***************
*** 63,68 **** enum config_group
--- 63,69 ----
RESOURCES_KERNEL,
RESOURCES_VACUUM_DELAY,
RESOURCES_BGWRITER,
+ RESOURCES_CHECKPOINTER,
RESOURCES_ASYNCHRONOUS,
WAL,
WAL_SETTINGS,
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers