Hi, I have created a patch that can drop duplicate buffers from the OS file cache using a usage_count algorithm. I have been developing this patch since last summer. This feature seems to be a hot topic of discussion, so I am submitting it earlier than I had scheduled.
When buffers in shared_buffers have a high usage_count, they are hard to evict from shared_buffers. However, these buffers are not needed in the OS file cache, because postgres does not access them there (postgres accesses shared_buffers instead). So I created an algorithm that drops file cache pages whose buffers have a high usage_count in shared_buffers and are in a clean state in the OS. If a file cache page is clean in the OS, executing posix_fadvise with POSIX_FADV_DONTNEED simply frees it from the file cache without writing to the physical disk. This algorithm will solve the double-buffering problem and allow memory to be used more efficiently. I am testing with the DBT-2 benchmark now... Regards, -- Mitsumasa KONDO NTT Open Source Software Center
*** a/doc/src/sgml/config.sgml --- b/doc/src/sgml/config.sgml *************** *** 1107,1112 **** include 'filename' --- 1107,1130 ---- </listitem> </varlistentry> + <varlistentry id="guc-drop-duplicate-buffers" xreflabel="drop-duplicate-buffers"> + <term><varname>drop-duplicate-buffers</varname> (<type>integer</type>)</term> + <indexterm> + <primary><varname>drop-duplicate-buffers</> configuration parameter</primary> + </indexterm> + <listitem> + <para> + Sets target min usage count in shared_buffers, which is droped in file cache. + When you use this parameter, you set large shared_buffers that is about 50% + or higher, and set parameter with 4 or 5. It need to set carefuly. If you + use this parameter with small shared_buffers, transaction performance will be + decliend. This parameter is needed to support posix_fadvise() system call. + If your system doesn't support posix_fadvise(), it doesn't work at all. + </para> + + </listitem> + </varlistentry> + <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers"> <term><varname>temp_buffers</varname> (<type>integer</type>)</term> <indexterm> *** a/src/backend/storage/buffer/bufmgr.c --- b/src/backend/storage/buffer/bufmgr.c *************** *** 69,74 **** --- 69,75 ---- /* GUC variables */ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; + int DropDuplicateBuffers = -1; double bgwriter_lru_multiplier = 2.0; bool track_io_timing = false; *************** *** 111,116 **** static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, --- 112,118 ---- static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln); static void AtProcExit_Buffers(int code, Datum arg); static int rnode_comparator(const void *p1, const void *p2); + static void DropDuplicateOSCache(volatile BufferDesc *bufHdr); /* *************** *** 835,841 **** BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * 1 so that the buffer can survive one clock-sweep pass.) */ buf->tag = newTag; ! 
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT); if (relpersistence == RELPERSISTENCE_PERMANENT) buf->flags |= BM_TAG_VALID | BM_PERMANENT; else --- 837,843 ---- * 1 so that the buffer can survive one clock-sweep pass.) */ buf->tag = newTag; ! buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT | BM_FADVED); if (relpersistence == RELPERSISTENCE_PERMANENT) buf->flags |= BM_TAG_VALID | BM_PERMANENT; else *************** *** 1101,1107 **** PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy) --- 1103,1120 ---- if (strategy == NULL) { if (buf->usage_count < BM_MAX_USAGE_COUNT) + { buf->usage_count++; + + /* + * If the buffer is clean, we can remove duplicate buffers + * from the file cache in OS without physical disk writing, + * which is for more efficient whole memory using. It is needed + * to execute at once per buffers. + */ + if (!(buf->flags & BM_FADVED) && !(buf->flags & BM_JUST_DIRTIED)) + DropDuplicateOSCache(buf); + } } else { *************** *** 1953,1958 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) --- 1966,1974 ---- */ recptr = BufferGetLSN(buf); + /* mark this buffer's file cache in OS isn't clean */ + buf->flags |= BM_FADVED; + /* To check if block content changes while flushing. - vadim 01/17/97 */ buf->flags &= ~BM_JUST_DIRTIED; UnlockBufHdr(buf); *************** *** 2101,2106 **** BufferGetLSNAtomic(Buffer buffer) --- 2117,2144 ---- return lsn; } + /* + * Drop duplicate OS cache which is in OS and shared_buffers. This purpose + * is for more efficient file cache space and dirty file cache management. 
+ */ + static void + DropDuplicateOSCache(volatile BufferDesc *bufHdr) + { + SMgrRelation reln; + MdfdVec *v; + off_t seekpos; + + /* Drop OS cache which is higher usage_count in shared_buffer */ + if(DropDuplicateBuffers != -1 && bufHdr->usage_count >= DropDuplicateBuffers) + { + reln = smgropen(bufHdr->tag.rnode, InvalidBackendId); + v = _mdfd_getseg(reln, bufHdr->tag.forkNum, bufHdr->tag.blockNum, false, 0); + seekpos = (off_t) BLCKSZ *(bufHdr->tag.blockNum % ((BlockNumber) RELSEG_SIZE)); + FileCacheAdvise(v->mdfd_vfd, seekpos, (off_t) BLCKSZ, POSIX_FADV_DONTNEED); + bufHdr->flags |= BM_FADVED; + } + } + /* --------------------------------------------------------------------- * DropRelFileNodeBuffers * *************** *** 2414,2419 **** FlushRelationBuffers(Relation rel) --- 2452,2459 ---- error_context_stack = &errcallback; PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + + bufHdr->flags |= BM_FADVED; smgrwrite(rel->rd_smgr, bufHdr->tag.forkNum, *** a/src/backend/storage/buffer/localbuf.c --- b/src/backend/storage/buffer/localbuf.c *************** *** 204,209 **** LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, --- 204,211 ---- PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + bufHdr->flags |= BM_FADVED; + /* And write... */ smgrwrite(oreln, bufHdr->tag.forkNum, *** a/src/backend/storage/file/fd.c --- b/src/backend/storage/file/fd.c *************** *** 383,388 **** pg_flush_data(int fd, off_t offset, off_t amount) --- 383,403 ---- return 0; } + /* + * pg_fadvise --- advise OS that the cache will need or not + * + * Not all platforms have posix_fadvise. If it does not support posix_fadvise, + * we do nothing about here. 
+ */ + int + pg_fadvise(int fd, off_t offset, off_t amount, int advise) + { + #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + return posix_fadvise(fd, offset, amount, advise); + #else + return 0; + #endif + } /* * fsync_fname -- fsync a file or directory, handling errors properly *************** *** 1142,1147 **** OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) --- 1157,1171 ---- } /* + * Controling OS file cache using posix_fadvise() + */ + int + FileCacheAdvise(File file, off_t offset, off_t amount, int advise) + { + return pg_fadvise(VfdCache[file].fd, offset, amount, advise); + } + + /* * close a file when done with it */ void *** a/src/backend/storage/smgr/md.c --- b/src/backend/storage/smgr/md.c *************** *** 109,121 **** * All MdfdVec objects are palloc'd in the MdCxt memory context. */ - typedef struct _MdfdVec - { - File mdfd_vfd; /* fd number in fd.c's pool */ - BlockNumber mdfd_segno; /* segment number, from 0 */ - struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ - } MdfdVec; - static MemoryContext MdCxt; /* context for all md.c allocations */ --- 109,114 ---- *************** *** 163,175 **** static CycleCtr mdsync_cycle_ctr = 0; static CycleCtr mdckpt_cycle_ctr = 0; - typedef enum /* behavior for mdopen & _mdfd_getseg */ - { - EXTENSION_FAIL, /* ereport if segment not present */ - EXTENSION_RETURN_NULL, /* return NULL if not present */ - EXTENSION_CREATE /* create new segments as needed */ - } ExtensionBehavior; - /* local routines */ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo); --- 156,161 ---- *************** *** 183,190 **** static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno); static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno, BlockNumber segno, int oflags); - static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, - BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior); static BlockNumber 
_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); --- 169,174 ---- *************** *** 1701,1707 **** _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, * segment, according to "behavior". Note: skipFsync is only used in the * EXTENSION_CREATE case. */ ! static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior) { --- 1685,1691 ---- * segment, according to "behavior". Note: skipFsync is only used in the * EXTENSION_CREATE case. */ ! extern MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior) { *** a/src/backend/utils/misc/guc.c --- b/src/backend/utils/misc/guc.c *************** *** 61,66 **** --- 61,67 ---- #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" + #include "storage/buf_internals.h" #include "storage/dsm_impl.h" #include "storage/standby.h" #include "storage/fd.h" *************** *** 2056,2061 **** static struct config_int ConfigureNamesInt[] = --- 2057,2072 ---- }, { + {"drop_duplicate_buffers", PGC_SIGHUP, RESOURCES_CHECKPOINTER, + gettext_noop("drop duplicate buffers in OS file cache"), + NULL + }, + &DropDuplicateBuffers, + -1, -1, BM_MAX_USAGE_COUNT, + NULL, NULL, NULL + }, + + { {"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS, gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."), NULL, *** a/src/backend/utils/misc/postgresql.conf.sample --- b/src/backend/utils/misc/postgresql.conf.sample *************** *** 115,120 **** --- 115,122 ---- #shared_buffers = 32MB # min 128kB # (change requires restart) + #drop_duplicate_buffers = -1 # set target min usage_count in shared_buffers + # -1 is disables #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) *** a/src/include/postmaster/bgwriter.h --- b/src/include/postmaster/bgwriter.h 
*************** *** 23,28 **** --- 23,29 ---- extern int BgWriterDelay; extern int CheckPointTimeout; extern int CheckPointWarning; + extern int DropDuplicateBuffers; extern double CheckPointCompletionTarget; extern void BackgroundWriterMain(void) __attribute__((noreturn)); *** a/src/include/storage/buf_internals.h --- b/src/include/storage/buf_internals.h *************** *** 40,45 **** --- 40,46 ---- #define BM_CHECKPOINT_NEEDED (1 << 7) /* must write for checkpoint */ #define BM_PERMANENT (1 << 8) /* permanent relation (not * unlogged) */ + #define BM_FADVED (1 << 9) /* remove duplicate file cache flag */ typedef bits16 BufFlags; *** a/src/include/storage/fd.h --- b/src/include/storage/fd.h *************** *** 68,73 **** extern int max_safe_fds; --- 68,74 ---- extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode); extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); + extern int FileCacheAdvise(File file, off_t offset, off_t amount, int advise); extern int FilePrefetch(File file, off_t offset, int amount); extern int FileRead(File file, char *buffer, int amount); extern int FileWrite(File file, char *buffer, int amount); *************** *** 113,118 **** extern int pg_fsync_no_writethrough(int fd); --- 114,120 ---- extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); + extern int pg_fadvise(int fd, off_t offset, off_t amount, int advise); extern void fsync_fname(char *fname, bool isdir); /* Filename components for OpenTemporaryFile */ *** a/src/include/storage/smgr.h --- b/src/include/storage/smgr.h *************** *** 17,23 **** #include "fmgr.h" #include "storage/block.h" #include "storage/relfilenode.h" ! /* * smgr.c maintains a table of SMgrRelation objects, which are essentially --- 17,23 ---- #include "fmgr.h" #include "storage/block.h" #include "storage/relfilenode.h" ! 
#include "storage/fd.h" /* * smgr.c maintains a table of SMgrRelation objects, which are essentially *************** *** 76,81 **** typedef SMgrRelationData *SMgrRelation; --- 76,96 ---- #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) + /* If you want to get more detail about MdfdVec, you read src/storage/smgr/md.c */ + typedef struct _MdfdVec + { + File mdfd_vfd; /* fd number in fd.c's pool */ + BlockNumber mdfd_segno; /* segment number, from 0 */ + struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ + } MdfdVec; + + typedef enum /* behavior for mdopen & _mdfd_getseg */ + { + EXTENSION_FAIL, /* ereport if segment not present */ + EXTENSION_RETURN_NULL, /* return NULL if not present */ + EXTENSION_CREATE /* create new segments as needed */ + } ExtensionBehavior; + extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); *************** *** 128,133 **** extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); --- 143,150 ---- extern void mdpreckpt(void); extern void mdsync(void); extern void mdpostckpt(void); + extern MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, + BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior); extern void SetForwardFsyncRequests(void); extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, *** a/src/include/utils/guc_tables.h --- b/src/include/utils/guc_tables.h *************** *** 63,68 **** enum config_group --- 63,69 ---- RESOURCES_KERNEL, RESOURCES_VACUUM_DELAY, RESOURCES_BGWRITER, + RESOURCES_CHECKPOINTER, RESOURCES_ASYNCHRONOUS, WAL, WAL_SETTINGS,
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers