Hi,

I have created a patch that drops duplicate buffers from the OS file cache
using a usage_count algorithm. I have been developing this patch since last
summer. This feature seems to be a hot topic of discussion, so I am submitting
it sooner than I had planned.

When pages have a high usage_count in shared_buffers, they are hard to evict
from shared_buffers. However, these pages are not needed in the OS file cache,
because postgres does not read them from there (postgres accesses
shared_buffers instead).
So I created an algorithm that drops the file cache copy of pages that have a
high usage_count in shared_buffers and are in a clean state in the OS. If the
file cache copy is clean in the OS, executing posix_fadvise() with
POSIX_FADV_DONTNEED simply frees it from the file cache without writing to the
physical disk. This algorithm addresses the double-buffering problem and lets
memory be used more efficiently.

I am running the DBT-2 benchmark now...

Regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1107,1112 **** include 'filename'
--- 1107,1130 ----
        </listitem>
       </varlistentry>
  
+      <varlistentry id="guc-drop-duplicate-buffers" xreflabel="drop_duplicate_buffers">
+       <term><varname>drop_duplicate_buffers</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>drop_duplicate_buffers</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Sets the minimum usage count a page in shared_buffers must reach before
+         its duplicate copy is dropped from the OS file cache. This parameter is
+         meant to be used with a large shared_buffers (about 50% or higher) and a
+         value of 4 or 5. Set it carefully: with a small shared_buffers,
+         transaction performance will decline. This parameter relies on the
+         posix_fadvise() system call and has no effect on systems without it.
+        </para>
+ 
+       </listitem>
+      </varlistentry>
+ 
       <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
        <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
        <indexterm>
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 69,74 ****
--- 69,75 ----
  /* GUC variables */
  bool		zero_damaged_pages = false;
  int			bgwriter_lru_maxpages = 100;
+ int		DropDuplicateBuffers = -1;
  double		bgwriter_lru_multiplier = 2.0;
  bool		track_io_timing = false;
  
***************
*** 111,116 **** static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 112,118 ----
  static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
  static void AtProcExit_Buffers(int code, Datum arg);
  static int	rnode_comparator(const void *p1, const void *p2);
+ static void DropDuplicateOSCache(volatile BufferDesc *bufHdr);
  
  
  /*
***************
*** 835,841 **** BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
  		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
--- 837,843 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT | BM_FADVED);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
  		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
***************
*** 1101,1107 **** PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
--- 1103,1120 ----
  		if (strategy == NULL)
  		{
  			if (buf->usage_count < BM_MAX_USAGE_COUNT)
+ 			{
  				buf->usage_count++;
+ 
+ 				/*
+ 				 * If the buffer is clean, its duplicate copy in the OS file
+ 				 * cache can be dropped without any physical disk write,
+ 				 * which makes more efficient use of memory overall. This
+ 				 * needs to be done only once per buffer.
+ 				 */
+ 				if (!(buf->flags & BM_FADVED) && !(buf->flags & BM_JUST_DIRTIED))
+ 					DropDuplicateOSCache(buf);
+ 			}
  		}
  		else
  		{
***************
*** 1953,1958 **** FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
--- 1966,1974 ----
  	 */
  	recptr = BufferGetLSN(buf);
  
+ 	/* this page is being written, so its OS cache copy is not clean: set BM_FADVED to skip the drop */
+ 	buf->flags |= BM_FADVED;
+ 
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
  	buf->flags &= ~BM_JUST_DIRTIED;
  	UnlockBufHdr(buf);
***************
*** 2101,2106 **** BufferGetLSNAtomic(Buffer buffer)
--- 2117,2144 ----
  	return lsn;
  }
  
+ /*
+  * Best-effort: once usage_count reaches drop_duplicate_buffers (-1 disables),
+  * advise the OS to drop its cached copy of this page, then set BM_FADVED.
+  */
+ static void
+ DropDuplicateOSCache(volatile BufferDesc *bufHdr)
+ {
+ 	SMgrRelation    reln;
+ 	MdfdVec         *v;
+ 	off_t           seekpos;
+ 
+ 	if (DropDuplicateBuffers == -1 || bufHdr->usage_count < DropDuplicateBuffers)
+ 		return;					/* disabled, or usage_count below the threshold */
+ 	reln = smgropen(bufHdr->tag.rnode, InvalidBackendId);
+ 	v = _mdfd_getseg(reln, bufHdr->tag.forkNum, bufHdr->tag.blockNum, false, EXTENSION_RETURN_NULL);
+ 	if (v == NULL)
+ 		return;					/* segment not present: only a hint, so skip it */
+ 	seekpos = (off_t) BLCKSZ * (bufHdr->tag.blockNum % ((BlockNumber) RELSEG_SIZE));
+ 	(void) FileCacheAdvise(v->mdfd_vfd, seekpos, (off_t) BLCKSZ, POSIX_FADV_DONTNEED);
+ 	bufHdr->flags |= BM_FADVED;
+ }
+ 
  /* ---------------------------------------------------------------------
   *		DropRelFileNodeBuffers
   *
***************
*** 2414,2419 **** FlushRelationBuffers(Relation rel)
--- 2452,2459 ----
  				error_context_stack = &errcallback;
  
  				PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+ 				
+ 				bufHdr->flags |= BM_FADVED;
  
  				smgrwrite(rel->rd_smgr,
  						  bufHdr->tag.forkNum,
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
***************
*** 204,209 **** LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
--- 204,211 ----
  
  		PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
  
+ 		bufHdr->flags |= BM_FADVED;
+ 
  		/* And write... */
  		smgrwrite(oreln,
  				  bufHdr->tag.forkNum,
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 383,388 **** pg_flush_data(int fd, off_t offset, off_t amount)
--- 383,403 ----
  	return 0;
  }
  
+ /*
+  * pg_fadvise --- pass a cache-usage hint for a file range to the OS
+  *
+  * Forwards to posix_fadvise() where it is compiled in; otherwise returns 0.
+  */
+ int
+ pg_fadvise(int fd, off_t offset, off_t amount, int advise)
+ {
+ 	int			rc = 0;
+ 
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ 	rc = posix_fadvise(fd, offset, amount, advise);
+ #endif
+ 	return rc;
+ }
  
  /*
   * fsync_fname -- fsync a file or directory, handling errors properly
***************
*** 1142,1147 **** OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
--- 1157,1171 ----
  }
  
  /*
+  * Pass a cache hint to pg_fadvise(); NOTE(review): uses VfdCache[file].fd as-is, confirm it is open
+  */
+ int
+ FileCacheAdvise(File file, off_t offset, off_t amount, int advise)
+ {
+ 	return pg_fadvise(VfdCache[file].fd, offset, amount, advise);
+ }
+ 
+ /*
   * close a file when done with it
   */
  void
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 109,121 ****
   *	All MdfdVec objects are palloc'd in the MdCxt memory context.
   */
  
- typedef struct _MdfdVec
- {
- 	File		mdfd_vfd;		/* fd number in fd.c's pool */
- 	BlockNumber mdfd_segno;		/* segment number, from 0 */
- 	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
- } MdfdVec;
- 
  static MemoryContext MdCxt;		/* context for all md.c allocations */
  
  
--- 109,114 ----
***************
*** 163,175 **** static CycleCtr mdsync_cycle_ctr = 0;
  static CycleCtr mdckpt_cycle_ctr = 0;
  
  
- typedef enum					/* behavior for mdopen & _mdfd_getseg */
- {
- 	EXTENSION_FAIL,				/* ereport if segment not present */
- 	EXTENSION_RETURN_NULL,		/* return NULL if not present */
- 	EXTENSION_CREATE			/* create new segments as needed */
- } ExtensionBehavior;
- 
  /* local routines */
  static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
  			 bool isRedo);
--- 156,161 ----
***************
*** 183,190 **** static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
  			  BlockNumber segno);
  static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
  			  BlockNumber segno, int oflags);
- static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
- 			 BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
  static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
  		   MdfdVec *seg);
  
--- 169,174 ----
***************
*** 1701,1707 **** _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
   * segment, according to "behavior".  Note: skipFsync is only used in the
   * EXTENSION_CREATE case.
   */
! static MdfdVec *
  _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
  			 bool skipFsync, ExtensionBehavior behavior)
  {
--- 1685,1691 ----
   * segment, according to "behavior".  Note: skipFsync is only used in the
   * EXTENSION_CREATE case.
   */
! extern MdfdVec *
  _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
  			 bool skipFsync, ExtensionBehavior behavior)
  {
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 61,66 ****
--- 61,67 ----
  #include "replication/walreceiver.h"
  #include "replication/walsender.h"
  #include "storage/bufmgr.h"
+ #include "storage/buf_internals.h"
  #include "storage/dsm_impl.h"
  #include "storage/standby.h"
  #include "storage/fd.h"
***************
*** 2056,2061 **** static struct config_int ConfigureNamesInt[] =
--- 2057,2072 ----
  	},
  
  	{
+ 		{"drop_duplicate_buffers", PGC_SIGHUP, RESOURCES_CHECKPOINTER,
+ 			gettext_noop("drop duplicate buffers in OS file cache"),
+ 			NULL
+ 		},
+ 		&DropDuplicateBuffers,
+ 		-1, -1, BM_MAX_USAGE_COUNT,
+ 		NULL, NULL, NULL
+ 	},
+ 
+ 	{
  		{"wal_buffers", PGC_POSTMASTER, WAL_SETTINGS,
  			gettext_noop("Sets the number of disk-page buffers in shared memory for WAL."),
  			NULL,
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 115,120 ****
--- 115,122 ----
  
  #shared_buffers = 32MB			# min 128kB
  					# (change requires restart)
+ #drop_duplicate_buffers = -1            # target min usage_count in shared_buffers
+ 					# (-1 disables)
  #temp_buffers = 8MB			# min 800kB
  #max_prepared_transactions = 0		# zero disables the feature
  					# (change requires restart)
*** a/src/include/postmaster/bgwriter.h
--- b/src/include/postmaster/bgwriter.h
***************
*** 23,28 ****
--- 23,29 ----
  extern int	BgWriterDelay;
  extern int	CheckPointTimeout;
  extern int	CheckPointWarning;
+ extern int	DropDuplicateBuffers;
  extern double CheckPointCompletionTarget;
  
  extern void BackgroundWriterMain(void) __attribute__((noreturn));
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 40,45 ****
--- 40,46 ----
  #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
  #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
+ #define BM_FADVED		(1 << 9)			/* remove duplicate file cache flag */
  
  typedef bits16 BufFlags;
  
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 68,73 **** extern int	max_safe_fds;
--- 68,74 ----
  extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
  extern File OpenTemporaryFile(bool interXact);
  extern void FileClose(File file);
+ extern int	FileCacheAdvise(File file, off_t offset, off_t amount, int advise);
  extern int	FilePrefetch(File file, off_t offset, int amount);
  extern int	FileRead(File file, char *buffer, int amount);
  extern int	FileWrite(File file, char *buffer, int amount);
***************
*** 113,118 **** extern int	pg_fsync_no_writethrough(int fd);
--- 114,120 ----
  extern int	pg_fsync_writethrough(int fd);
  extern int	pg_fdatasync(int fd);
  extern int	pg_flush_data(int fd, off_t offset, off_t amount);
+ extern int	pg_fadvise(int fd, off_t offset, off_t amount, int advise);
  extern void fsync_fname(char *fname, bool isdir);
  
  /* Filename components for OpenTemporaryFile */
*** a/src/include/storage/smgr.h
--- b/src/include/storage/smgr.h
***************
*** 17,23 ****
  #include "fmgr.h"
  #include "storage/block.h"
  #include "storage/relfilenode.h"
! 
  
  /*
   * smgr.c maintains a table of SMgrRelation objects, which are essentially
--- 17,23 ----
  #include "fmgr.h"
  #include "storage/block.h"
  #include "storage/relfilenode.h"
! #include "storage/fd.h"
  
  /*
   * smgr.c maintains a table of SMgrRelation objects, which are essentially
***************
*** 76,81 **** typedef SMgrRelationData *SMgrRelation;
--- 76,96 ----
  #define SmgrIsTemp(smgr) \
  	RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
  
+ /* For more detail about MdfdVec, see src/backend/storage/smgr/md.c */
+ typedef struct _MdfdVec
+ {
+ 	File		mdfd_vfd;	/* fd number in fd.c's pool */
+ 	BlockNumber mdfd_segno;		/* segment number, from 0 */
+ 	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
+ } MdfdVec;
+ 
+ typedef enum		/* what mdopen & _mdfd_getseg do if a segment is not present */
+ {
+ 	EXTENSION_FAIL,			/* ereport if segment not present */
+ 	EXTENSION_RETURN_NULL,		/* return NULL if not present */
+ 	EXTENSION_CREATE		/* create new segments as needed */
+ } ExtensionBehavior;
+ 
  extern void smgrinit(void);
  extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
  extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
***************
*** 128,133 **** extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
--- 143,150 ----
  extern void mdpreckpt(void);
  extern void mdsync(void);
  extern void mdpostckpt(void);
+ extern MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
+ 		BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
  
  extern void SetForwardFsyncRequests(void);
  extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
*** a/src/include/utils/guc_tables.h
--- b/src/include/utils/guc_tables.h
***************
*** 63,68 **** enum config_group
--- 63,69 ----
  	RESOURCES_KERNEL,
  	RESOURCES_VACUUM_DELAY,
  	RESOURCES_BGWRITER,
+ 	RESOURCES_CHECKPOINTER,
  	RESOURCES_ASYNCHRONOUS,
  	WAL,
  	WAL_SETTINGS,
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to