Recent results from Robert show clog contention is still an issue.

In various discussions, Tom noted that pages prior to RecentXmin are
read-only and that we might find a way to make use of that fact in
providing different mechanisms or resources.

I've taken that idea and used it to build a second Clog cache, known
as ClogHistory, which allows access to the read-only tail of pages in
the clog. Once a page has been written to for the last time, it will
be accessed via the ClogHistory Slru in preference to the normal Clog
Slru. This separates historical accesses by readers from current write
access by committers. Historical access doesn't force dirty writes,
nor are commits made to wait when historical access occurs.

The patch is very simple because all the writes still continue through
the normal route, so it is suitable for 9.2.

I'm no longer working on the "clog partitioning" patch for this release.

-- 
 Simon Riggs                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 69b6ef3..6ff6894 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -37,6 +37,7 @@
 #include "access/transam.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
+#include "utils/snapmgr.h"
 
 /*
  * Defines for CLOG page sizes.  A page is the same BLCKSZ as is used
@@ -70,10 +71,17 @@
 
 /*
  * Link to shared-memory data structures for CLOG control
+ *
+ * As of 9.2, we have 2 structures for commit log data.
+ * ClogCtl manages the main read/write part of the commit log, while
+ * the ClogHistoryCtl manages the now read-only, older part. ClogHistory
+ * removes contention from the path of transaction commits.
  */
 static SlruCtlData ClogCtlData;
+static SlruCtlData ClogHistoryCtlData;
 
-#define ClogCtl (&ClogCtlData)
+#define ClogCtl 		(&ClogCtlData)
+#define ClogHistoryCtl	(&ClogHistoryCtlData)
 
 
 static int	ZeroCLOGPage(int pageno, bool writeXlog);
@@ -296,6 +304,10 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
 
 		/* ... then the main transaction */
 		TransactionIdSetStatusBit(xid, status, lsn, slotno);
+
+		/* When we commit advance ClogCtl's shared RecentXminPageno if needed */
+		if (ClogCtl->shared->RecentXminPageno < TransactionIdToPage(RecentXmin))
+			ClogCtl->shared->RecentXminPageno = TransactionIdToPage(RecentXmin);
 	}
 
 	/* Set the subtransactions */
@@ -387,6 +399,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
 XidStatus
 TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
 {
+	SlruCtl		clog = ClogCtl;
+	bool		useClogHistory = true;
 	int			pageno = TransactionIdToPage(xid);
 	int			byteno = TransactionIdToByte(xid);
 	int			bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
@@ -397,15 +411,35 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
 
 	/* lock is acquired by SimpleLruReadPage_ReadOnly */
 
-	slotno = SimpleLruReadPage_ReadOnly(ClogCtl, pageno, xid);
-	byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
+	/*
+	 * Decide whether to use main Clog or read-only ClogHistory.
+	 *
+	 * Our knowledge of the boundary between the two may be a little out
+	 * of date, so if we try Clog and can't find it we need to try again
+	 * against ClogHistory.
+	 */
+	if (pageno >= ClogCtl->recent_oldest_active_page_number)
+	{
+		slotno = SimpleLruReadPage_ReadOnly(clog, pageno, xid);
+		if (slotno >= 0)
+			useClogHistory = false;
+	}
+
+	if (useClogHistory)
+	{
+		clog = ClogHistoryCtl;
+		slotno = SimpleLruReadPage_ReadOnly(clog, pageno, xid);
+		Assert(slotno >= 0);
+	}
+
+	byteptr = clog->shared->page_buffer[slotno] + byteno;
 
 	status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
 
 	lsnindex = GetLSNIndex(slotno, xid);
-	*lsn = ClogCtl->shared->group_lsn[lsnindex];
+	*lsn = clog->shared->group_lsn[lsnindex];
 
-	LWLockRelease(CLogControlLock);
+	LWLockRelease(clog->shared->ControlLock);
 
 	return status;
 }
@@ -445,15 +479,19 @@ CLOGShmemBuffers(void)
 Size
 CLOGShmemSize(void)
 {
-	return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+	/* Reserve shmem for both ClogCtl and ClogHistoryCtl */
+	return SimpleLruShmemSize(2 * CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
 }
 
 void
 CLOGShmemInit(void)
 {
 	ClogCtl->PagePrecedes = CLOGPagePrecedes;
+	ClogHistoryCtl->PagePrecedes = CLOGPagePrecedes;
 	SimpleLruInit(ClogCtl, "CLOG Ctl", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
 				  CLogControlLock, "pg_clog");
+	SimpleLruInit(ClogHistoryCtl, "CLOG History Ctl", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+				  CLogHistoryControlLock, "pg_clog");
 }
 
 /*
@@ -592,6 +630,16 @@ CheckPointCLOG(void)
 	TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
 	SimpleLruFlush(ClogCtl, true);
 	TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
+
+	/*
+	 * Now that we've written out all dirty buffers the only pages that
+	 * will get dirty again will be pages with active transactions on them.
+	 * So we can move forward the oldest_active_page_number and allow
+	 * read only operations via ClogHistoryCtl.
+	 */
+	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
+	ClogCtl->shared->oldest_active_page_number = ClogCtl->shared->RecentXminPageno;
+	LWLockRelease(CLogControlLock);
 }
 
 
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 30538ff..2cebdf9 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -188,6 +188,9 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 
 		shared->cur_lru_count = 0;
 
+		shared->oldest_active_page_number = -1;
+		shared->RecentXminPageno = -1;
+
 		/* shared->latest_page_number will be set later */
 
 		ptr = (char *) shared;
@@ -476,6 +479,16 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
 	LWLockRelease(shared->ControlLock);
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
+	/* update local state while we have the lock */
+	ctl->recent_oldest_active_page_number = shared->oldest_active_page_number;
+
+	/* Check if our cached boundary information was out of date */
+	if (pageno < ctl->recent_oldest_active_page_number)
+	{
+		LWLockRelease(shared->ControlLock);
+		return -1;
+	}
+
 	return SimpleLruReadPage(ctl, pageno, true, xid);
 }
 
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 41cd484..f7b0d87 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -99,6 +99,15 @@ typedef struct SlruSharedData
 	 * the latest page.
 	 */
 	int			latest_page_number;
+
+	/*
+	 * RecentXminPageno is the oldest page that any active
+	 * transaction would ever wish to write to.
+	 * oldest_active_page_number is the oldest dirty page, or the
+	 * RecentXminPageno, whichever is lower. We advance oldest at checkpoint.
+	 */
+	int			oldest_active_page_number;
+	int			RecentXminPageno;
 } SlruSharedData;
 
 typedef SlruSharedData *SlruShared;
@@ -125,6 +134,11 @@ typedef struct SlruCtlData
 	bool		(*PagePrecedes) (int, int);
 
 	/*
+	 * Local cached value of oldest_active_page_number.
+	 */
+	int			recent_oldest_active_page_number;
+
+	/*
 	 * Dir is set during SimpleLruInit and does not change thereafter. Since
 	 * it's always the same, it doesn't need to be in shared memory.
 	 */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index df3df29..3d8838f 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -79,6 +79,7 @@ typedef enum LWLockId
 	SerializablePredicateLockListLock,
 	OldSerXidLock,
 	SyncRepLock,
+	CLogHistoryControlLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to