From 9c8528913575edd9dd8a095e9cd7dd648fed0c5f Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilip.kumar@enterprisedb.com>
Date: Thu, 12 Oct 2023 16:04:14 +0530
Subject: [PATCH v2 3/3] Introduce bank-wise LRU counter

Since the buffer pool is already divided into banks and the victim
buffer search is also done at the bank level, there is no need for a
centralized LRU counter.  Using per-bank counters also improves
performance: no single shared variable is updated on every access,
which reduces frequent CPU cache invalidation.

Dilip Kumar, based on a design idea from Robert Haas
---
 src/backend/access/transam/slru.c | 23 +++++++++++++++--------
 src/include/access/slru.h         | 28 +++++++++++++++++-----------
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index d0931308f8..318d9ea3fa 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -110,13 +110,13 @@ typedef struct SlruWriteAllData *SlruWriteAll;
  *
  * The reason for the if-test is that there are often many consecutive
  * accesses to the same page (particularly the latest page).  By suppressing
- * useless increments of cur_lru_count, we reduce the probability that old
+ * useless increments of bank_cur_lru_count, we reduce the probability that old
  * pages' counts will "wrap around" and make them appear recently used.
  *
  * We allow this code to be executed concurrently by multiple processes within
  * SimpleLruReadPage_ReadOnly().  As long as int reads and writes are atomic,
  * this should not cause any completely-bogus values to enter the computation.
- * However, it is possible for either cur_lru_count or individual
+ * However, it is possible for either bank_cur_lru_count or individual
  * page_lru_count entries to be "reset" to lower values than they should have,
  * in case a process is delayed while it executes this macro.  With care in
  * SlruSelectLRUPage(), this does little harm, and in any case the absolute
@@ -125,9 +125,10 @@ typedef struct SlruWriteAllData *SlruWriteAll;
  */
 #define SlruRecentlyUsed(shared, slotno)	\
 	do { \
-		int		new_lru_count = (shared)->cur_lru_count; \
+		int		slrubankno = (slotno) / SLRU_BANK_SIZE; \
+		int		new_lru_count = (shared)->bank_cur_lru_count[slrubankno]; \
 		if (new_lru_count != (shared)->page_lru_count[slotno]) { \
-			(shared)->cur_lru_count = ++new_lru_count; \
+			(shared)->bank_cur_lru_count[slrubankno] = ++new_lru_count; \
 			(shared)->page_lru_count[slotno] = new_lru_count; \
 		} \
 	} while (0)
@@ -200,6 +201,7 @@ SimpleLruShmemSize(int nslots, int nlsns)
 	sz += MAXALIGN(nslots * sizeof(int));	/* page_lru_count[] */
 	sz += MAXALIGN(nslots * sizeof(LWLockPadded));	/* buffer_locks[] */
 	sz += MAXALIGN((bankmask + 1) * sizeof(LWLockPadded));	/* bank_locks[] */
+	sz += MAXALIGN((bankmask + 1) * sizeof(int));   /* bank_cur_lru_count[] */
 
 	if (nlsns > 0)
 		sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));	/* group_lsn[] */
@@ -277,8 +279,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		shared->num_slots = nslots;
 		shared->lsn_groups_per_page = nlsns;
 
-		shared->cur_lru_count = 0;
-
 		/* shared->latest_page_number will be set later */
 
 		shared->slru_stats_idx = pgstat_get_slru_index(name);
@@ -301,6 +301,8 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		offset += MAXALIGN(nslots * sizeof(LWLockPadded));
 		shared->bank_locks = (LWLockPadded *) (ptr + offset);
 		offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
+		shared->bank_cur_lru_count = (int *) (ptr + offset);
+		offset += MAXALIGN(nbanks * sizeof(int));
 
 		if (nlsns > 0)
 		{
@@ -322,8 +324,11 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		}
 		/* Initialize bank locks for each buffer bank. */
 		for (bankno = 0; bankno < nbanks; bankno++)
+		{
 			LWLockInitialize(&shared->bank_locks[bankno].lock,
 							 bank_tranche_id);
+			shared->bank_cur_lru_count[bankno] = 0;
+		}
 
 		/* Should fit to estimated shmem size */
 		Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
@@ -1113,9 +1118,11 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		int			best_invalid_page_number = 0;	/* keep compiler quiet */
 
 		/* See if page already has a buffer assigned */
-		int			bankstart = (pageno & ctl->bank_mask) * SLRU_BANK_SIZE;
+		int			bankno = pageno & ctl->bank_mask;
+		int			bankstart = bankno * SLRU_BANK_SIZE;
 		int			bankend = bankstart + SLRU_BANK_SIZE;
 
+
 		for (slotno = bankstart; slotno < bankend; slotno++)
 		{
 			if (shared->page_number[slotno] == pageno &&
@@ -1150,7 +1157,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		 * That gets us back on the path to having good data when there are
 		 * multiple pages with the same lru_count.
 		 */
-		cur_count = (shared->cur_lru_count)++;
+		cur_count = (shared->bank_cur_lru_count[bankno])++;
 		for (slotno = bankstart; slotno < bankend; slotno++)
 		{
 			int			this_delta;
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 8844853a57..9be6d26d78 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -73,6 +73,23 @@ typedef struct SlruSharedData
 	 */
 	LWLockPadded *bank_locks;
 
+	/*----------
+	 * Instead of a global counter we maintain a bank-wise LRU counter,
+	 * because a) victim buffer selection is done at the bank level, so
+	 * there is no point in having a global counter, and b) updating a
+	 * global counter causes frequent CPU cache invalidation, which hurts
+	 * performance.
+	 *
+	 * We mark a page "most recently used" by setting
+	 *		page_lru_count[slotno] = ++bank_cur_lru_count[bankno];
+	 * The oldest page is therefore the one with the highest value of
+	 *		bank_cur_lru_count[bankno] - page_lru_count[slotno]
+	 * The counts will eventually wrap around, but this calculation still
+	 * works as long as no page's age exceeds INT_MAX counts.
+	 *----------
+	 */
+	int			 *bank_cur_lru_count;
+
 	/*
 	 * Optional array of WAL flush LSNs associated with entries in the SLRU
 	 * pages.  If not zero/NULL, we must flush WAL before writing pages (true
@@ -84,17 +101,6 @@ typedef struct SlruSharedData
 	XLogRecPtr *group_lsn;
 	int			lsn_groups_per_page;
 
-	/*----------
-	 * We mark a page "most recently used" by setting
-	 *		page_lru_count[slotno] = ++cur_lru_count;
-	 * The oldest page is therefore the one with the highest value of
-	 *		cur_lru_count - page_lru_count[slotno]
-	 * The counts will eventually wrap around, but this calculation still
-	 * works as long as no page's age exceeds INT_MAX counts.
-	 *----------
-	 */
-	int			cur_lru_count;
-
 	/*
 	 * latest_page_number is the page number of the current end of the log;
 	 * this is not critical data, since we use it only to avoid swapping out
-- 
2.39.2 (Apple Git-143)

