Add os_page_num to pg_buffercache

Bertrand Drouvot Thu, 10 Apr 2025 06:18:15 -0700

Hi hackers,

I was doing some more tests around ba2a3c2302f (pg_buffercache_numa) and
thought that seeing how buffers are spread across multiple OS pages (if that's
the case) thanks to the os_page_num field is good information to have.


The thing that I think is annoying is that to get this information 
(os_page_num):

- One needs to use pg_buffercache_numa (which is more costly/slower) than 
pg_buffercache
- One needs a system with NUMA support enabled

So why not also add this information (os_page_num) in pg_buffercache?

- It would make this information available on all systems, not just 
NUMA-enabled ones
- It would help understand the memory layout implications of configuration 
changes
such as database block size, OS page size (huge pages for example) and see how 
the
buffers are spread across OS pages (if that's the case).

So, please find attached a patch to $SUBJECT then.

Remarks:

- Maybe we could create a helper function to reduce the code duplication between
pg_buffercache_pages() and pg_buffercache_numa_pages()
- I think it would have made sense to also add this information while working
on ba2a3c2302f but (unfortunately) I doubt that this patch is candidate for v18
post freeze (it looks more a feature enhancement than anything else)
- It's currently doing the changes in pg_buffercache v1.6 but will need to
create v1.7 for 19 (if the above stands true)

Looking forward to your feedback,

Regards,

-- 
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com

>From 1d3516efd6ee1c18a82f56b87c5afc3d54fb62cf Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot...@gmail.com>
Date: Thu, 10 Apr 2025 05:49:45 +0000
Subject: [PATCH v1] Add os_page_num to pg_buffercache

ba2a3c2302f added a way to check if a buffer is spread across multiple pages.

Adding the same information in pg_buffercache so that one does not need NUMA
support enabled to get this information.
---
 .../expected/pg_buffercache.out               |   2 +-
 .../pg_buffercache--1.5--1.6.sql              |   7 ++
 contrib/pg_buffercache/pg_buffercache_pages.c | 118 ++++++++++++++----
 contrib/pg_buffercache/sql/pg_buffercache.sql |   2 +-
 doc/src/sgml/pgbuffercache.sgml               |  34 +++++
 5 files changed, 135 insertions(+), 28 deletions(-)

diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1..52e38dcc027 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -1,5 +1,5 @@
 CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
                    from pg_settings
                    where name = 'shared_buffers')
 from pg_buffercache;
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
index 458f054a691..8aa81d15688 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -44,3 +44,10 @@ CREATE FUNCTION pg_buffercache_evict_all(
     OUT buffers_skipped int4)
 AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all'
 LANGUAGE C PARALLEL SAFE VOLATILE;
+
+-- Upgrade view to 1.6. format
+CREATE OR REPLACE VIEW pg_buffercache AS
+    SELECT P.* FROM pg_buffercache_pages() AS P
+    (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+     relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+     pinning_backends int4, os_page_num bigint);
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index e1701bd56ef..b941aba2b7e 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -19,7 +19,7 @@
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM 10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 #define NUM_BUFFERCACHE_EVICT_ELEM 2
@@ -54,6 +54,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int64		page_num;
 } BufferCachePagesRec;
 
 
@@ -63,6 +64,9 @@ typedef struct
 typedef struct
 {
 	TupleDesc	tupdesc;
+	int			buffers_per_page;
+	int			pages_per_buffer;
+	int			os_page_size;
 	BufferCachePagesRec *record;
 } BufferCachePagesContext;
 
@@ -119,8 +123,25 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 
 	if (SRF_IS_FIRSTCALL())
 	{
-		int			i;
+		int			i,
+					idx;
+		Size		os_page_size;
+		char		*startptr;
+		int			pages_per_buffer;
+		int			max_entries;
 
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
+		 * the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to: 1. Determine the OS memory
+		 * page size 2. Calculate how many OS pages are used by all buffer blocks
+		 * 3. Calculate how many OS pages are contained within each database
+		 * block.
+		 */
+		os_page_size = pg_get_shmem_pagesize();
+
+		/* Initialize the multi-call context, load entries about buffers */
 		funcctx = SRF_FIRSTCALL_INIT();
 
 		/* Switch context when allocating stuff to be used in later calls */
@@ -163,24 +184,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
 						   INT2OID, -1, 0);
 
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1))
 			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 							   INT4OID, -1, 0);
 
+		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+			TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num",
+							   INT8OID, -1, 0);
+
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
 
-		/* Allocate NBuffers worth of BufferCachePagesRec records. */
+		/*
+		 * Each buffer needs at least one entry, but it might be offset in
+		 * some way, and use one extra entry. So we allocate space for the
+		 * maximum number of entries we might need, and then count the exact
+		 * number as we're walking buffers. That way we can do it in one pass,
+		 * without reallocating memory.
+		 */
+		pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1;
+		max_entries = NBuffers * pages_per_buffer;
+
+		/* Allocate entries for BufferCachePagesRec records. */
 		fctx->record = (BufferCachePagesRec *)
 			MemoryContextAllocHuge(CurrentMemoryContext,
-								   sizeof(BufferCachePagesRec) * NBuffers);
-
-		/* Set max calls and remember the user function context. */
-		funcctx->max_calls = NBuffers;
-		funcctx->user_fctx = fctx;
+								   sizeof(BufferCachePagesRec) * max_entries);
 
 		/* Return to original context when allocating transient memory */
 		MemoryContextSwitchTo(oldcontext);
 
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1));
+		idx = 0;
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
 		 * fctx->record structure.
@@ -191,35 +224,65 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < NBuffers; i++)
 		{
+			char       *buffptr = (char *) BufferGetBlock(i + 1);
 			BufferDesc *bufHdr;
 			uint32		buf_state;
+			int32		page_num;
+			char		*startptr_buff,
+						*endptr_buff;
 
 			bufHdr = GetBufferDescriptor(i);
 			/* Lock each buffer header before inspecting. */
 			buf_state = LockBufHdr(bufHdr);
 
-			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
-			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
-			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
-			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
-			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
-			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
-			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+			/* start of the first page of this buffer */
+			startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr);
 
-			if (buf_state & BM_DIRTY)
-				fctx->record[i].isdirty = true;
-			else
-				fctx->record[i].isdirty = false;
+			/* end of the buffer (no need to align to memory page) */
+			endptr_buff = buffptr + BLCKSZ;
 
-			/* Note if the buffer is valid, and has storage created */
-			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
-				fctx->record[i].isvalid = true;
-			else
-				fctx->record[i].isvalid = false;
+			Assert(startptr_buff < endptr_buff);
+
+			/* calculate ID of the first page for this buffer */
+			page_num = (startptr_buff - startptr) / os_page_size;
+
+			/* Add an entry for each OS page overlapping with this buffer. */
+			for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
+			{
+				fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr);
+				fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
+				fctx->record[idx].reltablespace = bufHdr->tag.spcOid;
+				fctx->record[idx].reldatabase = bufHdr->tag.dbOid;
+				fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag);
+				fctx->record[idx].blocknum = bufHdr->tag.blockNum;
+				fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
+				fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);
+
+				if (buf_state & BM_DIRTY)
+					fctx->record[idx].isdirty = true;
+				else
+					fctx->record[idx].isdirty = false;
+
+				/* Note if the buffer is valid, and has storage created */
+				if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
+					fctx->record[idx].isvalid = true;
+				else
+					fctx->record[idx].isvalid = false;
+
+				fctx->record[idx].page_num = page_num;
+				/* advance to the next entry/page */
+				++idx;
+				++page_num;
+			}
 
 			UnlockBufHdr(bufHdr, buf_state);
 		}
+
+		Assert(idx <= max_entries);
+
+		/* Set max calls and remember the user function context. */
+		funcctx->max_calls = idx;
+		funcctx->user_fctx = fctx;
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
@@ -252,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			nulls[7] = true;
 			/* unused for v1.0 callers, but the array is always long enough */
 			nulls[8] = true;
+			nulls[9] = true;
 		}
 		else
 		{
@@ -272,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			/* unused for v1.0 callers, but the array is always long enough */
 			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
+			values[9] = Int64GetDatum(fctx->record[i].page_num);
+			nulls[9] = false;
 		}
 
 		/* Build and return the tuple. */
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c7..78ec7a88ea0 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -1,6 +1,6 @@
 CREATE EXTENSION pg_buffercache;
 
-select count(*) = (select setting::bigint
+select count(*) >= (select setting::bigint
                    from pg_settings
                    where name = 'shared_buffers')
 from pg_buffercache;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 537d6014942..6183adf73db 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -205,6 +205,15 @@
        Number of backends pinning this buffer
       </para></entry>
      </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>os_page_num</structfield> <type>bigint</type>
+      </para>
+      <para>
+       number of OS memory page for this buffer
+      </para></entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
@@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers
  public     | spgist_text_tbl        |     182
 (10 rows)
 
+regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages
+             FROM pg_buffercache b JOIN pg_class c
+             ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                b.reldatabase IN (0, (SELECT oid FROM pg_database
+                                      WHERE datname = current_database()))
+             JOIN pg_namespace n ON n.oid = c.relnamespace
+             JOIN (SELECT bufferid FROM pg_buffercache
+                   GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid
+             GROUP BY n.nspname, c.relname
+             ORDER BY 3 DESC
+             LIMIT 10;
+
+  nspname   |             relname             | buffers_on_multiple_pages
+------------+---------------------------------+---------------------------
+ public     | gin_test_tbl                    |                         4
+ public     | delete_test_table               |                         4
+ public     | tenk1                           |                         4
+ pg_catalog | pg_attribute_relid_attnum_index |                         4
+ pg_catalog | pg_class                        |                         2
+ pg_catalog | pg_depend_depender_index        |                         2
+ pg_catalog | pg_attribute                    |                         2
+ pg_catalog | pg_opfamily                     |                         2
+ pg_catalog | pg_opclass_oid_index            |                         2
+ pg_catalog | pg_description                  |                         2
+(10 rows)
 
 regression=# SELECT * FROM pg_buffercache_summary();
  buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg
-- 
2.34.1

Add os_page_num to pg_buffercache

Reply via email to