From e7239148d6dab4482e0f97958c077102295c31c7 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 21 Feb 2025 11:17:28 +0100
Subject: [PATCH v3 2/3] Extend pg_buffercache with new view
 pg_buffercache_numa to show NUMA zone

---
 contrib/pg_buffercache/Makefile               |   3 +-
 contrib/pg_buffercache/meson.build            |   1 +
 .../pg_buffercache--1.5--1.6.sql              |  30 +++++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 113 +++++++++++++++++-
 src/backend/utils/misc/guc_tables.c           |   2 +-
 src/include/storage/pg_shmem.h                |   1 +
 7 files changed, 146 insertions(+), 6 deletions(-)
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index eae65ead9e5..2a33602537e 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -8,7 +8,8 @@ OBJS = \
 EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
-	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql
+	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
+	pg_buffercache--1.5--1.6.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 12d1fe48717..9b2e9393410 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -23,6 +23,7 @@ install_data(
   'pg_buffercache--1.2.sql',
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
+  'pg_buffercache--1.5--1.6.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
new file mode 100644
index 00000000000..e5b3d1f7dd2
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -0,0 +1,37 @@
+/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit
+
+-- Replace the zero-argument function with one taking a "query NUMA" flag.
+-- CASCADE also drops the pg_buffercache view, which is recreated below.
+DROP FUNCTION pg_buffercache_pages() CASCADE;
+
+CREATE OR REPLACE FUNCTION pg_buffercache_pages(boolean)
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE OR REPLACE VIEW pg_buffercache AS
+	SELECT P.* FROM pg_buffercache_pages(false) AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4);
+
+-- Same columns, plus the NUMA zone id of each buffer.
+CREATE OR REPLACE VIEW pg_buffercache_numa AS
+	SELECT P.* FROM pg_buffercache_pages(true) AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4, numa_zone_id int4);
+
+-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_pages(boolean) FROM PUBLIC;
+REVOKE ALL ON pg_buffercache FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+-- Recreated objects start without grants; keep the documented pg_monitor access.
+GRANT EXECUTE ON FUNCTION pg_buffercache_pages(boolean) TO pg_monitor;
+GRANT SELECT ON pg_buffercache TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 5ee875f77dd..b030ba3a6fa 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 3ae0a018e10..a5aab07fc99 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -6,6 +6,7 @@
  *	  contrib/pg_buffercache/pg_buffercache_pages.c
  *-------------------------------------------------------------------------
  */
+#include "pg_config.h"
 #include "postgres.h"
 
 #include "access/htup_details.h"
@@ -13,10 +14,16 @@
 #include "funcapi.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/pg_shmem.h"
+#include <unistd.h>
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 
@@ -43,6 +50,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int32 		numa_zone_id;
 } BufferCachePagesRec;
 
 
@@ -65,6 +73,15 @@ PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict);
 
+/* Mark the first n records as carrying no NUMA data (numa_zone_id = -1). */
+static void
+pg_buffercache_mark_numa_invalid(BufferCachePagesContext *fctx, int n)
+{
+	/* Walk backwards; order is irrelevant since every slot gets -1. */
+	while (n-- > 0)
+		fctx->record[n].numa_zone_id = -1;
+}
+
 Datum
 pg_buffercache_pages(PG_FUNCTION_ARGS)
 {
@@ -75,10 +92,16 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 	TupleDesc	tupledesc;
 	TupleDesc	expected_tupledesc;
 	HeapTuple	tuple;
+	bool		query_numa = PG_NARGS() > 0 && PG_GETARG_BOOL(0);	/* no arg before 1.6 */
 
 	if (SRF_IS_FIRSTCALL())
 	{
-		int			i;
+		int			i, blk2page, j;
+		Size		os_page_size;
+		void		**os_page_ptrs;
+		int			*os_pages_status;
+		int			os_page_count;
+		float		pages_per_blk;
 
 		funcctx = SRF_FIRSTCALL_INIT();
 
@@ -122,10 +145,14 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
 						   INT2OID, -1, 0);
 
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM-1)
 			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 							   INT4OID, -1, 0);
 
+		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+			TupleDescInitEntry(tupledesc, (AttrNumber) 10, "numa_zone_id",
+							   INT4OID, -1, 0);
+
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
 
 		/* Allocate NBuffers worth of BufferCachePagesRec records. */
@@ -140,6 +167,30 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		/* Return to original context when allocating transient memory */
 		MemoryContextSwitchTo(oldcontext);
 
+		/*
+		 * Prepare for the NUMA inquiry.  A DB block (4kB .. 32kB) may be
+		 * smaller or larger than an OS memory page (regular or huge), so
+		 * find the OS page size first; move_pages() works per OS page.
+		 */
+		os_page_size = sysconf(_SC_PAGESIZE);
+		if (huge_pages_status == HUGE_PAGES_ON)
+			GetHugePageSize(&os_page_size, NULL);
+
+		/* Widen before multiplying (int overflows at 2GB) and round up. */
+		os_page_count = ((Size) NBuffers * BLCKSZ + os_page_size - 1) / os_page_size;
+		pages_per_blk = (float) BLCKSZ / os_page_size;
+
+		elog(DEBUG1, "os_page_count=%d os_page_size=%zu pages_per_blk=%f",
+			 os_page_count, os_page_size, pages_per_blk);
+
+		os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
+		os_pages_status = palloc(sizeof(int) * os_page_count);
+		/*
+		 * Pre-fill with 0xff: if that pattern survives the kernel inquiry,
+		 * our buffer-to-OS-page mapping code probably has a bug.
+		 */
+		memset(os_pages_status, 0xff, sizeof(int) * os_page_count);
+
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
 		 * fctx->record structure.
@@ -177,8 +228,61 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			else
 				fctx->record[i].isvalid = false;
 
+#ifdef USE_LIBNUMA
+/* FIXME: taken from bufmgr.c, maybe move to .h ? */
+#define BufHdrGetBlock(bufHdr)        ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+			blk2page = (int) i * pages_per_blk;
+			j = 0;
+			do {
+				/*
+				 * Many buffers can point to the same page, but we want to
+				 * query just first address.
+				 *
+				 * In order to get reliable results we also need to touch memory pages
+				 * so that inquiry about NUMA zone doesn't return -2.
+				 */
+				if(os_page_ptrs[blk2page+j] == 0) {
+					volatile uint64 touch pg_attribute_unused();
+					os_page_ptrs[blk2page+j] = (char *)BufHdrGetBlock(bufHdr) + (os_page_size*j);
+					touch = *(uint64 *)os_page_ptrs[blk2page+j];
+				}
+				j++;
+			} while(j < (int)pages_per_blk);
+#endif
+
 			UnlockBufHdr(bufHdr, buf_state);
 		}
+
+#ifdef USE_LIBNUMA
+		if (!query_numa)
+			pg_buffercache_mark_numa_invalid(fctx, NBuffers);
+		else if (numa_available() == -1)
+		{
+			/* numa(3) requires this availability check before other calls. */
+			pg_buffercache_mark_numa_invalid(fctx, NBuffers);
+			elog(NOTICE, "libnuma initialization failed, some NUMA data might be unavailable.");
+		}
+		else
+		{
+			/* One inquiry for all pages amortizes the system call cost. */
+			if (numa_move_pages(0, os_page_count, os_page_ptrs, NULL, os_pages_status, 0) == -1)
+				elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+			/*
+			 * Report the node of the first OS page backing each buffer.
+			 * Negative values are per-page errors, passed through as is.
+			 *
+			 * XXX: a block spanning several NUMA zones is not reported.
+			 */
+			for (i = 0; i < NBuffers; i++)
+				fctx->record[i].numa_zone_id = os_pages_status[(int) (i * pages_per_blk)];
+		}
+#else
+		pg_buffercache_mark_numa_invalid(fctx, NBuffers);
+#endif
+
+		pfree(os_page_ptrs);
+		pfree(os_pages_status);
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
@@ -211,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			nulls[7] = true;
 			/* unused for v1.0 callers, but the array is always long enough */
 			nulls[8] = true;
+			nulls[9] = true;
 		}
 		else
 		{
@@ -231,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			/* unused for v1.0 callers, but the array is always long enough */
 			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
+			values[9] = Int32GetDatum(fctx->record[i].numa_zone_id);
+			nulls[9] = false;
 		}
 
 		/* Build and return the tuple. */
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 03a6dd49154..172309d389a 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -562,7 +562,7 @@ static int	ssl_renegotiation_limit;
  */
 int			huge_pages = HUGE_PAGES_TRY;
 int			huge_page_size;
-static int	huge_pages_status = HUGE_PAGES_UNKNOWN;
+int			huge_pages_status = HUGE_PAGES_UNKNOWN;
 
 /*
  * These variables are all dummies that don't do anything, except in some
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index b99ebc9e86f..5f7d4b83a60 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -45,6 +45,7 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
 extern PGDLLIMPORT int huge_page_size;
+extern PGDLLIMPORT int huge_pages_status;
 
 /* Possible values for huge_pages and huge_pages_status */
 typedef enum
-- 
2.39.5

