Hi hackers, I was doing some more tests around ba2a3c2302f (pg_buffercache_numa) and thought that seeing how buffers are spread across multiple OS pages (if that's the case), thanks to the os_page_num field, is good information to have.
The thing that I think is annoying is that to get this information (os_page_num): - One needs to use pg_buffercache_numa (which is more costly/slower than pg_buffercache) - One needs a system with NUMA support enabled So why not also add this information (os_page_num) in pg_buffercache? - It would make this information available on all systems, not just NUMA-enabled ones - It would help understand the memory layout implications of configuration changes such as database block size, OS page size (huge pages for example) and see how the buffers are spread across OS pages (if that's the case). So, please find attached a patch to $SUBJECT then. Remarks: - Maybe we could create a helper function to reduce the code duplication between pg_buffercache_pages() and pg_buffercache_numa_pages() - I think it would have made sense to also add this information while working on ba2a3c2302f but (unfortunately) I doubt that this patch is a candidate for v18 post freeze (it looks more like a feature enhancement than anything else) - It's currently doing the changes in pg_buffercache v1.6 but will need to create v1.7 for PostgreSQL 19 (if the above holds true) Looking forward to your feedback, Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
>From 1d3516efd6ee1c18a82f56b87c5afc3d54fb62cf Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <bertranddrouvot...@gmail.com> Date: Thu, 10 Apr 2025 05:49:45 +0000 Subject: [PATCH v1] Add os_page_num to pg_buffercache ba2a3c2302f added a way to check if a buffer is spread across multiple pages. Adding the same information in pg_buffercache so that one does not need NUMA support enabled to get this information. --- .../expected/pg_buffercache.out | 2 +- .../pg_buffercache--1.5--1.6.sql | 7 ++ contrib/pg_buffercache/pg_buffercache_pages.c | 118 ++++++++++++++---- contrib/pg_buffercache/sql/pg_buffercache.sql | 2 +- doc/src/sgml/pgbuffercache.sgml | 34 +++++ 5 files changed, 135 insertions(+), 28 deletions(-) diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 9a9216dc7b1..52e38dcc027 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -1,5 +1,5 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint +select count(*) >= (select setting::bigint from pg_settings where name = 'shared_buffers') from pg_buffercache; diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql index 458f054a691..8aa81d15688 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -44,3 +44,10 @@ CREATE FUNCTION pg_buffercache_evict_all( OUT buffers_skipped int4) AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all' LANGUAGE C PARALLEL SAFE VOLATILE; + +-- Upgrade view to 1.6. 
format +CREATE OR REPLACE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4, os_page_num bigint); diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index e1701bd56ef..b941aba2b7e 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -19,7 +19,7 @@ #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 -#define NUM_BUFFERCACHE_PAGES_ELEM 9 +#define NUM_BUFFERCACHE_PAGES_ELEM 10 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 #define NUM_BUFFERCACHE_EVICT_ELEM 2 @@ -54,6 +54,7 @@ typedef struct * because of bufmgr.c's PrivateRefCount infrastructure. */ int32 pinning_backends; + int64 page_num; } BufferCachePagesRec; @@ -63,6 +64,9 @@ typedef struct typedef struct { TupleDesc tupdesc; + int buffers_per_page; + int pages_per_buffer; + int os_page_size; BufferCachePagesRec *record; } BufferCachePagesContext; @@ -119,8 +123,25 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { - int i; + int i, + idx; + Size os_page_size; + char *startptr; + int pages_per_buffer; + int max_entries; + /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while + * the OS may have different memory page sizes. + * + * To correctly map between them, we need to: 1. Determine the OS memory + * page size 2. Calculate how many OS pages are used by all buffer blocks + * 3. Calculate how many OS pages are contained within each database + * block. 
+ */ + os_page_size = pg_get_shmem_pagesize(); + + /* Initialize the multi-call context, load entries about buffers */ funcctx = SRF_FIRSTCALL_INIT(); /* Switch context when allocating stuff to be used in later calls */ @@ -163,24 +184,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", INT2OID, -1, 0); - if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1)) TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", INT4OID, -1, 0); + if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num", + INT8OID, -1, 0); + fctx->tupdesc = BlessTupleDesc(tupledesc); - /* Allocate NBuffers worth of BufferCachePagesRec records. */ + /* + * Each buffer needs at least one entry, but it might be offset in + * some way, and use one extra entry. So we allocate space for the + * maximum number of entries we might need, and then count the exact + * number as we're walking buffers. That way we can do it in one pass, + * without reallocating memory. + */ + pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1; + max_entries = NBuffers * pages_per_buffer; + + /* Allocate entries for BufferCachePagesRec records. */ fctx->record = (BufferCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCachePagesRec) * NBuffers); - - /* Set max calls and remember the user function context. */ - funcctx->max_calls = NBuffers; - funcctx->user_fctx = fctx; + sizeof(BufferCachePagesRec) * max_entries); /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); + startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1)); + idx = 0; /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. 
@@ -191,35 +224,65 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) */ for (i = 0; i < NBuffers; i++) { + char *buffptr = (char *) BufferGetBlock(i + 1); BufferDesc *bufHdr; uint32 buf_state; + int32 page_num; + char *startptr_buff, + *endptr_buff; bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); - fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); - fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); - fctx->record[i].reltablespace = bufHdr->tag.spcOid; - fctx->record[i].reldatabase = bufHdr->tag.dbOid; - fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag); - fctx->record[i].blocknum = bufHdr->tag.blockNum; - fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); - fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); + /* start of the first page of this buffer */ + startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); - if (buf_state & BM_DIRTY) - fctx->record[i].isdirty = true; - else - fctx->record[i].isdirty = false; + /* end of the buffer (no need to align to memory page) */ + endptr_buff = buffptr + BLCKSZ; - /* Note if the buffer is valid, and has storage created */ - if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) - fctx->record[i].isvalid = true; - else - fctx->record[i].isvalid = false; + Assert(startptr_buff < endptr_buff); + + /* calculate ID of the first page for this buffer */ + page_num = (startptr_buff - startptr) / os_page_size; + + /* Add an entry for each OS page overlapping with this buffer. 
*/ + for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size) + { + fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); + fctx->record[idx].reltablespace = bufHdr->tag.spcOid; + fctx->record[idx].reldatabase = bufHdr->tag.dbOid; + fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag); + fctx->record[idx].blocknum = bufHdr->tag.blockNum; + fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); + fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); + + if (buf_state & BM_DIRTY) + fctx->record[idx].isdirty = true; + else + fctx->record[idx].isdirty = false; + + /* Note if the buffer is valid, and has storage created */ + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + fctx->record[idx].isvalid = true; + else + fctx->record[idx].isvalid = false; + + fctx->record[idx].page_num = page_num; + /* advance to the next entry/page */ + ++idx; + ++page_num; + } UnlockBufHdr(bufHdr, buf_state); } + + Assert(idx <= max_entries); + + /* Set max calls and remember the user function context. */ + funcctx->max_calls = idx; + funcctx->user_fctx = fctx; } funcctx = SRF_PERCALL_SETUP(); @@ -252,6 +315,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[7] = true; /* unused for v1.0 callers, but the array is always long enough */ nulls[8] = true; + nulls[9] = true; } else { @@ -272,6 +336,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) /* unused for v1.0 callers, but the array is always long enough */ values[8] = Int32GetDatum(fctx->record[i].pinning_backends); nulls[8] = false; + values[9] = Int64GetDatum(fctx->record[i].page_num); + nulls[9] = false; } /* Build and return the tuple. 
*/ diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 47cca1907c7..78ec7a88ea0 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -1,6 +1,6 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint +select count(*) >= (select setting::bigint from pg_settings where name = 'shared_buffers') from pg_buffercache; diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 537d6014942..6183adf73db 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -205,6 +205,15 @@ Number of backends pinning this buffer </para></entry> </row> + + <row> + <entry role="catalog_table_entry"><para role="column_definition"> + <structfield>os_page_num</structfield> <type>bigint</type> + </para> + <para> + number of OS memory page for this buffer + </para></entry> + </row> </tbody> </tgroup> </table> @@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers public | spgist_text_tbl | 182 (10 rows) +regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages + FROM pg_buffercache b JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase IN (0, (SELECT oid FROM pg_database + WHERE datname = current_database())) + JOIN pg_namespace n ON n.oid = c.relnamespace + JOIN (SELECT bufferid FROM pg_buffercache + GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10; + + nspname | relname | buffers_on_multiple_pages +------------+---------------------------------+--------------------------- + public | gin_test_tbl | 4 + public | delete_test_table | 4 + public | tenk1 | 4 + pg_catalog | pg_attribute_relid_attnum_index | 4 + pg_catalog | pg_class | 2 + pg_catalog | pg_depend_depender_index | 2 + pg_catalog | pg_attribute | 2 + pg_catalog | pg_opfamily | 2 + 
pg_catalog | pg_opclass_oid_index | 2 + pg_catalog | pg_description | 2 +(10 rows) regression=# SELECT * FROM pg_buffercache_summary(); buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg -- 2.34.1