(new thread) On Wed, Sep 03, 2025 at 02:47:25PM -0400, Andres Freund wrote: >> I see a variety for increased CPU usage: >> >> 1) The private ref count infrastructure in bufmgr.c gets a bit slower once >> more buffers are pinned > > The problem mainly seems to be that the branches in the loop at the start of > GetPrivateRefCountEntry() are entirely unpredictable in this workload. I had > an old patch that tried to make it possible to use SIMD for the search, by > using a separate array for the Buffer ids - with that gcc generates fairly > crappy code, but does make the code branchless. > > Here that substantially reduces the overhead of doing prefetching. Afterwards > it's not a meaningful source of misses anymore.
I quickly hacked together some patches for this. 0001 adds new static variables so that we have a separate array of the buffers and the index for the current ReservedRefCountEntry. 0002 optimizes the linear search in GetPrivateRefCountEntry() using our simd.h routines. This stuff feels expensive (see vector8_highbit_mask()'s implementation for AArch64), but if the main goal is to avoid branches, I think this is about as "branchless" as we can make it. I'm going to stare at this a bit longer, but I figured I'd get something on the lists while it is fresh in my mind. -- nathan
>From 5f180625c17dc76e5e68df56a35ea7ccda674013 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Thu, 2 Oct 2025 21:22:17 -0500 Subject: [PATCH v1 1/2] prepare bufmgr for simd --- src/backend/storage/buffer/bufmgr.c | 35 +++++++++++++++-------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index fe470de63f2..14222516237 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -213,10 +213,12 @@ static BufferDesc *PinCountWaitBuf = NULL; * because in some scenarios it's called with a spinlock held... */ static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]; +static Buffer PrivateRefCountArrayBuffers[REFCOUNT_ARRAY_ENTRIES]; static HTAB *PrivateRefCountHash = NULL; static int32 PrivateRefCountOverflowed = 0; static uint32 PrivateRefCountClock = 0; static PrivateRefCountEntry *ReservedRefCountEntry = NULL; +static int ReservedRefCountEntryIdx; static uint32 MaxProportionalPins; @@ -267,17 +269,12 @@ ReservePrivateRefCountEntry(void) * majority of cases. */ { - int i; - - for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) { - PrivateRefCountEntry *res; - - res = &PrivateRefCountArray[i]; - - if (res->buffer == InvalidBuffer) + if (PrivateRefCountArrayBuffers[i] == InvalidBuffer) { - ReservedRefCountEntry = res; + ReservedRefCountEntry = &PrivateRefCountArray[i]; + ReservedRefCountEntryIdx = i; return; } } @@ -296,8 +293,8 @@ ReservePrivateRefCountEntry(void) bool found; /* select victim slot */ - ReservedRefCountEntry = - &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES]; + ReservedRefCountEntryIdx = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES; + ReservedRefCountEntry = &PrivateRefCountArray[ReservedRefCountEntryIdx]; /* Better be used, otherwise we shouldn't get here. */ Assert(ReservedRefCountEntry->buffer != InvalidBuffer); @@ -312,6 +309,7 @@ ReservePrivateRefCountEntry(void) /* clear the now free array slot */ ReservedRefCountEntry->buffer = InvalidBuffer; + PrivateRefCountArrayBuffers[ReservedRefCountEntryIdx] = InvalidBuffer; ReservedRefCountEntry->refcount = 0; PrivateRefCountOverflowed++; @@ -335,6 +333,7 @@ NewPrivateRefCountEntry(Buffer buffer) /* and fill it */ res->buffer = buffer; + PrivateRefCountArrayBuffers[ReservedRefCountEntryIdx] = buffer; res->refcount = 0; return res; @@ -351,7 +350,6 @@ static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move) { PrivateRefCountEntry *res; - int i; Assert(BufferIsValid(buffer)); Assert(!BufferIsLocal(buffer)); @@ -360,12 +358,10 @@ GetPrivateRefCountEntry(Buffer buffer, bool do_move) * First search for references in the array, that'll be sufficient in the * majority of cases. */ - for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) { - res = &PrivateRefCountArray[i]; - - if (res->buffer == buffer) - return res; + if (PrivateRefCountArrayBuffers[i] == buffer) + return &PrivateRefCountArray[i]; } /* @@ -404,6 +400,7 @@ GetPrivateRefCountEntry(Buffer buffer, bool do_move) /* and fill it */ free->buffer = buffer; + PrivateRefCountArrayBuffers[ReservedRefCountEntryIdx] = buffer; free->refcount = res->refcount; /* delete from hashtable */ @@ -460,6 +457,9 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) * entries. */ ReservedRefCountEntry = ref; + + ReservedRefCountEntryIdx = ref - &PrivateRefCountArray[0]; + PrivateRefCountArrayBuffers[ReservedRefCountEntryIdx] = InvalidBuffer; } else { @@ -3993,6 +3993,7 @@ InitBufferManagerAccess(void) MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS); memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray)); + memset(&PrivateRefCountArrayBuffers, InvalidBuffer, sizeof(PrivateRefCountArrayBuffers)); hash_ctl.keysize = sizeof(int32); hash_ctl.entrysize = sizeof(PrivateRefCountEntry); -- 2.39.5 (Apple Git-154)
>From fc93e2e5d53484f2562c3dea4dea1462d575c889 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Thu, 2 Oct 2025 22:27:43 -0500 Subject: [PATCH v1 2/2] simd-ify GetPrivateRefCountEntry() --- src/backend/storage/buffer/bufmgr.c | 45 +++++++++++++++++++++++++++++ src/include/port/simd.h | 25 ++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 14222516237..8257fde22fd 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -50,6 +50,8 @@ #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" +#include "port/pg_bitutils.h" +#include "port/simd.h" #include "postmaster/bgwriter.h" #include "storage/aio.h" #include "storage/buf_internals.h" @@ -358,11 +360,54 @@ GetPrivateRefCountEntry(Buffer buffer, bool do_move) * First search for references in the array, that'll be sufficient in the * majority of cases. */ +#ifndef USE_NO_SIMD + + { + const Vector32 keys = vector32_broadcast(buffer); + Vector32 bufs1; + Vector32 bufs2; + Vector32 res1; + Vector32 res2; + Vector32 res; + uint32 msk; + +#ifdef USE_ASSERT_CHECKING + PrivateRefCountEntry *ret = NULL; + + for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + { + if (PrivateRefCountArrayBuffers[i] == buffer) + { + ret = &PrivateRefCountArray[i]; + break; + } + } +#endif + + vector32_load(&bufs1, (uint32 *) &PrivateRefCountArrayBuffers[0]); + vector32_load(&bufs2, (uint32 *) &PrivateRefCountArrayBuffers[sizeof(Vector32) / sizeof(int)]); + + res1 = vector32_eq(bufs1, keys); + res2 = vector32_eq(bufs2, keys); + res = vector32_pack_32(res1, res2); + + msk = vector32_highbit_mask(res); + if (likely(msk)) + { + Assert(ret == &PrivateRefCountArray[pg_rightmost_one_pos32(msk) / 2]); + return &PrivateRefCountArray[pg_rightmost_one_pos32(msk) / 2]; + } + else + Assert(ret == NULL); + } + +#else for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) { if (PrivateRefCountArrayBuffers[i] == buffer) return &PrivateRefCountArray[i]; } +#endif /* * By here we know that the buffer, if already pinned, isn't residing in diff --git a/src/include/port/simd.h b/src/include/port/simd.h index 97c5f353022..5a8933f45e0 100644 --- a/src/include/port/simd.h +++ b/src/include/port/simd.h @@ -331,6 +331,18 @@ vector8_highbit_mask(const Vector8 v) } #endif /* ! USE_NO_SIMD */ +#ifndef USE_NO_SIMD +static inline uint32 +vector32_highbit_mask(const Vector32 v) +{ +#ifdef USE_NEON + return vector8_highbit_mask((Vector8) v); +#else + return vector8_highbit_mask(v); +#endif +} +#endif /* ! USE_NO_SIMD */ + /* * Return the bitwise OR of the inputs */ @@ -419,4 +431,17 @@ vector8_min(const Vector8 v1, const Vector8 v2) } #endif /* ! USE_NO_SIMD */ + +#ifndef USE_NO_SIMD +static inline Vector32 +vector32_pack_32(const Vector32 v1, const Vector32 v2) +{ +#ifdef USE_SSE2 + return _mm_packus_epi32(v1, v2); +#elif defined(USE_NEON) + return (Vector32) vuzp1q_u16((uint16x8_t) v1, (uint16x8_t) v2); +#endif +} +#endif /* ! USE_NO_SIMD */ + #endif /* SIMD_H */ -- 2.39.5 (Apple Git-154)
