On Tue, 27 Jun 2023 at 07:09, Andres Freund <[email protected]> wrote:
> On 2023-06-27 15:33:57 +1200, Thomas Munro wrote:
> > On Tue, Jun 27, 2023 at 2:05 PM Andres Freund <[email protected]> wrote:
> > > Unfortunately it scaled way worse at first. This is not an inherent
> > > issue, but
> > > due to an implementation choice in ReadRecentBuffer(). Whereas the normal
> > > BufferAlloc() path uses PinBuffer(), ReadRecentBuffer() first does
> > > LockBufHdr(), checks if the buffer ID is the same and then uses
> > > PinBuffer_Locked().
> > >
> > > The problem with that is that PinBuffer() takes care to not hold the
> > > buffer
> > > header spinlock, it uses compare_exchange to atomically acquire the pin,
> > > while
> > guaranteeing nobody holds the lock. When holding the buffer header
> > > spinlock,
> > > there obviously is the risk of being scheduled out (or even just not have
> > > exclusive access to the cacheline).
> >
> > Yeah. Aside from inherent nastiness of user-space spinlocks
>
> I've been wondering about making our backoff path use futexes, after some
> adaptive spinning.
If you want to experiment, here is a rebased version of something I
hacked up a couple of years back on the way to Fosdem Pgday. I didn't
pursue it further because I didn't have a use case where it showed a
significant difference.
--
Ants
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index 327ac64f7c2..67a5e8a0246 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -92,6 +92,7 @@ s_lock_stuck(const char *file, int line, const char *func)
int
s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
{
+#ifndef HAS_FUTEX
SpinDelayStatus delayStatus;
init_spin_delay(&delayStatus, file, line, func);
@@ -104,6 +105,8 @@ s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
finish_spin_delay(&delayStatus);
return delayStatus.delays;
+#endif
+ elog(FATAL, "Should not be called");
}
#ifdef USE_DEFAULT_S_UNLOCK
@@ -230,6 +233,71 @@ update_spins_per_delay(int shared_spins_per_delay)
return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
}
+#ifdef HAS_FUTEX
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/futex.h>
+
+/*
+ * Thin wrapper around the raw futex(2) syscall; glibc provides no wrapper.
+ * Only FUTEX_WAIT and FUTEX_WAKE are used in this file, for which the
+ * kernel ignores timeout/uaddr2/val3 (callers pass NULL/NULL/0).
+ *
+ * Fix: forward the uaddr2 argument instead of passing uaddr twice --
+ * the parameter was previously declared but silently ignored.
+ */
+static int
+futex(volatile uint32 *uaddr, int futex_op, int val,
+	  const struct timespec *timeout, int *uaddr2, int val3)
+{
+	return syscall(SYS_futex, uaddr, futex_op, val,
+				   timeout, uaddr2, val3);
+}
+
+/*
+ * Slow path of S_LOCK for the futex-based spinlock.
+ *
+ * Lock protocol: 0 = free, 1 = held/uncontended, 2 = held and there may be
+ * sleepers in the kernel.  'current' is the lock value observed by the
+ * failed fast-path CAS in S_LOCK.  file/line/func are accepted for parity
+ * with s_lock() but are currently unused.  Returns the number of spin
+ * iterations performed (for symmetry with s_lock()'s delay count).
+ *
+ * Fixes over the previous version:
+ *  - 'i' was returned uninitialized when current >= 2 (the spin phase was
+ *    skipped entirely) -- undefined behavior.
+ *  - The old "advertise waiting" loop could CAS the lock from 0 to 2 (i.e.
+ *    actually ACQUIRE it) and then still fall through to FUTEX_WAIT,
+ *    self-deadlocking while holding the lock.
+ *  - When the holder owned the lock as 1, FUTEX_WAIT(..., 2) returned
+ *    EAGAIN immediately, degenerating into a syscall busy-loop.
+ * The replacement slow path is the classic futex mutex from Drepper's
+ * "Futexes Are Tricky": unconditionally exchange in 2; if the old value
+ * was 0 we acquired the lock (as 2, so a later unlock still wakes other
+ * waiters), otherwise the lock is now marked contended and we sleep.
+ */
+int
+futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func)
+{
+	int			i = 0;		/* must be initialized: spin phase may be skipped */
+	int			s;
+
+	/*
+	 * First let's wait for a bit without involving the kernel; it is quite
+	 * likely the lock holder is still running.  Skip the spin when the lock
+	 * is already marked contended (current >= 2): sleepers exist, so piling
+	 * on with spinning is unlikely to pay off.
+	 */
+	if (likely(current < 2))
+	{
+		uint32		expected;
+
+		for (i = 0; i < DEFAULT_SPINS_PER_DELAY; i++)
+		{
+			SPIN_DELAY();
+			expected = lock->value;
+			if (expected == 0 && pg_atomic_compare_exchange_u32(lock, &expected, 1))
+				return i;
+		}
+	}
+
+	/*
+	 * Contended slow path: set the lock to 2 (held-with-waiters).  If the
+	 * old value was 0 we now own the lock; otherwise sleep until the holder
+	 * wakes us, then retry.  FUTEX_WAIT returns EAGAIN if the value is no
+	 * longer 2 by the time the kernel checks -- that is benign, just retry.
+	 */
+	while (pg_atomic_exchange_u32(lock, 2) != 0)
+	{
+		s = futex(&(lock->value), FUTEX_WAIT, 2, NULL, NULL, 0);
+		if (s == -1 && errno != EAGAIN)
+			elog(FATAL, "Futex wait failed with error: %m");
+	}
+
+	return i;
+}
+
+/*
+ * Wake one sleeper after a contended unlock.
+ *
+ * The caller (S_UNLOCK) has already atomically exchanged the lock value to
+ * 0 and observed the old value 2 ('current').  Fix: do NOT store 0 again
+ * here -- between S_UNLOCK's exchange and this call another backend may
+ * have acquired the lock, and a plain store of 0 would silently release it
+ * out from under that backend, breaking mutual exclusion.  All that
+ * remains to do is wake at most one waiter.
+ */
+int futex_unlock(volatile slock_t *lock, uint32 current)
+{
+	if (futex(&(lock->value), FUTEX_WAKE, 1, NULL, NULL, 0) == -1)
+		elog(FATAL, "Futex wake failed with error: %m");
+
+	return 0;
+}
+
+#endif /* HAS_FUTEX */
/*****************************************************************************/
#if defined(S_LOCK_TEST)
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index c9fa84cc43c..6351ec0804e 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -205,6 +205,52 @@ spin_delay(void)
#ifdef __x86_64__ /* AMD Opteron, Intel EM64T */
#define HAS_TEST_AND_SET
+#if defined(__linux__)
+#define HAS_FUTEX 1 /* TODO: move to configure to check for old kernels */
+#endif
+
+#ifdef HAS_FUTEX
+
+#include "port/atomics.h"
+
+/*
+ * Futex-based spinlock.  Lock values: 0 = free, 1 = held with no waiters,
+ * 2 = held and there may be waiters sleeping in the kernel (futex_lock
+ * marks the lock 2 before calling FUTEX_WAIT, and S_UNLOCK only issues a
+ * FUTEX_WAKE when it observes 2 on release).
+ */
+typedef pg_atomic_uint32 slock_t;
+
+/*
+ * Fast path: a single compare-exchange 0 -> 1.  On failure 'expected'
+ * holds the observed lock value, which is handed to the futex_lock slow
+ * path so it can decide whether to spin first or sleep immediately.
+ */
+#define S_LOCK(lock) \
+ do { \
+ uint32 expected = 0; \
+ if (unlikely(!pg_atomic_compare_exchange_u32((lock), &expected, 1))) \
+ futex_lock((lock), expected, __FILE__, __LINE__, __func__); \
+ } while (0)
+
+
+/*
+ * Release: atomically swap in 0.  If the old value was 2 there may be
+ * sleepers in the kernel, so call futex_unlock to wake one of them; in the
+ * uncontended case (old value 1) no syscall is needed.
+ */
+#define S_UNLOCK(lock) \
+ do { \
+ uint32 actual = pg_atomic_exchange_u32((lock), 0); \
+ if (unlikely(actual == 2)) \
+ futex_unlock((lock), actual); \
+ } while (0)
+extern int futex_lock(volatile slock_t *lock, uint32 current, const char *file, int line, const char *func);
+extern int futex_unlock(volatile slock_t *lock, uint32 current);
+
+/* TAS only needed for regress */
+#define TAS(lock) tas(lock)
+
+/*
+ * Test-and-set via lock-prefixed xchgl: atomically stores 1 into the lock
+ * word and returns its previous value (nonzero means the lock was already
+ * held).  Kept only for the regression test's TAS() calls when futexes are
+ * enabled; normal lock/unlock goes through S_LOCK/S_UNLOCK above.
+ * NOTE(review): operates on the pg_atomic_uint32 slock_t via "+m"(*lock)
+ * rather than the atomics API -- relies on the struct's first member being
+ * the 32-bit value; verify against port/atomics.h layout.
+ */
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register uint32 _res = 1;
+
+ __asm__ __volatile__(
+ " lock \n"
+ " xchgl %0,%1 \n"
+: "+q"(_res), "+m"(*lock)
+: /* no inputs */
+: "memory", "cc");
+ return (int) _res;
+}
+
+
+#else
typedef unsigned char slock_t;
#define TAS(lock) tas(lock)
@@ -247,6 +293,7 @@ spin_delay(void)
" rep; nop \n");
}
+#endif /* HAS_FUTEX */
#endif /* __x86_64__ */
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index bcbc6d910f1..b92f8542ae9 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -857,7 +857,11 @@ test_spinlock(void)
S_UNLOCK(&struct_w_lock.lock);
/* and that "contended" acquisition works */
+#ifdef HAS_FUTEX
+ futex_lock(&struct_w_lock.lock, 1, "testfile", 17, "testfunc");
+#else
s_lock(&struct_w_lock.lock, "testfile", 17, "testfunc");
+#endif
S_UNLOCK(&struct_w_lock.lock);
/*