On Thu, 2004-10-14 at 04:57, Mark Wong wrote: 
> I have some DBT-3 (decision support) results using Gavin's original
> futex patch fix.

I sent an initial description of the futex patch to the mailing lists
last week, but it never appeared (from talking to Marc I believe it
exceeded the size limit on -performance). In any case, the "futex patch"
uses the Linux 2.6 futex API to implement PostgreSQL spinlocks. The hope
is that using futexes will lead to better performance when there is
contention for spinlocks (e.g. on a busy SMP system). The original patch
was written by Stephen Hemminger at OSDL; Gavin and I have done a
bunch of additional bugfixing and optimization, as well as adding IA64
support.

I've attached a WIP copy of the patch to this email (it supports x86,
x86-64 (untested) and IA64 -- more architectures can be added at
request). I'll post a longer writeup when I submit the patch to
-patches.

> Definitely see some overall throughput performance on the tests, about
> 15% increase, but no change with respect to the number of context
> switches.

I'm glad to see that there is a performance improvement; in my own
testing on an 8-way P3 system provided by OSDL, I saw a similar
improvement in pgbench performance (50 concurrent clients, 1000
transactions each, scale factor 75; without the patch, TPS/sec was
between 180 and 185, with the patch TPS/sec was between 200 and 215).

As for context switching, there was some earlier speculation that the
patch might improve or even resolve the "CS storm" issue that some
people have experienced with SMP Xeon P4 systems. I don't think we have
enough evidence to answer this one way or the other at this point.

-Neil

Index: src/backend/storage/lmgr/s_lock.c
===================================================================
RCS file: /var/lib/cvs/pgsql/src/backend/storage/lmgr/s_lock.c,v
retrieving revision 1.32
diff -c -r1.32 s_lock.c
*** src/backend/storage/lmgr/s_lock.c	30 Aug 2004 23:47:20 -0000	1.32
--- src/backend/storage/lmgr/s_lock.c	13 Oct 2004 06:23:26 -0000
***************
*** 15,26 ****
   */
  #include "postgres.h"
  
  #include <time.h>
- #include <unistd.h>
  
  #include "storage/s_lock.h"
  #include "miscadmin.h"
  
  /*
   * s_lock_stuck() - complain about a stuck spinlock
   */
--- 15,49 ----
   */
  #include "postgres.h"
  
+ #ifdef S_LOCK_TEST
+ #undef Assert
+ #define Assert(cond) DoAssert(cond, #cond, __FILE__, __LINE__)
+ 
+ #define DoAssert(cond, text, file, line)		\
+ 	if (!(cond))								\
+ 	{											\
+ 		printf("ASSERTION FAILED! [%s], file = %s, line = %d\n", \
+ 			   text, file, line);				\
+ 		abort();								\
+ 	}
+ #endif
+ 
  #include <time.h>
  
  #include "storage/s_lock.h"
  #include "miscadmin.h"
  
+ #ifdef S_LOCK_TEST
+ #define LOCK_TEST_MSG()			\
+ 	do							\
+ 	{							\
+ 		fprintf(stdout, "*");	\
+ 		fflush(stdout);			\
+ 	} while (0);
+ #else
+ #define LOCK_TEST_MSG()
+ #endif
+ 
  /*
   * s_lock_stuck() - complain about a stuck spinlock
   */
***************
*** 38,43 ****
--- 61,131 ----
  #endif
  }
  
+ #ifdef HAVE_FUTEX
+ /*
+  * futex_lock_contended() is similar to s_lock() for the normal TAS
+  * implementation of spinlocks. When this function is invoked, we have
+  * failed to immediately acquire the spinlock, so we should spin some
+  * number of times attempting to acquire the lock before invoking
+  * sys_futex() to have the kernel wake us up later. "val" is the
+  * current value of the mutex we saw when we tried to acquire it; it
+  * may have changed since then, of course.
+  */
+ void
+ futex_lock_contended(volatile slock_t *lock, slock_t val,
+ 					 const char *file, int line)
+ {
+ 	int loop_count = 0;
+ 
+ #define MAX_LOCK_WAIT		30
+ #define SPINS_BEFORE_WAIT	100
+ 
+ 	Assert(val != FUTEX_UNLOCKED);
+ 
+ 	if (val == FUTEX_LOCKED_NOWAITER)
+ 		val = atomic_exchange(lock, FUTEX_LOCKED_WAITER);
+ 
+ 	while (val != FUTEX_UNLOCKED)
+ 	{
+ 		static struct timespec delay = { .tv_sec = MAX_LOCK_WAIT,
+ 										 .tv_nsec = 0 };
+ 
+ 		LOCK_TEST_MSG();
+ 
+ #if defined(__i386__) || defined(__x86_64__)
+ 		/* See spin_delay() */
+ 		__asm__ __volatile__(" rep; nop\n");
+ #endif
+ 
+ 		/*
+ 		 * XXX: This code is derived from the Drepper algorithm, which
+ 		 * doesn't spin (why, I'm not sure). We should actually change
+ 		 * the lock status to "lock, with waiters" just before we wait
+ 		 * on the futex, not before we begin looping (that avoids a
+ 		 * system call when the lock is released).
+ 		 */
+ 
+ 		/* XXX: worth using __builtin_expect() here? */
+ 		if (++loop_count >= SPINS_BEFORE_WAIT)
+ 		{
+ 			loop_count = 0;
+ 			if (sys_futex(lock, FUTEX_OP_WAIT,
+ 						  FUTEX_LOCKED_WAITER, &delay))
+ 			{
+ 				if (errno == ETIMEDOUT)
+ 					s_lock_stuck(lock, file, line);
+ 			}
+ 		}
+ 
+ 		/*
+ 		 * Do a non-locking test before asserting the bus lock.
+ 		 */
+ 		if (*lock == FUTEX_UNLOCKED)
+ 			val = atomic_exchange(lock, FUTEX_LOCKED_WAITER);
+ 	}
+ }
+ 
+ #else
  
  /*
   * s_lock(lock) - platform-independent portion of waiting for a spinlock.
***************
*** 98,107 ****
  
  			pg_usleep(cur_delay * 10000L);
  
! #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
! #endif
  
  			/* increase delay by a random fraction between 1X and 2X */
  			cur_delay += (int) (cur_delay *
--- 186,192 ----
  
  			pg_usleep(cur_delay * 10000L);
  
! 			LOCK_TEST_MSG();
  
  			/* increase delay by a random fraction between 1X and 2X */
  			cur_delay += (int) (cur_delay *
***************
*** 114,119 ****
--- 199,205 ----
  		}
  	}
  }
+ #endif	/* HAVE_FUTEX */
  
  /*
   * Various TAS implementations that cannot live in s_lock.h as no inline
***************
*** 134,140 ****
   * All the gcc flavors that are not inlined
   */
  
- 
  #if defined(__m68k__)
  static void
  tas_dummy()						/* really means: extern int tas(slock_t
--- 220,225 ----
***************
*** 265,272 ****
--- 350,361 ----
  
  	test_lock.pad1 = test_lock.pad2 = 0x44;
  
+ 	printf("Address of lock variable: %p\n", &test_lock.lock);
+ 
  	S_INIT_LOCK(&test_lock.lock);
  
+ 	printf("<1> Value of lock variable: %d\n", test_lock.lock);
+ 
  	if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
  	{
  		printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
***************
*** 280,285 ****
--- 369,375 ----
  	}
  
  	S_LOCK(&test_lock.lock);
+ 	printf("<2> Value of lock variable: %d\n", test_lock.lock);
  
  	if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
  	{
***************
*** 289,299 ****
--- 379,391 ----
  
  	if (S_LOCK_FREE(&test_lock.lock))
  	{
+ 		printf("<3> Value of lock variable: %d\n", test_lock.lock);
  		printf("S_LOCK_TEST: failed, lock not locked\n");
  		return 1;
  	}
  
  	S_UNLOCK(&test_lock.lock);
+ 	printf("<4> Value of lock variable: %d\n", test_lock.lock);
  
  	if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
  	{
***************
*** 326,332 ****
  	printf("             if S_LOCK() and TAS() are working.\n");
  	fflush(stdout);
  
! 	s_lock(&test_lock.lock, __FILE__, __LINE__);
  
  	printf("S_LOCK_TEST: failed, lock not locked\n");
  	return 1;
--- 418,424 ----
  	printf("             if S_LOCK() and TAS() are working.\n");
  	fflush(stdout);
  
! 	S_LOCK(&test_lock.lock);
  
  	printf("S_LOCK_TEST: failed, lock not locked\n");
  	return 1;
Index: src/include/storage/s_lock.h
===================================================================
RCS file: /var/lib/cvs/pgsql/src/include/storage/s_lock.h,v
retrieving revision 1.132
diff -c -r1.132 s_lock.h
*** src/include/storage/s_lock.h	6 Oct 2004 23:41:59 -0000	1.132
--- src/include/storage/s_lock.h	13 Oct 2004 06:22:54 -0000
***************
*** 46,55 ****
   *	will "fail" if interrupted.  Therefore TAS() should always be invoked
   *	in a retry loop, even if you are certain the lock is free.
   *
!  *	ANOTHER CAUTION: be sure that TAS() and S_UNLOCK() represent sequence
!  *	points, ie, loads and stores of other values must not be moved across
!  *	a lock or unlock.  In most cases it suffices to make the operation be
!  *	done through a "volatile" pointer.
   *
   *	On most supported platforms, TAS() uses a tas() function written
   *	in assembly language to execute a hardware atomic-test-and-set
--- 46,57 ----
   *	will "fail" if interrupted.  Therefore TAS() should always be invoked
   *	in a retry loop, even if you are certain the lock is free.
   *
!  *	ANOTHER CAUTION: be sure that TAS() and S_UNLOCK() represent
!  *	sequence points, ie, loads and stores of other values must not be
!  *	moved across a lock or unlock.  In most cases it suffices to make
!  *	the operation be done through a "volatile" pointer; also, adding
!  *	"memory" to the list of clobbered registers (for GCC inline asm)
!  *	may also be necessary.
   *
   *	On most supported platforms, TAS() uses a tas() function written
   *	in assembly language to execute a hardware atomic-test-and-set
***************
*** 73,83 ****
  #ifndef S_LOCK_H
  #define S_LOCK_H
  
  #include "storage/pg_sema.h"
  
  #ifdef HAVE_SPINLOCKS	/* skip spinlocks if requested */
  
- 
  #if defined(__GNUC__) || defined(__ICC)
  /*************************************************************************
   * All the gcc inlines
--- 75,86 ----
  #ifndef S_LOCK_H
  #define S_LOCK_H
  
+ #include <unistd.h>
+ 
  #include "storage/pg_sema.h"
  
  #ifdef HAVE_SPINLOCKS	/* skip spinlocks if requested */
  
  #if defined(__GNUC__) || defined(__ICC)
  /*************************************************************************
   * All the gcc inlines
***************
*** 107,116 ****
   *----------
   */
  
  
! #if defined(__i386__) || defined(__x86_64__) /* AMD Opteron */
! #define HAS_TEST_AND_SET
  
  typedef unsigned char slock_t;
  
  #define TAS(lock) tas(lock)
--- 110,322 ----
   *----------
   */
  
+ /*
+  * Linux 2.6 introduced the "futex" API. This provides an efficient
+  * building block for implementing concurrency primitives, via the
+  * sys_futex system call. The gist is that a futex is an aligned
+  * integer. It is decremented and incremented via atomic instructions,
+  * like a normal spinlock. In the contended case, the sys_futex()
+  * system call allows us to enter the kernel and have it awaken us
+  * when the futex is free. The best reference for understanding
+  * futexes is "Futexes are Tricky" by Ulrich Drepper; the
+  * implementation of spinlocks using futexes is a fairly close
+  * transcription of the example code provided in that paper.
+  *
+  * If futexes are available at compile-time, we attempt to use them at
+  * run-time. It is possible that they aren't available (e.g. we're
+  * compiled on a Linux 2.6 system and run on a Linux 2.4 system), so
+  * we fall back to using normal TAS in that case.
+  */
+ #if defined(__linux__)
  
! #include <syscall.h>
! 
! #ifdef SYS_futex
! /*
!  * Additional atomic operations are required to use futexes. We
!  * consider futexes to be available on a system iff the sys_futex
!  * system call is available AND we have implemented the necessary
!  * assembly routines for the CPU architecture. If both conditions do
!  * not hold, we fall back to using the normal TAS technique.
!  */
! #if defined(__i386__) || defined(__x86_64__)
! #define HAVE_FUTEX
! 
! typedef int slock_t;
! 
! static __inline__ slock_t
! atomic_cmp_exchange(volatile slock_t *ptr, slock_t old, slock_t new)
! {
! 	slock_t prev;
! 	__asm__ __volatile__(
! 		"	lock		\n"
! 		"	cmpxchgl %1,%2	\n"
! 		: "=a"(prev)
! 		: "q"(new), "m"(*ptr), "0"(old)
! 		: "memory", "cc");
! 
! 	return prev;
! }
! 
! static __inline__ slock_t
! atomic_exchange(volatile slock_t *ptr, slock_t x)
! {
! 	/* no lock prefix needed, xchgl is always locked */
! 	__asm__ __volatile__(
! 		"	xchgl %0,%1	\n"
! 		:"=q"(x)
! 		:"m"(*ptr), "0"(x)
! 		:"memory");
! 
! 	return x;
! }
! 
! /*
!  * XXX: is there a more efficient way to write this? Perhaps using
!  * decl...?
!  */
! static __inline__ slock_t
! atomic_dec(volatile slock_t *ptr)
! {
! 	slock_t prev = -1;
! 
! 	__asm__ __volatile__(
! 		"	lock		\n"
! 		"	xadd %0,%1	\n"
! 		:"=q"(prev)
! 		:"m"(*ptr), "0"(prev)
! 		:"memory", "cc");
! 
! 	return prev;
! }
! #endif	/* defined(__i386__) || defined(__x86_64__) */
! 
! #if defined(__ia64__) || defined(__ia64)
! /* Intel Itanium */
! #define HAVE_FUTEX
! 
! typedef int slock_t;
! 
! static __inline__ slock_t
! atomic_cmp_exchange(volatile slock_t *ptr, slock_t old, slock_t new)
! {
! 	long int prev;
! 
! 	__asm__ __volatile__(
! 		"	mov ar.ccv=%2;;"
! 		"	cmpxchg4.acq %0 = %4,%3,ar.ccv"
! 		: "=r" (prev), "=m" (*ptr)
! 		: "r" (old), "r" (new), "m" (*ptr)
! 		: "memory");
! 
! 	return (slock_t) prev;
! }
! 
! static __inline__ slock_t
! atomic_exchange(volatile slock_t *ptr, slock_t x)
! {
! 	/* Note that xchg has "acquire" semantics implicitly */
! 	__asm__ __volatile__(
! 		"	xchg4 	%0=%1,%2	\n"
! 		: "+r"(x), "+m"(*ptr)
! 		:
! 		: "memory");
! 
! 	return x;
! }
! 
! static __inline__ slock_t
! atomic_dec(volatile slock_t *ptr)
! {
! 	slock_t result;
! 
! 	/*
! 	 * Note that IA64 does _not_ guarantee that fetchadd is always
! 	 * atomic. However, it should always be atomic when the operand is
! 	 * -1; see the IA64 docs for details.
! 	 */
! 	__asm__ __volatile__(
! 		"fetchadd4.rel %0=[%1],%2"
! 		: "=r"(result)
! 		: "r"(ptr), "i"(-1)
! 		: "memory");
! 
! 	return result;
! }
! 
! #endif	 /* __ia64__ || __ia64 */
! 
! #endif	/* SYS_futex */
! #endif	/* defined(__linux__) */
! 
! #ifdef HAVE_FUTEX
! 
! /*
!  * Legal states of a futex.
!  */
! #define FUTEX_UNLOCKED 0
! #define FUTEX_LOCKED_NOWAITER 1
! #define FUTEX_LOCKED_WAITER 2
! 
! /*
!  * Futex operations (for sys_futex()).
!  */
! #define FUTEX_OP_WAIT 0
! #define FUTEX_OP_WAKE 1
! 
! #define S_LOCK(lock)		futex_lock((lock), __FILE__, __LINE__)
! #define S_UNLOCK(lock)		futex_unlock(lock)
! #define S_LOCK_FREE(lock)	(*(lock) == FUTEX_UNLOCKED)
! #define S_INIT_LOCK(lock)	(*(lock) = FUTEX_UNLOCKED)
! #define SPIN_DELAY()
! 
! #define NUM_DELAYS 100
  
+ extern void
+ futex_lock_contended(volatile slock_t *lock, slock_t val,
+ 					 const char *file, int line);
+ 
+ static __inline__ int 
+ sys_futex(volatile int *ptr, int op, int val, struct timespec *timeout)
+ {
+ 	return syscall(SYS_futex, ptr, op, val, timeout);
+ }
+ 
+ static __inline__ void
+ futex_lock(volatile slock_t *lock, const char *file, int line)
+ {
+ 	/*
+ 	 * In the common case, there is no contention for the lock, so we
+ 	 * can acquire it with a single compare-exchange instruction. Thus,
+ 	 * it is important to avoid the overhead of a function call. If we
+ 	 * need to contend for the futex we need to spin or make a kernel
+ 	 * call anyway, so a function call doesn't hurt.
+ 	 */
+ 	slock_t val = atomic_cmp_exchange(lock,
+ 									  FUTEX_UNLOCKED,
+ 									  FUTEX_LOCKED_NOWAITER);
+ 
+ 	if (val != FUTEX_UNLOCKED)
+ 		futex_lock_contended(lock, val, file, line);
+ }
+ 
+ static __inline__ void
+ futex_unlock(volatile slock_t *lock)
+ {
+ 	Assert(*lock == FUTEX_LOCKED_NOWAITER ||
+ 		   *lock == FUTEX_LOCKED_WAITER);
+ 	if (atomic_dec(lock) != FUTEX_LOCKED_NOWAITER)
+ 	{
+ 		*lock = FUTEX_UNLOCKED;
+ 		sys_futex(lock, FUTEX_OP_WAKE, 1, NULL);
+ 	}
+ }
+ 
+ #else
+ 
+ #if defined(__i386__) || defined(__x86_64__)
+ 
+ #define HAS_TEST_AND_SET
  typedef unsigned char slock_t;
  
  #define TAS(lock) tas(lock)
***************
*** 120,130 ****
  {
  	register slock_t _res = 1;
  
! 	/* Use a non-locking test before asserting the bus lock */
  	__asm__ __volatile__(
  		"	cmpb	$0,%1	\n"
  		"	jne		1f		\n"
- 		"	lock			\n"
  		"	xchgb	%0,%1	\n"
  		"1: \n"
  :		"+q"(_res), "+m"(*lock)
--- 326,336 ----
  {
  	register slock_t _res = 1;
  
! 	/* Use a non-locking test before doing xchgb, which implicitly
! 	 * asserts the bus lock */
  	__asm__ __volatile__(
  		"	cmpb	$0,%1	\n"
  		"	jne		1f		\n"
  		"	xchgb	%0,%1	\n"
  		"1: \n"
  :		"+q"(_res), "+m"(*lock)
***************
*** 167,173 ****
  
  #endif	 /* __i386__ || __x86_64__ */
  
- 
  #if defined(__ia64__) || defined(__ia64)  /* __ia64 used by ICC compiler? */
  /* Intel Itanium */
  #define HAS_TEST_AND_SET
--- 373,378 ----
***************
*** 453,459 ****
  
  typedef unsigned int slock_t;
  #endif
! 
  
  #endif	/* __GNUC__ */
  
--- 658,664 ----
  
  typedef unsigned int slock_t;
  #endif
! #endif	/* ! HAVE_FUTEX */
  
  #endif	/* __GNUC__ */
  
***************
*** 697,703 ****
  
  
  /* Blow up if we didn't have any way to do spinlocks */
! #ifndef HAS_TEST_AND_SET
  #error PostgreSQL does not have native spinlock support on this platform.  To continue the compilation, rerun configure using --disable-spinlocks.  However, performance will be poor.  Please report this to [EMAIL PROTECTED]
  #endif
  
--- 902,908 ----
  
  
  /* Blow up if we didn't have any way to do spinlocks */
! #if !defined(HAS_TEST_AND_SET) && !defined(HAVE_FUTEX)
  #error PostgreSQL does not have native spinlock support on this platform.  To continue the compilation, rerun configure using --disable-spinlocks.  However, performance will be poor.  Please report this to [EMAIL PROTECTED]
  #endif
  
---------------------------(end of broadcast)---------------------------
TIP 2: you can get off all lists at once with the unregister command
    (send "unregister YourEmailAddressHere" to [EMAIL PROTECTED])

Reply via email to