Repository: incubator-hawq
Updated Branches:
  refs/heads/master 569f7545d -> a16030888


HAWQ-1030. Forward PostgreSQL spin lock performance enhancement to HAWQ


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/a1603088
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/a1603088
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/a1603088

Branch: refs/heads/master
Commit: a16030888bdc2d8e743207d3abfa5f8cd6c32129
Parents: 569f754
Author: Ming LI <[email protected]>
Authored: Mon Aug 29 16:42:33 2016 +0800
Committer: Ming LI <[email protected]>
Committed: Tue Aug 30 11:08:02 2016 +0800

----------------------------------------------------------------------
 src/backend/storage/lmgr/s_lock.c | 215 ++++++++------
 src/include/storage/s_lock.h      | 527 +++++++++++++++++++--------------
 src/test/regress/checkinc.py      |   1 +
 3 files changed, 427 insertions(+), 316 deletions(-)
----------------------------------------------------------------------
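
As context for the diff below: the patch lifts the wait loop out of s_lock() into a small reusable API (SpinDelayStatus plus perform_spin_delay()/finish_spin_delay()) and introduces TAS_SPIN() for polling an already-contended lock. A minimal sketch of how a caller is expected to use the new helpers, assuming only the declarations this patch adds to s_lock.h; the shared flag being waited on is hypothetical and not part of the patch:

    #include "storage/s_lock.h"

    /* Hypothetical spin-wait on a shared flag, using the new delay API. */
    static void
    wait_for_flag(volatile int *flag)
    {
        SpinDelayStatus delayStatus;

        /* records __FILE__/__LINE__/function for "stuck spinlock" reports */
        init_local_spin_delay(&delayStatus);

        while (*flag == 0)
            perform_spin_delay(&delayStatus);  /* SPIN_DELAY(), then pg_usleep() backoff */

        finish_spin_delay(&delayStatus);       /* feeds the spins_per_delay estimate */
    }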


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/a1603088/src/backend/storage/lmgr/s_lock.c
----------------------------------------------------------------------
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index 2e32b69..da8e781 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -3,13 +3,45 @@
  * s_lock.c
  *        Hardware-dependent implementation of spinlocks.
  *
+ * When waiting for a contended spinlock we loop tightly for awhile, then
+ * delay using pg_usleep() and try again.  Preferably, "awhile" should be a
+ * small multiple of the maximum time we expect a spinlock to be held.  100
+ * iterations seems about right as an initial guess.  However, on a
+ * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
+ * it's usually better to spin a bit longer than to call the kernel, so we try
+ * to adapt the spin loop count depending on whether we seem to be in a
+ * uniprocessor or multiprocessor.
  *
- * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure.  This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
+ * Once we do decide to block, we use randomly increasing pg_usleep()
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again.  The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock.  (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.)  But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
+ *
+ * We time out and declare error after NUM_DELAYS delays (thus, exactly
+ * that many tries).  With the given settings, this will usually take 2 or
+ * so minutes.  It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
+ *
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.50 2009/01/01 17:23:48 momjian Exp $
+ *       src/backend/storage/lmgr/s_lock.c
  *
  *-------------------------------------------------------------------------
  */
@@ -20,6 +52,14 @@
 
 #include "storage/s_lock.h"
 
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
+#define NUM_DELAYS                     1000
+#define MIN_DELAY_USEC         1000L
+#define MAX_DELAY_USEC         1000000L
+
+
+slock_t                dummy_spinlock;
 
 static int     spins_per_delay = DEFAULT_SPINS_PER_DELAY;
 
@@ -28,122 +68,111 @@ static int        spins_per_delay = DEFAULT_SPINS_PER_DELAY;
  * s_lock_stuck() - complain about a stuck spinlock
  */
 static void
-s_lock_stuck(volatile slock_t *lock, const char *file, int line)
+s_lock_stuck(const char *file, int line, const char *func)
 {
+       if (!func)
+               func = "(unknown)";
 #if defined(S_LOCK_TEST)
        fprintf(stderr,
-                       "\nStuck spinlock (%p) detected at %s:%d.\n",
-                       lock, file, line);
+                       "\nStuck spinlock detected at %s, %s:%d.\n",
+                       func, file, line);
        exit(1);
 #else
-       elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
-                lock, file, line);
+       elog(PANIC, "stuck spinlock detected at %s, %s:%d",
+                func, file, line);
 #endif
 }
 
-
 /*
  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
  */
-void
-s_lock(volatile slock_t *lock, const char *file, int line)
+int
+s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
 {
-       /*
-        * We loop tightly for awhile, then delay using pg_usleep() and try again.
-        * Preferably, "awhile" should be a small multiple of the maximum time we
-        * expect a spinlock to be held.  100 iterations seems about right as an
-        * initial guess.  However, on a uniprocessor the loop is a waste of
-        * cycles, while in a multi-CPU scenario it's usually better to spin a bit
-        * longer than to call the kernel, so we try to adapt the spin loop count
-        * depending on whether we seem to be in a uniprocessor or multiprocessor.
-        *
-        * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
-        * be wrong; there are platforms where that can result in a "stuck
-        * spinlock" failure.  This has been seen particularly on Alphas; it seems
-        * that the first TAS after returning from kernel space will always fail
-        * on that hardware.
-        *
-        * Once we do decide to block, we use randomly increasing pg_usleep()
-        * delays. The first delay is 1 msec, then the delay randomly increases to
-        * about one second, after which we reset to 1 msec and start again.  The
-        * idea here is that in the presence of heavy contention we need to
-        * increase the delay, else the spinlock holder may never get to run and
-        * release the lock.  (Consider situation where spinlock holder has been
-        * nice'd down in priority by the scheduler --- it will not get scheduled
-        * until all would-be acquirers are sleeping, so if we always use a 1-msec
-        * sleep, there is a real possibility of starvation.)  But we can't just
-        * clamp the delay to an upper bound, else it would take a long time to
-        * make a reasonable number of tries.
-        *
-        * We time out and declare error after NUM_DELAYS delays (thus, exactly
-        * that many tries).  With the given settings, this will usually take 2 or
-        * so minutes.  It seems better to fix the total number of tries (and thus
-        * the probability of unintended failure) than to fix the total time
-        * spent.
-        *
-        * The pg_usleep() delays are measured in milliseconds because 1 msec is a
-        * common resolution limit at the OS level for newer platforms. On older
-        * platforms the resolution limit is usually 10 msec, in which case the
-        * total delay before timeout will be a bit more.
-        */
-#define MIN_SPINS_PER_DELAY 10
-#define MAX_SPINS_PER_DELAY 1000
-#define NUM_DELAYS                     1000
-#define MIN_DELAY_MSEC         1
-#define MAX_DELAY_MSEC         1000
+       SpinDelayStatus delayStatus;
 
-       int                     spins = 0;
-       int                     delays = 0;
-       int                     cur_delay = 0;
+       init_spin_delay(&delayStatus, file, line, func);
 
-       while (TAS(lock))
+       while (TAS_SPIN(lock))
        {
-               /* CPU-specific delay each time through the loop */
-               SPIN_DELAY();
+               perform_spin_delay(&delayStatus);
+       }
 
-               /* Block the process every spins_per_delay tries */
-               if (++spins >= spins_per_delay)
-               {
-                       if (++delays > NUM_DELAYS)
-                               s_lock_stuck(lock, file, line);
+       finish_spin_delay(&delayStatus);
 
-                       if (cur_delay == 0) /* first time to delay? */
-                               cur_delay = MIN_DELAY_MSEC;
+       return delayStatus.delays;
+}
 
-                       pg_usleep(cur_delay * 1000L);
+#ifdef USE_DEFAULT_S_UNLOCK
+void
+s_unlock(volatile slock_t *lock)
+{
+#ifdef TAS_ACTIVE_WORD
+       /* HP's PA-RISC */
+       *TAS_ACTIVE_WORD(lock) = -1;
+#else
+       *lock = 0;
+#endif
+}
+#endif
+
+/*
+ * Wait while spinning on a contended spinlock.
+ */
+void
+perform_spin_delay(SpinDelayStatus *status)
+{
+       /* CPU-specific delay each time through the loop */
+       SPIN_DELAY();
+
+       /* Block the process every spins_per_delay tries */
+       if (++(status->spins) >= spins_per_delay)
+       {
+               if (++(status->delays) > NUM_DELAYS)
+                       s_lock_stuck(status->file, status->line, status->func);
+
+               if (status->cur_delay == 0)             /* first time to delay? */
+                       status->cur_delay = MIN_DELAY_USEC;
+
+               pg_usleep(status->cur_delay);
 
 #if defined(S_LOCK_TEST)
-                       fprintf(stdout, "*");
-                       fflush(stdout);
+               fprintf(stdout, "*");
+               fflush(stdout);
 #endif
 
-                       /* increase delay by a random fraction between 1X and 2X */
-                       cur_delay += (int) (cur_delay *
+               /* increase delay by a random fraction between 1X and 2X */
+               status->cur_delay += (int) (status->cur_delay *
                                          ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
-                       /* wrap back to minimum delay when max is exceeded */
-                       if (cur_delay > MAX_DELAY_MSEC)
-                               cur_delay = MIN_DELAY_MSEC;
+               /* wrap back to minimum delay when max is exceeded */
+               if (status->cur_delay > MAX_DELAY_USEC)
+                       status->cur_delay = MIN_DELAY_USEC;
 
-                       spins = 0;
-               }
+               status->spins = 0;
        }
+}
 
        /*
-        * If we were able to acquire the lock without delaying, it's a good
-        * indication we are in a multiprocessor.  If we had to delay, it's a sign
-        * (but not a sure thing) that we are in a uniprocessor. Hence, we
-        * decrement spins_per_delay slowly when we had to delay, and increase it
-        * rapidly when we didn't.  It's expected that spins_per_delay will
-        * converge to the minimum value on a uniprocessor and to the maximum
-        * value on a multiprocessor.
-        *
-        * Note: spins_per_delay is local within our current process. We want to
-        * average these observations across multiple backends, since it's
-        * relatively rare for this function to even get entered, and so a single
-        * backend might not live long enough to converge on a good value.      That
-        * is handled by the two routines below.
-        */
-       if (cur_delay == 0)
+       * After acquiring a spinlock, update estimates about how long to loop.
+       *
+       * If we were able to acquire the lock without delaying, it's a good
+       * indication we are in a multiprocessor.  If we had to delay, it's a sign
+       * (but not a sure thing) that we are in a uniprocessor. Hence, we
+       * decrement spins_per_delay slowly when we had to delay, and increase it
+       * rapidly when we didn't.  It's expected that spins_per_delay will
+       * converge to the minimum value on a uniprocessor and to the maximum
+       * value on a multiprocessor.
+       *
+       * Note: spins_per_delay is local within our current process. We want to
+       * average these observations across multiple backends, since it's
+       * relatively rare for this function to even get entered, and so a single
+       * backend might not live long enough to converge on a good value.       That
+       * is handled by the two routines below.
+       */
+void
+finish_spin_delay(SpinDelayStatus *status)
+{
+       if (status->cur_delay == 0)
        {
                /* we never had to delay */
                if (spins_per_delay < MAX_SPINS_PER_DELAY)
@@ -169,7 +198,7 @@ set_spins_per_delay(int shared_spins_per_delay)
 }
 
 /*
- * Recompute shared estimate of spins_per_delay during backend exit.
+ * Update shared estimate of spins_per_delay during backend exit.
  *
  * NB: this has to be pretty fast as it is called while holding a spinlock
  */
@@ -179,7 +208,7 @@ recompute_spins_per_delay(int shared_spins_per_delay)
        /*
         * We use an exponential moving average with a relatively slow adaption
         * rate, so that noise in any one backend's result won't affect the shared
-        * value too much.      As long as both inputs are within the allowed range,
+        * value too much.  As long as both inputs are within the allowed range,
         * the result must be too, so we need not worry about clamping the result.
         *
         * We deliberately truncate rather than rounding; this is so that single
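
A rough back-of-envelope check on the "2 or so minutes" figure in the new comment at the top of s_lock.c, assuming the constants added above (MIN_DELAY_USEC = 1 msec, MAX_DELAY_USEC = 1 sec, NUM_DELAYS = 1000) and an average growth factor of about 1.5x per sleep:

    sleeps needed to ramp 1 msec -> 1 sec   ~ log(1000) / log(1.5) ~ 17
    wall time per ramp (geometric sum)      ~ 1 msec * (1.5^17 - 1) / 0.5 ~ 2 sec
    total for NUM_DELAYS = 1000 sleeps      ~ (1000 / 17) * 2 sec ~ 2 minutes

so a stuck spinlock is reported after roughly two to three minutes of sleeping, dominated by the longer delays near the 1-second cap.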

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/a1603088/src/include/storage/s_lock.h
----------------------------------------------------------------------
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index dfdacb7..45981e0 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -12,10 +12,11 @@
  *     void S_INIT_LOCK(slock_t *lock)
  *             Initialize a spinlock (to the unlocked state).
  *
- *     void S_LOCK(slock_t *lock)
+ *     int S_LOCK(slock_t *lock)
  *             Acquire a spinlock, waiting if necessary.
  *             Time out and abort() if unable to acquire the lock in a
  *             "reasonable" amount of time --- typically ~ 1 minute.
+ *             Should return number of "delays"; see s_lock.c
  *
  *     void S_UNLOCK(slock_t *lock)
  *             Unlock a previously acquired lock.
@@ -31,25 +32,47 @@
  *     macros at the bottom of the file.  Check if your platform can use
  *     these or needs to override them.
  *
- *  Usually, S_LOCK() is implemented in terms of an even lower-level macro
- *     TAS():
+ *  Usually, S_LOCK() is implemented in terms of even lower-level macros
+ *     TAS() and TAS_SPIN():
  *
  *     int TAS(slock_t *lock)
  *             Atomic test-and-set instruction.  Attempt to acquire the lock,
  *             but do *not* wait.      Returns 0 if successful, nonzero if unable
  *             to acquire the lock.
  *
- *     TAS() is NOT part of the API, and should never be called directly.
+ *     int TAS_SPIN(slock_t *lock)
+ *             Like TAS(), but this version is used when waiting for a lock
+ *             previously found to be contended.  By default, this is the
+ *             same as TAS(), but on some architectures it's better to poll a
+ *             contended lock using an unlocked instruction and retry the
+ *             atomic test-and-set only when it appears free.
  *
- *     CAUTION: on some platforms TAS() may sometimes report failure to acquire
- *     a lock even when the lock is not locked.  For example, on Alpha TAS()
- *     will "fail" if interrupted.  Therefore TAS() should always be invoked
- *     in a retry loop, even if you are certain the lock is free.
+ *     TAS() and TAS_SPIN() are NOT part of the API, and should never be called
+ *     directly.
  *
- *     ANOTHER CAUTION: be sure that TAS() and S_UNLOCK() represent sequence
- *     points, ie, loads and stores of other values must not be moved across
- *     a lock or unlock.  In most cases it suffices to make the operation be
- *     done through a "volatile" pointer.
+ *     CAUTION: on some platforms TAS() and/or TAS_SPIN() may sometimes report
+ *     failure to acquire a lock even when the lock is not locked.  For example,
+ *     on Alpha TAS() will "fail" if interrupted.  Therefore a retry loop must
+ *     always be used, even if you are certain the lock is free.
+ *
+ *     It is the responsibility of these macros to make sure that the compiler
+ *     does not re-order accesses to shared memory to precede the actual lock
+ *     acquisition, or follow the lock release.  Prior to PostgreSQL 9.5, this
+ *     was the caller's responsibility, which meant that callers had to use
+ *     volatile-qualified pointers to refer to both the spinlock itself and the
+ *     shared data being accessed within the spinlocked critical section.  This
+ *     was notationally awkward, easy to forget (and thus error-prone), and
+ *     prevented some useful compiler optimizations.  For these reasons, we
+ *     now require that the macros themselves prevent compiler re-ordering,
+ *     so that the caller doesn't need to take special precautions.
+ *
+ *     On platforms with weak memory ordering, the TAS(), TAS_SPIN(), and
+ *     S_UNLOCK() macros must further include hardware-level memory fence
+ *     instructions to prevent similar re-ordering at the hardware level.
+ *     TAS() and TAS_SPIN() must guarantee that loads and stores issued after
+ *     the macro are not executed until the lock has been obtained.  Conversely,
+ *     S_UNLOCK() must guarantee that loads and stores issued before the macro
+ *     have been executed before the lock is released.
  *
  *     On most supported platforms, TAS() uses a tas() function written
  *     in assembly language to execute a hardware atomic-test-and-set
@@ -63,21 +86,19 @@
  *     when using the SysV semaphore code.
  *
  *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- *       $PostgreSQL: pgsql/src/include/storage/s_lock.h,v 1.171 2010/01/05 11:06:28 mha Exp $
+ *       src/include/storage/s_lock.h
  *
  *-------------------------------------------------------------------------
  */
 #ifndef S_LOCK_H
 #define S_LOCK_H
 
-#include "storage/pg_sema.h"
 
 #ifdef HAVE_SPINLOCKS  /* skip spinlocks if requested */
 
-
 #if defined(__GNUC__) || defined(__INTEL_COMPILER)
 /*************************************************************************
  * All the gcc inlines
@@ -124,6 +145,12 @@ tas(volatile slock_t *lock)
         * Use a non-locking test before asserting the bus lock.  Note that the
         * extra test appears to be a small loss on some x86 platforms and a small
         * win on others; it's by no means clear that we should keep it.
+        *
+        * When this was last tested, we didn't have separate TAS() and TAS_SPIN()
+        * macros.  Nowadays it probably would be better to do a non-locking test
+        * in TAS_SPIN() but not in TAS(), like on x86_64, but no-one's done the
+        * testing to verify that.  Without some empirical evidence, better to
+        * leave it alone.
         */
        __asm__ __volatile__(
                "       cmpb    $0,%1   \n"
@@ -132,7 +159,7 @@ tas(volatile slock_t *lock)
                "       xchgb   %0,%1   \n"
                "1: \n"
 :              "+q"(_res), "+m"(*lock)
-:
+:              /* no inputs */
 :              "memory", "cc");
        return (int) _res;
 }
@@ -179,21 +206,27 @@ typedef unsigned char slock_t;
 
 #define TAS(lock) tas(lock)
 
+/*
+ * On Intel EM64T, it's a win to use a non-locking test before the xchg proper,
+ * but only when spinning.
+ *
+ * See also Implementing Scalable Atomic Locks for Multi-Core Intel(tm) EM64T
+ * and IA32, by Michael Chynoweth and Mary R. Lee. As of this writing, it is
+ * available at:
+ * http://software.intel.com/en-us/articles/implementing-scalable-atomic-locks-for-multi-core-intel-em64t-and-ia32-architectures
+ */
+#define TAS_SPIN(lock)    (*(lock) ? 1 : TAS(lock))
+
 static __inline__ int
 tas(volatile slock_t *lock)
 {
        register slock_t _res = 1;
 
-       /*
-        * On Opteron, using a non-locking test before the locking instruction
-        * is a huge loss.  On EM64T, it appears to be a wash or small loss,
-        * so we needn't bother to try to distinguish the sub-architectures.
-        */
        __asm__ __volatile__(
                "       lock                    \n"
                "       xchgb   %0,%1   \n"
 :              "+q"(_res), "+m"(*lock)
-:
+:              /* no inputs */
 :              "memory", "cc");
        return (int) _res;
 }
@@ -214,13 +247,36 @@ spin_delay(void)
 #endif  /* __x86_64__ */
 
 
-#if defined(__ia64__) || defined(__ia64)       /* Intel Itanium */
+#if defined(__ia64__) || defined(__ia64)
+/*
+ * Intel Itanium, gcc or Intel's compiler.
+ *
+ * Itanium has weak memory ordering, but we rely on the compiler to enforce
+ * strict ordering of accesses to volatile data.  In particular, while the
+ * xchg instruction implicitly acts as a memory barrier with 'acquire'
+ * semantics, we do not have an explicit memory fence instruction in the
+ * S_UNLOCK macro.  We use a regular assignment to clear the spinlock, and
+ * trust that the compiler marks the generated store instruction with the
+ * ".rel" opcode.
+ *
+ * Testing shows that assumption to hold on gcc, although I could not find
+ * any explicit statement on that in the gcc manual.  In Intel's compiler,
+ * the -m[no-]serialize-volatile option controls that, and testing shows that
+ * it is enabled by default.
+ *
+ * While icc accepts gcc asm blocks on x86[_64], this is not true on ia64
+ * (at least not in icc versions before 12.x).  So we have to carry a separate
+ * compiler-intrinsic-based implementation for it.
+ */
 #define HAS_TEST_AND_SET
 
 typedef unsigned int slock_t;
 
 #define TAS(lock) tas(lock)
 
+/* On IA64, it's a win to use a non-locking test before the xchg proper */
+#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock))
+
 #ifndef __INTEL_COMPILER
 
 static __inline__ int
@@ -248,31 +304,37 @@ tas(volatile slock_t *lock)
        return ret;
 }
 
+/* icc can't use the regular gcc S_UNLOCK() macro either in this case */
+#define S_UNLOCK(lock) \
+       do { __memory_barrier(); *(lock) = 0; } while (0)
+
 #endif /* __INTEL_COMPILER */
 #endif  /* __ia64__ || __ia64 */
 
-
-#if defined(__arm__) || defined(__arm)
+/*
+ * On ARM and ARM64, we use __sync_lock_test_and_set(int *, int) if available.
+ *
+ * We use the int-width variant of the builtin because it works on more chips
+ * than other widths.
+ */
+#if defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(__aarch64)
+#ifdef HAVE_GCC__SYNC_INT32_TAS
 #define HAS_TEST_AND_SET
 
-typedef unsigned char slock_t;
-
 #define TAS(lock) tas(lock)
 
+typedef int slock_t;
+
 static __inline__ int
 tas(volatile slock_t *lock)
 {
-       register slock_t _res = 1;
-
-       __asm__ __volatile__(
-               "       swpb    %0, %0, [%2]    \n"
-:              "+r"(_res), "+m"(*lock)
-:              "r"(lock)
-:              "memory");
-       return (int) _res;
+       return __sync_lock_test_and_set(lock, 1);
 }
 
-#endif  /* __arm__ */
+#define S_UNLOCK(lock) __sync_lock_release(lock)
+
+#endif  /* HAVE_GCC__SYNC_INT32_TAS */
+#endif  /* __arm__ || __arm || __aarch64__ || __aarch64 */
 
 
 /* S/390 and S/390x Linux (32- and 64-bit zSeries) */
@@ -300,6 +362,12 @@ tas(volatile slock_t *lock)
 
 
 #if defined(__sparc__)         /* Sparc */
+/*
+ * Solaris has always run sparc processors in TSO (total store) mode, but
+ * linux didn't use to and the *BSDs still don't. So, be careful about
+ * acquire/release semantics. The CPU will treat superfluous membars as
+ * NOPs, so it's just code space.
+ */
 #define HAS_TEST_AND_SET
 
 typedef unsigned char slock_t;
@@ -321,9 +389,51 @@ tas(volatile slock_t *lock)
 :              "=r"(_res), "+m"(*lock)
 :              "r"(lock)
 :              "memory");
+#if defined(__sparcv7) || defined(__sparc_v7__)
+       /*
+        * No stbar or membar available, luckily no actually produced hardware
+        * requires a barrier.
+        */
+#elif defined(__sparcv8) || defined(__sparc_v8__)
+       /* stbar is available (and required for both PSO, RMO), membar isn't */
+       __asm__ __volatile__ ("stbar     \n":::"memory");
+#else
+       /*
+        * #LoadStore (RMO) | #LoadLoad (RMO) together are the appropriate acquire
+        * barrier for sparcv8+ upwards.
+        */
+       __asm__ __volatile__ ("membar #LoadStore | #LoadLoad \n":::"memory");
+#endif
        return (int) _res;
 }
 
+#if defined(__sparcv7) || defined(__sparc_v7__)
+/*
+ * No stbar or membar available, luckily no actually produced hardware
+ * requires a barrier.  We fall through to the default gcc definition of
+ * S_UNLOCK in this case.
+ */
+#elif defined(__sparcv8) || defined(__sparc_v8__)
+/* stbar is available (and required for both PSO, RMO), membar isn't */
+#define S_UNLOCK(lock) \
+do \
+{ \
+       __asm__ __volatile__ ("stbar     \n":::"memory"); \
+       *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+#else
+/*
+ * #LoadStore (RMO) | #StoreStore (RMO, PSO) together are the appropriate
+ * release barrier for sparcv8+ upwards.
+ */
+#define S_UNLOCK(lock) \
+do \
+{ \
+       __asm__ __volatile__ ("membar #LoadStore | #StoreStore \n":::"memory"); \
+       *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+#endif
+
 #endif  /* __sparc__ */
 
 
@@ -331,16 +441,23 @@ tas(volatile slock_t *lock)
 #if defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
 #define HAS_TEST_AND_SET
 
-#if defined(__ppc64__) || defined(__powerpc64__)
-typedef unsigned long slock_t;
-#else
 typedef unsigned int slock_t;
-#endif
 
 #define TAS(lock) tas(lock)
+
+/* On PPC, it's a win to use a non-locking test before the lwarx */
+#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock))
+
 /*
  * NOTE: per the Enhanced PowerPC Architecture manual, v1.0 dated 7-May-2002,
  * an isync is a sufficient synchronization barrier after a lwarx/stwcx loop.
+ * On newer machines, we can use lwsync instead for better performance.
+ *
+ * Ordinarily, we'd code the branches here using GNU-style local symbols, that
+ * is "1f" referencing "1:" and so on.  But some people run gcc on AIX with
+ * IBM's assembler as backend, and IBM's assembler doesn't do local symbols.
+ * So hand-code the branch offsets; fortunately, all PPC instructions are
+ * exactly 4 bytes each, so it's not too hard to count.
  */
 static __inline__ int
 tas(volatile slock_t *lock)
@@ -349,18 +466,24 @@ tas(volatile slock_t *lock)
        int _res;
 
        __asm__ __volatile__(
+#ifdef USE_PPC_LWARX_MUTEX_HINT
+"      lwarx   %0,0,%3,1       \n"
+#else
 "      lwarx   %0,0,%3         \n"
+#endif
 "      cmpwi   %0,0            \n"
-"      bne     1f                      \n"
+"      bne     $+16            \n"             /* branch to li %1,1 */
 "      addi    %0,%0,1         \n"
 "      stwcx.  %0,0,%3         \n"
-"      beq     2f              \n"
-"1:    li      %1,1            \n"
-"      b               3f                      \n"
-"2:                                            \n"
+"      beq     $+12            \n"             /* branch to lwsync/isync */
+"      li      %1,1            \n"
+"      b       $+12            \n"             /* branch to end of asm sequence */
+#ifdef USE_PPC_LWSYNC
+"      lwsync                          \n"
+#else
 "      isync                           \n"
+#endif
 "      li      %1,0            \n"
-"3:                                            \n"
 
 :      "=&r"(_t), "=r"(_res), "+m"(*lock)
 :      "r"(lock)
@@ -368,13 +491,25 @@ tas(volatile slock_t *lock)
        return _res;
 }
 
-/* PowerPC S_UNLOCK is almost standard but requires a "sync" instruction */
+/*
+ * PowerPC S_UNLOCK is almost standard but requires a "sync" instruction.
+ * On newer machines, we can use lwsync instead for better performance.
+ */
+#ifdef USE_PPC_LWSYNC
 #define S_UNLOCK(lock) \
 do \
 { \
-       __asm__ __volatile__ (" sync \n"); \
+       __asm__ __volatile__ (" lwsync \n" ::: "memory"); \
        *((volatile slock_t *) (lock)) = 0; \
 } while (0)
+#else
+#define S_UNLOCK(lock) \
+do \
+{ \
+       __asm__ __volatile__ (" sync \n" ::: "memory"); \
+       *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+#endif /* USE_PPC_LWSYNC */
 
 #endif /* powerpc */
 
@@ -397,7 +532,7 @@ tas(volatile slock_t *lock)
                "       tas             %1              \n"
                "       sne             %0              \n"
 :              "=d"(rv), "+m"(*lock)
-:
+:              /* no inputs */
 :              "memory", "cc");
        return rv;
 }
@@ -435,76 +570,6 @@ tas(volatile slock_t *lock)
 #endif  /* __vax__ */
 
 
-#if defined(__ns32k__)         /* National Semiconductor 32K */
-#define HAS_TEST_AND_SET
-
-typedef unsigned char slock_t;
-
-#define TAS(lock) tas(lock)
-
-static __inline__ int
-tas(volatile slock_t *lock)
-{
-       register int    _res;
-
-       __asm__ __volatile__(
-               "       sbitb   0, %1   \n"
-               "       sfsd    %0              \n"
-:              "=r"(_res), "+m"(*lock)
-:
-:              "memory");
-       return _res;
-}
-
-#endif  /* __ns32k__ */
-
-
-#if defined(__alpha) || defined(__alpha__)     /* Alpha */
-/*
- * Correct multi-processor locking methods are explained in section 5.5.3
- * of the Alpha AXP Architecture Handbook, which at this writing can be
- * found at ftp://ftp.netbsd.org/pub/NetBSD/misc/dec-docs/index.html.
- * For gcc we implement the handbook's code directly with inline assembler.
- */
-#define HAS_TEST_AND_SET
-
-typedef unsigned long slock_t;
-
-#define TAS(lock)  tas(lock)
-
-static __inline__ int
-tas(volatile slock_t *lock)
-{
-       register slock_t _res;
-
-       __asm__ __volatile__(
-               "       ldq             $0, %1  \n"
-               "       bne             $0, 2f  \n"
-               "       ldq_l   %0, %1  \n"
-               "       bne             %0, 2f  \n"
-               "       mov             1,  $0  \n"
-               "       stq_c   $0, %1  \n"
-               "       beq             $0, 2f  \n"
-               "       mb                              \n"
-               "       br              3f              \n"
-               "2:     mov             1, %0   \n"
-               "3:                                     \n"
-:              "=&r"(_res), "+m"(*lock)
-:
-:              "memory", "0");
-       return (int) _res;
-}
-
-#define S_UNLOCK(lock) \
-do \
-{\
-       __asm__ __volatile__ (" mb \n"); \
-       *((volatile slock_t *) (lock)) = 0; \
-} while (0)
-
-#endif /* __alpha || __alpha__ */
-
-
 #if defined(__mips__) && !defined(__sgi)       /* non-SGI MIPS */
 /* Note: on SGI we use the OS' mutex ABI, see below */
 /* Note: R10000 processors require a separate SYNC */
@@ -534,7 +599,7 @@ tas(volatile slock_t *lock)
                "       sync                \n"
                "       .set pop              "
 :              "=&r" (_res), "=&r" (_tmp), "+R" (*_l)
-:
+:              /* no inputs */
 :              "memory");
        return _res;
 }
@@ -549,7 +614,10 @@ do \
                "       .set noreorder      \n" \
                "       .set nomacro        \n" \
                "       sync                \n" \
-               "       .set pop              "); \
+               "       .set pop              " \
+:              /* no outputs */ \
+:              /* no inputs */ \
+:              "memory"); \
        *((volatile slock_t *) (lock)) = 0; \
 } while (0)
 
@@ -607,6 +675,20 @@ tas(volatile slock_t *lock)
 typedef unsigned char slock_t;
 #endif
 
+/*
+ * Default implementation of S_UNLOCK() for gcc/icc.
+ *
+ * Note that this implementation is unsafe for any platform that can reorder
+ * a memory access (either load or store) after a following store.  That
+ * happens not to be possible on x86 and most legacy architectures (some are
+ * single-processor!), but many modern systems have weaker memory ordering.
+ * Those that do must define their own version of S_UNLOCK() rather than
+ * relying on this one.
+ */
+#if !defined(S_UNLOCK)
+#define S_UNLOCK(lock) \
+       do { __asm__ __volatile__("" : : : "memory");  *(lock) = 0; } while (0)
+#endif
 
 #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
 
@@ -631,7 +713,7 @@ typedef unsigned char slock_t;
 asm int
 tas(volatile slock_t *s_lock)
 {
-/* UNIVEL wants %mem in column 1, so we don't pg_indent this file */
+/* UNIVEL wants %mem in column 1, so we don't pgindent this file */
 %mem s_lock
        pushl %ebx
        movl s_lock, %ebx
@@ -644,27 +726,6 @@ tas(volatile slock_t *s_lock)
 #endif  /* defined(USE_UNIVEL_CC) */
 
 
-#if defined(__alpha) || defined(__alpha__)     /* Tru64 Unix Alpha compiler */
-/*
- * The Tru64 compiler doesn't support gcc-style inline asm, but it does
- * have some builtin functions that accomplish much the same results.
- * For simplicity, slock_t is defined as long (ie, quadword) on Alpha
- * regardless of the compiler in use.  LOCK_LONG and UNLOCK_LONG only
- * operate on an int (ie, longword), but that's OK as long as we define
- * S_INIT_LOCK to zero out the whole quadword.
- */
-#define HAS_TEST_AND_SET
-
-typedef unsigned long slock_t;
-
-#include <alpha/builtins.h>
-#define S_INIT_LOCK(lock)  (*(lock) = 0)
-#define TAS(lock)                 (__LOCK_LONG_RETRY((lock), 1) == 0)
-#define S_UNLOCK(lock)    __UNLOCK_LONG(lock)
-
-#endif  /* __alpha || __alpha__ */
-
-
 #if defined(__hppa) || defined(__hppa__)       /* HP PA-RISC, GCC and HP compilers */
 /*
  * HP's PA-RISC
@@ -701,9 +762,22 @@ tas(volatile slock_t *lock)
        return (lockval == 0);
 }
 
-#endif /* __GNUC__ */
+/*
+ * The hppa implementation doesn't follow the rules of this files and provides
+ * a gcc specific implementation outside of the above defined(__GNUC__). It
+ * does so to avoid duplication between the HP compiler and gcc. So undefine
+ * the generic fallback S_UNLOCK from above.
+ */
+#ifdef S_UNLOCK
+#undef S_UNLOCK
+#endif
+#define S_UNLOCK(lock) \
+       do { \
+               __asm__ __volatile__("" : : : "memory"); \
+               *TAS_ACTIVE_WORD(lock) = -1; \
+       } while (0)
 
-#define S_UNLOCK(lock) (*TAS_ACTIVE_WORD(lock) = -1)
+#endif /* __GNUC__ */
 
 #define S_INIT_LOCK(lock) \
        do { \
@@ -720,59 +794,31 @@ tas(volatile slock_t *lock)
 
 
 #if defined(__hpux) && defined(__ia64) && !defined(__GNUC__)
-
-#define HAS_TEST_AND_SET
-
-typedef unsigned int slock_t;
-
-#include <ia64/sys/inline.h>
-#define TAS(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE)
-
-#endif /* HPUX on IA64, non gcc */
-
-
-#if defined(__sgi)     /* SGI compiler */
 /*
- * SGI IRIX 5
- * slock_t is defined as a unsigned long. We use the standard SGI
- * mutex API.
+ * HP-UX on Itanium, non-gcc/icc compiler
  *
- * The following comment is left for historical reasons, but is probably
- * not a good idea since the mutex ABI is supported.
+ * We assume that the compiler enforces strict ordering of loads/stores on
+ * volatile data (see comments on the gcc-version earlier in this file).
+ * Note that this assumption does *not* hold if you use the
+ * +Ovolatile=__unordered option on the HP-UX compiler, so don't do that.
  *
- * This stuff may be supplemented in the future with Masato Kataoka's MIPS-II
- * assembly from his NECEWS SVR4 port, but we probably ought to retain this
- * for the R3000 chips out there.
- */
-#define HAS_TEST_AND_SET
-
-typedef unsigned long slock_t;
-
-#include "mutex.h"
-#define TAS(lock)      (test_and_set(lock,1))
-#define S_UNLOCK(lock) (test_then_and(lock,0))
-#define S_INIT_LOCK(lock)      (test_then_and(lock,0))
-#define S_LOCK_FREE(lock)      (test_then_add(lock,0) == 0)
-#endif  /* __sgi */
-
-
-#if defined(sinix)             /* Sinix */
-/*
- * SINIX / Reliant UNIX
- * slock_t is defined as a struct abilock_t, which has a single unsigned long
- * member. (Basically same as SGI)
+ * See also Implementing Spinlocks on the Intel Itanium Architecture and
+ * PA-RISC, by Tor Ekqvist and David Graves, for more information.  As of
+ * this writing, version 1.0 of the manual is available at:
+ * http://h21007.www2.hp.com/portal/download/files/unprot/itanium/spinlocks.pdf
  */
 #define HAS_TEST_AND_SET
 
-#include "abi_mutex.h"
-typedef abilock_t slock_t;
+typedef unsigned int slock_t;
 
-#define TAS(lock)      (!acquire_lock(lock))
-#define S_UNLOCK(lock) release_lock(lock)
-#define S_INIT_LOCK(lock)      init_lock(lock)
-#define S_LOCK_FREE(lock)      (stat_lock(lock) == UNLOCKED)
-#endif  /* sinix */
+#include <ia64/sys/inline.h>
+#define TAS(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE)
+/* On IA64, it's a win to use a non-locking test before the xchg proper */
+#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock))
+#define S_UNLOCK(lock) \
+       do { _Asm_mf(); (*(lock)) = 0; } while (0)
 
+#endif /* HPUX on IA64, non gcc/icc */
 
 #if defined(_AIX)      /* AIX */
 /*
@@ -789,28 +835,7 @@ typedef int slock_t;
 #endif  /* _AIX */
 
 
-#if defined (nextstep)         /* Nextstep */
-#define HAS_TEST_AND_SET
-
-typedef struct mutex slock_t;
-
-#define S_LOCK(lock)   mutex_lock(lock)
-#define S_UNLOCK(lock) mutex_unlock(lock)
-#define S_INIT_LOCK(lock)      mutex_init(lock)
-/* For Mach, we have to delve inside the entrails of `struct mutex'.  Ick! */
-#define S_LOCK_FREE(alock)     ((alock)->lock == 0)
-#endif  /* nextstep */
-
-
-/* These are in s_lock.c */
-
-
-#if defined(sun3)              /* Sun3 */
-#define HAS_TEST_AND_SET
-
-typedef unsigned char slock_t;
-#endif
-
+/* These are in sunstudio_(sparc|x86).s */
 
 #if defined(__SUNPRO_C) && (defined(__i386) || defined(__x86_64__) || defined(__sparc__) || defined(__sparc))
 #define HAS_TEST_AND_SET
@@ -854,9 +879,15 @@ spin_delay(void)
 }
 #endif
 
+#include <intrin.h>
+#pragma intrinsic(_ReadWriteBarrier)
+
+#define S_UNLOCK(lock) \
+       do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0)
+
 #endif
 
-  
+
 #endif /* !defined(HAS_TEST_AND_SET) */
 
 
@@ -874,16 +905,16 @@ spin_delay(void)
  * to fall foul of kernel limits on number of semaphores, so don't use this
  * unless you must!  The subroutines appear in spin.c.
  */
-typedef PGSemaphoreData slock_t;
+typedef int slock_t;
 
 extern bool s_lock_free_sema(volatile slock_t *lock);
 extern void s_unlock_sema(volatile slock_t *lock);
-extern void s_init_lock_sema(volatile slock_t *lock);
+extern void s_init_lock_sema(volatile slock_t *lock, bool nested);
 extern int     tas_sema(volatile slock_t *lock);
 
 #define S_LOCK_FREE(lock)      s_lock_free_sema(lock)
 #define S_UNLOCK(lock)  s_unlock_sema(lock)
-#define S_INIT_LOCK(lock)      s_init_lock_sema(lock)
+#define S_INIT_LOCK(lock)      s_init_lock_sema(lock, false)
 #define TAS(lock)      tas_sema(lock)
 
 
@@ -896,10 +927,7 @@ extern int tas_sema(volatile slock_t *lock);
 
 #if !defined(S_LOCK)
 #define S_LOCK(lock) \
-       do { \
-               if (TAS(lock)) \
-                       s_lock((lock), __FILE__, __LINE__); \
-       } while (0)
+       (TAS(lock) ? s_lock((lock), __FILE__, __LINE__, PG_FUNCNAME_MACRO) : 0)
 #endif  /* S_LOCK */
 
 #if !defined(S_LOCK_FREE)
@@ -907,7 +935,25 @@ extern int tas_sema(volatile slock_t *lock);
 #endif  /* S_LOCK_FREE */
 
 #if !defined(S_UNLOCK)
-#define S_UNLOCK(lock)         (*((volatile slock_t *) (lock)) = 0)
+/*
+ * Our default implementation of S_UNLOCK is essentially *(lock) = 0.  This
+ * is unsafe if the platform can reorder a memory access (either load or
+ * store) after a following store; platforms where this is possible must
+ * define their own S_UNLOCK.  But CPU reordering is not the only concern:
+ * if we simply defined S_UNLOCK() as an inline macro, the compiler might
+ * reorder instructions from inside the critical section to occur after the
+ * lock release.  Since the compiler probably can't know what the external
+ * function s_unlock is doing, putting the same logic there should be adequate.
+ * A sufficiently-smart globally optimizing compiler could break that
+ * assumption, though, and the cost of a function call for every spinlock
+ * release may hurt performance significantly, so we use this implementation
+ * only for platforms where we don't know of a suitable intrinsic.  For the
+ * most part, those are relatively obscure platform/compiler combinations to
+ * which the PostgreSQL project does not have access.
+ */
+#define USE_DEFAULT_S_UNLOCK
+extern void s_unlock(volatile slock_t *lock);
+#define S_UNLOCK(lock)         s_unlock(lock)
 #endif  /* S_UNLOCK */
 
 #if !defined(S_INIT_LOCK)
@@ -925,11 +971,16 @@ extern int        tas(volatile slock_t *lock);            /* in port/.../tas.s, or
 #define TAS(lock)              tas(lock)
 #endif  /* TAS */
 
+#if !defined(TAS_SPIN)
+#define TAS_SPIN(lock) TAS(lock)
+#endif  /* TAS_SPIN */
+
+extern slock_t dummy_spinlock;
 
 /*
  * Platform-independent out-of-line support routines
  */
-extern void s_lock(volatile slock_t *lock, const char *file, int line);
+extern int s_lock(volatile slock_t *lock, const char *file, int line, const char *func);
 
 /* Support for dynamic adjustment of spins_per_delay */
 #define DEFAULT_SPINS_PER_DELAY  100
@@ -937,4 +988,34 @@ extern void s_lock(volatile slock_t *lock, const char *file, int line);
 extern void set_spins_per_delay(int shared_spins_per_delay);
 extern int     recompute_spins_per_delay(int shared_spins_per_delay);
 
+/*
+ * Support for spin delay which is useful in various places where
+ * spinlock-like procedures take place.
+ */
+typedef struct
+{
+       int                     spins;
+       int                     delays;
+       int                     cur_delay;
+       const char *file;
+       int                     line;
+       const char *func;
+} SpinDelayStatus;
+
+static inline void
+init_spin_delay(SpinDelayStatus *status,
+                               const char *file, int line, const char *func)
+{
+       status->spins = 0;
+       status->delays = 0;
+       status->cur_delay = 0;
+       status->file = file;
+       status->line = line;
+       status->func = func;
+}
+
+#define init_local_spin_delay(status) init_spin_delay(status, __FILE__, __LINE__, PG_FUNCNAME_MACRO)
+void perform_spin_delay(SpinDelayStatus *status);
+void finish_spin_delay(SpinDelayStatus *status);
+
 #endif  /* S_LOCK_H */
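
For readers unfamiliar with the TAS_SPIN() idea used throughout this header: on x86_64, IA64 and PPC the contended-wait path now does a "test and test-and-set", i.e. it polls the lock word with an ordinary load and only pays for the atomic exchange once the lock looks free. An illustrative sketch of that pattern, not taken from this patch and assuming GCC's __sync_lock_test_and_set/__sync_lock_release builtins:

    typedef unsigned char my_slock_t;          /* hypothetical lock word */

    static inline int
    my_tas_spin(volatile my_slock_t *lock)
    {
        if (*lock)                             /* cheap unlocked read: still held */
            return 1;
        return (int) __sync_lock_test_and_set(lock, 1);   /* 0 => acquired */
    }

    static inline void
    my_s_unlock(volatile my_slock_t *lock)
    {
        __sync_lock_release(lock);             /* release-ordered store of 0 */
    }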

http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/a1603088/src/test/regress/checkinc.py
----------------------------------------------------------------------
diff --git a/src/test/regress/checkinc.py b/src/test/regress/checkinc.py
index 59addf7..6f4e006 100755
--- a/src/test/regress/checkinc.py
+++ b/src/test/regress/checkinc.py
@@ -48,6 +48,7 @@ fileset = {
     'ia64/sys/inline.h': [],
     'io.h':              [],
     'ioctl.h':           [],
+    'intrin.h':          [],
     'kernel/OS.h':       [],
     'kernel/image.h':    [],
     'libintl.h':         [],

