Repository: incubator-hawq Updated Branches: refs/heads/master 569f7545d -> a16030888
HAWQ-1030. Forward PostgreSQL spin lock peformance enhancement to HAWQ Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/a1603088 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/a1603088 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/a1603088 Branch: refs/heads/master Commit: a16030888bdc2d8e743207d3abfa5f8cd6c32129 Parents: 569f754 Author: Ming LI <[email protected]> Authored: Mon Aug 29 16:42:33 2016 +0800 Committer: Ming LI <[email protected]> Committed: Tue Aug 30 11:08:02 2016 +0800 ---------------------------------------------------------------------- src/backend/storage/lmgr/s_lock.c | 215 ++++++++------ src/include/storage/s_lock.h | 527 +++++++++++++++++++-------------- src/test/regress/checkinc.py | 1 + 3 files changed, 427 insertions(+), 316 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/a1603088/src/backend/storage/lmgr/s_lock.c ---------------------------------------------------------------------- diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c index 2e32b69..da8e781 100644 --- a/src/backend/storage/lmgr/s_lock.c +++ b/src/backend/storage/lmgr/s_lock.c @@ -3,13 +3,45 @@ * s_lock.c * Hardware-dependent implementation of spinlocks. * + * When waiting for a contended spinlock we loop tightly for awhile, then + * delay using pg_usleep() and try again. Preferably, "awhile" should be a + * small multiple of the maximum time we expect a spinlock to be held. 100 + * iterations seems about right as an initial guess. However, on a + * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario + * it's usually better to spin a bit longer than to call the kernel, so we try + * to adapt the spin loop count depending on whether we seem to be in a + * uniprocessor or multiprocessor. * - * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd + * be wrong; there are platforms where that can result in a "stuck + * spinlock" failure. This has been seen particularly on Alphas; it seems + * that the first TAS after returning from kernel space will always fail + * on that hardware. + * + * Once we do decide to block, we use randomly increasing pg_usleep() + * delays. The first delay is 1 msec, then the delay randomly increases to + * about one second, after which we reset to 1 msec and start again. The + * idea here is that in the presence of heavy contention we need to + * increase the delay, else the spinlock holder may never get to run and + * release the lock. (Consider situation where spinlock holder has been + * nice'd down in priority by the scheduler --- it will not get scheduled + * until all would-be acquirers are sleeping, so if we always use a 1-msec + * sleep, there is a real possibility of starvation.) But we can't just + * clamp the delay to an upper bound, else it would take a long time to + * make a reasonable number of tries. + * + * We time out and declare error after NUM_DELAYS delays (thus, exactly + * that many tries). With the given settings, this will usually take 2 or + * so minutes. It seems better to fix the total number of tries (and thus + * the probability of unintended failure) than to fix the total time + * spent. 
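/*
 * [Editorial sketch, not part of the patch.]  A standalone model of the
 * randomized back-off described above, using the same NUM_DELAYS /
 * MIN_DELAY_USEC / MAX_DELAY_USEC values the patch adds to s_lock.c.
 * Plain rand()/RAND_MAX stands in for PostgreSQL's random() and
 * MAX_RANDOM_VALUE, and the sleeps are summed rather than performed, so
 * the program only reports the wall-clock bound behind the "2 or so
 * minutes" figure.
 */
#include <stdio.h>
#include <stdlib.h>

#define NUM_DELAYS      1000
#define MIN_DELAY_USEC  1000L
#define MAX_DELAY_USEC  1000000L

int
main(void)
{
    long    cur_delay = 0;
    long    total_usec = 0;
    int     delays;

    srand(1);
    for (delays = 0; delays < NUM_DELAYS; delays++)
    {
        if (cur_delay == 0)         /* first time to delay? */
            cur_delay = MIN_DELAY_USEC;

        total_usec += cur_delay;    /* in s_lock.c: pg_usleep(cur_delay) */

        /* increase delay by a random fraction between 1X and 2X */
        cur_delay += (long) (cur_delay *
                             ((double) rand() / (double) RAND_MAX) + 0.5);

        /* wrap back to minimum delay when max is exceeded */
        if (cur_delay > MAX_DELAY_USEC)
            cur_delay = MIN_DELAY_USEC;
    }

    printf("stuck-spinlock timeout after roughly %.0f seconds\n",
           total_usec / 1000000.0);
    return 0;
}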
+ * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/lmgr/s_lock.c,v 1.50 2009/01/01 17:23:48 momjian Exp $ + * src/backend/storage/lmgr/s_lock.c * *------------------------------------------------------------------------- */ @@ -20,6 +52,14 @@ #include "storage/s_lock.h" +#define MIN_SPINS_PER_DELAY 10 +#define MAX_SPINS_PER_DELAY 1000 +#define NUM_DELAYS 1000 +#define MIN_DELAY_USEC 1000L +#define MAX_DELAY_USEC 1000000L + + +slock_t dummy_spinlock; static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; @@ -28,122 +68,111 @@ static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; * s_lock_stuck() - complain about a stuck spinlock */ static void -s_lock_stuck(volatile slock_t *lock, const char *file, int line) +s_lock_stuck(const char *file, int line, const char *func) { + if (!func) + func = "(unknown)"; #if defined(S_LOCK_TEST) fprintf(stderr, - "\nStuck spinlock (%p) detected at %s:%d.\n", - lock, file, line); + "\nStuck spinlock detected at %s, %s:%d.\n", + func, file, line); exit(1); #else - elog(PANIC, "stuck spinlock (%p) detected at %s:%d", - lock, file, line); + elog(PANIC, "stuck spinlock detected at %s, %s:%d", + func, file, line); #endif } - /* * s_lock(lock) - platform-independent portion of waiting for a spinlock. */ -void -s_lock(volatile slock_t *lock, const char *file, int line) +int +s_lock(volatile slock_t *lock, const char *file, int line, const char *func) { - /* - * We loop tightly for awhile, then delay using pg_usleep() and try again. - * Preferably, "awhile" should be a small multiple of the maximum time we - * expect a spinlock to be held. 100 iterations seems about right as an - * initial guess. However, on a uniprocessor the loop is a waste of - * cycles, while in a multi-CPU scenario it's usually better to spin a bit - * longer than to call the kernel, so we try to adapt the spin loop count - * depending on whether we seem to be in a uniprocessor or multiprocessor. - * - * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd - * be wrong; there are platforms where that can result in a "stuck - * spinlock" failure. This has been seen particularly on Alphas; it seems - * that the first TAS after returning from kernel space will always fail - * on that hardware. - * - * Once we do decide to block, we use randomly increasing pg_usleep() - * delays. The first delay is 1 msec, then the delay randomly increases to - * about one second, after which we reset to 1 msec and start again. The - * idea here is that in the presence of heavy contention we need to - * increase the delay, else the spinlock holder may never get to run and - * release the lock. (Consider situation where spinlock holder has been - * nice'd down in priority by the scheduler --- it will not get scheduled - * until all would-be acquirers are sleeping, so if we always use a 1-msec - * sleep, there is a real possibility of starvation.) But we can't just - * clamp the delay to an upper bound, else it would take a long time to - * make a reasonable number of tries. - * - * We time out and declare error after NUM_DELAYS delays (thus, exactly - * that many tries). With the given settings, this will usually take 2 or - * so minutes. It seems better to fix the total number of tries (and thus - * the probability of unintended failure) than to fix the total time - * spent. 
- * - * The pg_usleep() delays are measured in milliseconds because 1 msec is a - * common resolution limit at the OS level for newer platforms. On older - * platforms the resolution limit is usually 10 msec, in which case the - * total delay before timeout will be a bit more. - */ -#define MIN_SPINS_PER_DELAY 10 -#define MAX_SPINS_PER_DELAY 1000 -#define NUM_DELAYS 1000 -#define MIN_DELAY_MSEC 1 -#define MAX_DELAY_MSEC 1000 + SpinDelayStatus delayStatus; - int spins = 0; - int delays = 0; - int cur_delay = 0; + init_spin_delay(&delayStatus, file, line, func); - while (TAS(lock)) + while (TAS_SPIN(lock)) { - /* CPU-specific delay each time through the loop */ - SPIN_DELAY(); + perform_spin_delay(&delayStatus); + } - /* Block the process every spins_per_delay tries */ - if (++spins >= spins_per_delay) - { - if (++delays > NUM_DELAYS) - s_lock_stuck(lock, file, line); + finish_spin_delay(&delayStatus); - if (cur_delay == 0) /* first time to delay? */ - cur_delay = MIN_DELAY_MSEC; + return delayStatus.delays; +} - pg_usleep(cur_delay * 1000L); +#ifdef USE_DEFAULT_S_UNLOCK +void +s_unlock(volatile slock_t *lock) +{ +#ifdef TAS_ACTIVE_WORD + /* HP's PA-RISC */ + *TAS_ACTIVE_WORD(lock) = -1; +#else + *lock = 0; +#endif +} +#endif + +/* + * Wait while spinning on a contended spinlock. + */ +void +perform_spin_delay(SpinDelayStatus *status) +{ + /* CPU-specific delay each time through the loop */ + SPIN_DELAY(); + + /* Block the process every spins_per_delay tries */ + if (++(status->spins) >= spins_per_delay) + { + if (++(status->delays) > NUM_DELAYS) + s_lock_stuck(status->file, status->line, status->func); + + if (status->cur_delay == 0) /* first time to delay? */ + status->cur_delay = MIN_DELAY_USEC; + + pg_usleep(status->cur_delay); #if defined(S_LOCK_TEST) - fprintf(stdout, "*"); - fflush(stdout); + fprintf(stdout, "*"); + fflush(stdout); #endif - /* increase delay by a random fraction between 1X and 2X */ - cur_delay += (int) (cur_delay * + /* increase delay by a random fraction between 1X and 2X */ + status->cur_delay += (int) (status->cur_delay * ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5); - /* wrap back to minimum delay when max is exceeded */ - if (cur_delay > MAX_DELAY_MSEC) - cur_delay = MIN_DELAY_MSEC; + /* wrap back to minimum delay when max is exceeded */ + if (status->cur_delay > MAX_DELAY_USEC) + status->cur_delay = MIN_DELAY_USEC; - spins = 0; - } + status->spins = 0; } +} /* - * If we were able to acquire the lock without delaying, it's a good - * indication we are in a multiprocessor. If we had to delay, it's a sign - * (but not a sure thing) that we are in a uniprocessor. Hence, we - * decrement spins_per_delay slowly when we had to delay, and increase it - * rapidly when we didn't. It's expected that spins_per_delay will - * converge to the minimum value on a uniprocessor and to the maximum - * value on a multiprocessor. - * - * Note: spins_per_delay is local within our current process. We want to - * average these observations across multiple backends, since it's - * relatively rare for this function to even get entered, and so a single - * backend might not live long enough to converge on a good value. That - * is handled by the two routines below. - */ - if (cur_delay == 0) + * After acquiring a spinlock, update estimates about how long to loop. + * + * If we were able to acquire the lock without delaying, it's a good + * indication we are in a multiprocessor. If we had to delay, it's a sign + * (but not a sure thing) that we are in a uniprocessor. 
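/*
 * [Editorial sketch.]  The wait-loop shape that s_lock() now delegates to
 * perform_spin_delay()/finish_spin_delay(), shown as a hypothetical caller
 * spinning on its own flag.  Only SpinDelayStatus, init_local_spin_delay(),
 * perform_spin_delay() and finish_spin_delay() come from this patch; the
 * flag and the function wrapped around them are illustrative.
 */
#include "postgres.h"
#include "storage/s_lock.h"

static volatile int somebody_set_the_flag;  /* hypothetical shared flag */

void
wait_for_flag(void)
{
    SpinDelayStatus delayStatus;

    init_local_spin_delay(&delayStatus);
    while (!somebody_set_the_flag)
        perform_spin_delay(&delayStatus);   /* spin, then sleep with back-off */
    finish_spin_delay(&delayStatus);        /* feed result into spins_per_delay */
}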
Hence, we + * decrement spins_per_delay slowly when we had to delay, and increase it + * rapidly when we didn't. It's expected that spins_per_delay will + * converge to the minimum value on a uniprocessor and to the maximum + * value on a multiprocessor. + * + * Note: spins_per_delay is local within our current process. We want to + * average these observations across multiple backends, since it's + * relatively rare for this function to even get entered, and so a single + * backend might not live long enough to converge on a good value. That + * is handled by the two routines below. + */ +void +finish_spin_delay(SpinDelayStatus *status) +{ + if (status->cur_delay == 0) { /* we never had to delay */ if (spins_per_delay < MAX_SPINS_PER_DELAY) @@ -169,7 +198,7 @@ set_spins_per_delay(int shared_spins_per_delay) } /* - * Recompute shared estimate of spins_per_delay during backend exit. + * Update shared estimate of spins_per_delay during backend exit. * * NB: this has to be pretty fast as it is called while holding a spinlock */ @@ -179,7 +208,7 @@ recompute_spins_per_delay(int shared_spins_per_delay) /* * We use an exponential moving average with a relatively slow adaption * rate, so that noise in any one backend's result won't affect the shared - * value too much. As long as both inputs are within the allowed range, + * value too much. As long as both inputs are within the allowed range, * the result must be too, so we need not worry about clamping the result. * * We deliberately truncate rather than rounding; this is so that single http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/a1603088/src/include/storage/s_lock.h ---------------------------------------------------------------------- diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h index dfdacb7..45981e0 100644 --- a/src/include/storage/s_lock.h +++ b/src/include/storage/s_lock.h @@ -12,10 +12,11 @@ * void S_INIT_LOCK(slock_t *lock) * Initialize a spinlock (to the unlocked state). * - * void S_LOCK(slock_t *lock) + * int S_LOCK(slock_t *lock) * Acquire a spinlock, waiting if necessary. * Time out and abort() if unable to acquire the lock in a * "reasonable" amount of time --- typically ~ 1 minute. + * Should return number of "delays"; see s_lock.c * * void S_UNLOCK(slock_t *lock) * Unlock a previously acquired lock. @@ -31,25 +32,47 @@ * macros at the bottom of the file. Check if your platform can use * these or needs to override them. * - * Usually, S_LOCK() is implemented in terms of an even lower-level macro - * TAS(): + * Usually, S_LOCK() is implemented in terms of even lower-level macros + * TAS() and TAS_SPIN(): * * int TAS(slock_t *lock) * Atomic test-and-set instruction. Attempt to acquire the lock, * but do *not* wait. Returns 0 if successful, nonzero if unable * to acquire the lock. * - * TAS() is NOT part of the API, and should never be called directly. + * int TAS_SPIN(slock_t *lock) + * Like TAS(), but this version is used when waiting for a lock + * previously found to be contended. By default, this is the + * same as TAS(), but on some architectures it's better to poll a + * contended lock using an unlocked instruction and retry the + * atomic test-and-set only when it appears free. * - * CAUTION: on some platforms TAS() may sometimes report failure to acquire - * a lock even when the lock is not locked. For example, on Alpha TAS() - * will "fail" if interrupted. Therefore TAS() should always be invoked - * in a retry loop, even if you are certain the lock is free. 
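/*
 * [Editorial sketch.]  The "exponential moving average" mentioned above is
 * not visible in the hunk (only its comment is), so here is the shape of the
 * calculation for illustration.  The 15/16 weighting is an assumption, not
 * something this patch changes; the actual adaption rate lives in the
 * unchanged body of recompute_spins_per_delay().
 */
static int
recompute_spins_per_delay_sketch(int shared_spins_per_delay,
                                 int local_spins_per_delay)
{
    /*
     * Truncate rather than round, so that repeated minimum-valued local
     * results can eventually pull the shared estimate down to the minimum.
     */
    return (shared_spins_per_delay * 15 + local_spins_per_delay) / 16;
}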
+ * TAS() and TAS_SPIN() are NOT part of the API, and should never be called + * directly. * - * ANOTHER CAUTION: be sure that TAS() and S_UNLOCK() represent sequence - * points, ie, loads and stores of other values must not be moved across - * a lock or unlock. In most cases it suffices to make the operation be - * done through a "volatile" pointer. + * CAUTION: on some platforms TAS() and/or TAS_SPIN() may sometimes report + * failure to acquire a lock even when the lock is not locked. For example, + * on Alpha TAS() will "fail" if interrupted. Therefore a retry loop must + * always be used, even if you are certain the lock is free. + * + * It is the responsibility of these macros to make sure that the compiler + * does not re-order accesses to shared memory to precede the actual lock + * acquisition, or follow the lock release. Prior to PostgreSQL 9.5, this + * was the caller's responsibility, which meant that callers had to use + * volatile-qualified pointers to refer to both the spinlock itself and the + * shared data being accessed within the spinlocked critical section. This + * was notationally awkward, easy to forget (and thus error-prone), and + * prevented some useful compiler optimizations. For these reasons, we + * now require that the macros themselves prevent compiler re-ordering, + * so that the caller doesn't need to take special precautions. + * + * On platforms with weak memory ordering, the TAS(), TAS_SPIN(), and + * S_UNLOCK() macros must further include hardware-level memory fence + * instructions to prevent similar re-ordering at the hardware level. + * TAS() and TAS_SPIN() must guarantee that loads and stores issued after + * the macro are not executed until the lock has been obtained. Conversely, + * S_UNLOCK() must guarantee that loads and stores issued before the macro + * have been executed before the lock is released. * * On most supported platforms, TAS() uses a tas() function written * in assembly language to execute a hardware atomic-test-and-set @@ -63,21 +86,19 @@ * when using the SysV semaphore code. * * - * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/s_lock.h,v 1.171 2010/01/05 11:06:28 mha Exp $ + * src/include/storage/s_lock.h * *------------------------------------------------------------------------- */ #ifndef S_LOCK_H #define S_LOCK_H -#include "storage/pg_sema.h" #ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */ - #if defined(__GNUC__) || defined(__INTEL_COMPILER) /************************************************************************* * All the gcc inlines @@ -124,6 +145,12 @@ tas(volatile slock_t *lock) * Use a non-locking test before asserting the bus lock. Note that the * extra test appears to be a small loss on some x86 platforms and a small * win on others; it's by no means clear that we should keep it. + * + * When this was last tested, we didn't have separate TAS() and TAS_SPIN() + * macros. Nowadays it probably would be better to do a non-locking test + * in TAS_SPIN() but not in TAS(), like on x86_64, but no-one's done the + * testing to verify that. Without some empirical evidence, better to + * leave it alone. 
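/*
 * [Editorial sketch.]  What the relaxed rules above mean for callers: with
 * the compiler/memory barriers folded into S_LOCK()/S_UNLOCK() themselves,
 * data protected by a spinlock no longer has to be reached through
 * volatile-qualified pointers.  The struct and function names here are
 * hypothetical.
 */
#include "postgres.h"
#include "storage/s_lock.h"

typedef struct
{
    slock_t     mutex;
    int         counter;
} SharedCounter;

void
bump(SharedCounter *sc)
{
    S_LOCK(&sc->mutex);     /* acquire: later accesses cannot move above this */
    sc->counter++;          /* plain, non-volatile access is fine in here */
    S_UNLOCK(&sc->mutex);   /* release: earlier accesses cannot move below this */
}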
*/ __asm__ __volatile__( " cmpb $0,%1 \n" @@ -132,7 +159,7 @@ tas(volatile slock_t *lock) " xchgb %0,%1 \n" "1: \n" : "+q"(_res), "+m"(*lock) -: +: /* no inputs */ : "memory", "cc"); return (int) _res; } @@ -179,21 +206,27 @@ typedef unsigned char slock_t; #define TAS(lock) tas(lock) +/* + * On Intel EM64T, it's a win to use a non-locking test before the xchg proper, + * but only when spinning. + * + * See also Implementing Scalable Atomic Locks for Multi-Core Intel(tm) EM64T + * and IA32, by Michael Chynoweth and Mary R. Lee. As of this writing, it is + * available at: + * http://software.intel.com/en-us/articles/implementing-scalable-atomic-locks-for-multi-core-intel-em64t-and-ia32-architectures + */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) + static __inline__ int tas(volatile slock_t *lock) { register slock_t _res = 1; - /* - * On Opteron, using a non-locking test before the locking instruction - * is a huge loss. On EM64T, it appears to be a wash or small loss, - * so we needn't bother to try to distinguish the sub-architectures. - */ __asm__ __volatile__( " lock \n" " xchgb %0,%1 \n" : "+q"(_res), "+m"(*lock) -: +: /* no inputs */ : "memory", "cc"); return (int) _res; } @@ -214,13 +247,36 @@ spin_delay(void) #endif /* __x86_64__ */ -#if defined(__ia64__) || defined(__ia64) /* Intel Itanium */ +#if defined(__ia64__) || defined(__ia64) +/* + * Intel Itanium, gcc or Intel's compiler. + * + * Itanium has weak memory ordering, but we rely on the compiler to enforce + * strict ordering of accesses to volatile data. In particular, while the + * xchg instruction implicitly acts as a memory barrier with 'acquire' + * semantics, we do not have an explicit memory fence instruction in the + * S_UNLOCK macro. We use a regular assignment to clear the spinlock, and + * trust that the compiler marks the generated store instruction with the + * ".rel" opcode. + * + * Testing shows that assumption to hold on gcc, although I could not find + * any explicit statement on that in the gcc manual. In Intel's compiler, + * the -m[no-]serialize-volatile option controls that, and testing shows that + * it is enabled by default. + * + * While icc accepts gcc asm blocks on x86[_64], this is not true on ia64 + * (at least not in icc versions before 12.x). So we have to carry a separate + * compiler-intrinsic-based implementation for it. + */ #define HAS_TEST_AND_SET typedef unsigned int slock_t; #define TAS(lock) tas(lock) +/* On IA64, it's a win to use a non-locking test before the xchg proper */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) + #ifndef __INTEL_COMPILER static __inline__ int @@ -248,31 +304,37 @@ tas(volatile slock_t *lock) return ret; } +/* icc can't use the regular gcc S_UNLOCK() macro either in this case */ +#define S_UNLOCK(lock) \ + do { __memory_barrier(); *(lock) = 0; } while (0) + #endif /* __INTEL_COMPILER */ #endif /* __ia64__ || __ia64 */ - -#if defined(__arm__) || defined(__arm) +/* + * On ARM and ARM64, we use __sync_lock_test_and_set(int *, int) if available. + * + * We use the int-width variant of the builtin because it works on more chips + * than other widths. 
+ */ +#if defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(__aarch64) +#ifdef HAVE_GCC__SYNC_INT32_TAS #define HAS_TEST_AND_SET -typedef unsigned char slock_t; - #define TAS(lock) tas(lock) +typedef int slock_t; + static __inline__ int tas(volatile slock_t *lock) { - register slock_t _res = 1; - - __asm__ __volatile__( - " swpb %0, %0, [%2] \n" -: "+r"(_res), "+m"(*lock) -: "r"(lock) -: "memory"); - return (int) _res; + return __sync_lock_test_and_set(lock, 1); } -#endif /* __arm__ */ +#define S_UNLOCK(lock) __sync_lock_release(lock) + +#endif /* HAVE_GCC__SYNC_INT32_TAS */ +#endif /* __arm__ || __arm || __aarch64__ || __aarch64 */ /* S/390 and S/390x Linux (32- and 64-bit zSeries) */ @@ -300,6 +362,12 @@ tas(volatile slock_t *lock) #if defined(__sparc__) /* Sparc */ +/* + * Solaris has always run sparc processors in TSO (total store) mode, but + * linux didn't use to and the *BSDs still don't. So, be careful about + * acquire/release semantics. The CPU will treat superfluous membars as + * NOPs, so it's just code space. + */ #define HAS_TEST_AND_SET typedef unsigned char slock_t; @@ -321,9 +389,51 @@ tas(volatile slock_t *lock) : "=r"(_res), "+m"(*lock) : "r"(lock) : "memory"); +#if defined(__sparcv7) || defined(__sparc_v7__) + /* + * No stbar or membar available, luckily no actually produced hardware + * requires a barrier. + */ +#elif defined(__sparcv8) || defined(__sparc_v8__) + /* stbar is available (and required for both PSO, RMO), membar isn't */ + __asm__ __volatile__ ("stbar \n":::"memory"); +#else + /* + * #LoadStore (RMO) | #LoadLoad (RMO) together are the appropriate acquire + * barrier for sparcv8+ upwards. + */ + __asm__ __volatile__ ("membar #LoadStore | #LoadLoad \n":::"memory"); +#endif return (int) _res; } +#if defined(__sparcv7) || defined(__sparc_v7__) +/* + * No stbar or membar available, luckily no actually produced hardware + * requires a barrier. We fall through to the default gcc definition of + * S_UNLOCK in this case. + */ +#elif defined(__sparcv8) || defined(__sparc_v8__) +/* stbar is available (and required for both PSO, RMO), membar isn't */ +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__ ("stbar \n":::"memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) +#else +/* + * #LoadStore (RMO) | #StoreStore (RMO, PSO) together are the appropriate + * release barrier for sparcv8+ upwards. + */ +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__ ("membar #LoadStore | #StoreStore \n":::"memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) +#endif + #endif /* __sparc__ */ @@ -331,16 +441,23 @@ tas(volatile slock_t *lock) #if defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__) #define HAS_TEST_AND_SET -#if defined(__ppc64__) || defined(__powerpc64__) -typedef unsigned long slock_t; -#else typedef unsigned int slock_t; -#endif #define TAS(lock) tas(lock) + +/* On PPC, it's a win to use a non-locking test before the lwarx */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) + /* * NOTE: per the Enhanced PowerPC Architecture manual, v1.0 dated 7-May-2002, * an isync is a sufficient synchronization barrier after a lwarx/stwcx loop. + * On newer machines, we can use lwsync instead for better performance. + * + * Ordinarily, we'd code the branches here using GNU-style local symbols, that + * is "1f" referencing "1:" and so on. But some people run gcc on AIX with + * IBM's assembler as backend, and IBM's assembler doesn't do local symbols. 
+ * So hand-code the branch offsets; fortunately, all PPC instructions are + * exactly 4 bytes each, so it's not too hard to count. */ static __inline__ int tas(volatile slock_t *lock) @@ -349,18 +466,24 @@ tas(volatile slock_t *lock) int _res; __asm__ __volatile__( +#ifdef USE_PPC_LWARX_MUTEX_HINT +" lwarx %0,0,%3,1 \n" +#else " lwarx %0,0,%3 \n" +#endif " cmpwi %0,0 \n" -" bne 1f \n" +" bne $+16 \n" /* branch to li %1,1 */ " addi %0,%0,1 \n" " stwcx. %0,0,%3 \n" -" beq 2f \n" -"1: li %1,1 \n" -" b 3f \n" -"2: \n" +" beq $+12 \n" /* branch to lwsync/isync */ +" li %1,1 \n" +" b $+12 \n" /* branch to end of asm sequence */ +#ifdef USE_PPC_LWSYNC +" lwsync \n" +#else " isync \n" +#endif " li %1,0 \n" -"3: \n" : "=&r"(_t), "=r"(_res), "+m"(*lock) : "r"(lock) @@ -368,13 +491,25 @@ tas(volatile slock_t *lock) return _res; } -/* PowerPC S_UNLOCK is almost standard but requires a "sync" instruction */ +/* + * PowerPC S_UNLOCK is almost standard but requires a "sync" instruction. + * On newer machines, we can use lwsync instead for better performance. + */ +#ifdef USE_PPC_LWSYNC #define S_UNLOCK(lock) \ do \ { \ - __asm__ __volatile__ (" sync \n"); \ + __asm__ __volatile__ (" lwsync \n" ::: "memory"); \ *((volatile slock_t *) (lock)) = 0; \ } while (0) +#else +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__ (" sync \n" ::: "memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) +#endif /* USE_PPC_LWSYNC */ #endif /* powerpc */ @@ -397,7 +532,7 @@ tas(volatile slock_t *lock) " tas %1 \n" " sne %0 \n" : "=d"(rv), "+m"(*lock) -: +: /* no inputs */ : "memory", "cc"); return rv; } @@ -435,76 +570,6 @@ tas(volatile slock_t *lock) #endif /* __vax__ */ -#if defined(__ns32k__) /* National Semiconductor 32K */ -#define HAS_TEST_AND_SET - -typedef unsigned char slock_t; - -#define TAS(lock) tas(lock) - -static __inline__ int -tas(volatile slock_t *lock) -{ - register int _res; - - __asm__ __volatile__( - " sbitb 0, %1 \n" - " sfsd %0 \n" -: "=r"(_res), "+m"(*lock) -: -: "memory"); - return _res; -} - -#endif /* __ns32k__ */ - - -#if defined(__alpha) || defined(__alpha__) /* Alpha */ -/* - * Correct multi-processor locking methods are explained in section 5.5.3 - * of the Alpha AXP Architecture Handbook, which at this writing can be - * found at ftp://ftp.netbsd.org/pub/NetBSD/misc/dec-docs/index.html. - * For gcc we implement the handbook's code directly with inline assembler. 
- */ -#define HAS_TEST_AND_SET - -typedef unsigned long slock_t; - -#define TAS(lock) tas(lock) - -static __inline__ int -tas(volatile slock_t *lock) -{ - register slock_t _res; - - __asm__ __volatile__( - " ldq $0, %1 \n" - " bne $0, 2f \n" - " ldq_l %0, %1 \n" - " bne %0, 2f \n" - " mov 1, $0 \n" - " stq_c $0, %1 \n" - " beq $0, 2f \n" - " mb \n" - " br 3f \n" - "2: mov 1, %0 \n" - "3: \n" -: "=&r"(_res), "+m"(*lock) -: -: "memory", "0"); - return (int) _res; -} - -#define S_UNLOCK(lock) \ -do \ -{\ - __asm__ __volatile__ (" mb \n"); \ - *((volatile slock_t *) (lock)) = 0; \ -} while (0) - -#endif /* __alpha || __alpha__ */ - - #if defined(__mips__) && !defined(__sgi) /* non-SGI MIPS */ /* Note: on SGI we use the OS' mutex ABI, see below */ /* Note: R10000 processors require a separate SYNC */ @@ -534,7 +599,7 @@ tas(volatile slock_t *lock) " sync \n" " .set pop " : "=&r" (_res), "=&r" (_tmp), "+R" (*_l) -: +: /* no inputs */ : "memory"); return _res; } @@ -549,7 +614,10 @@ do \ " .set noreorder \n" \ " .set nomacro \n" \ " sync \n" \ - " .set pop "); \ + " .set pop " \ +: /* no outputs */ \ +: /* no inputs */ \ +: "memory"); \ *((volatile slock_t *) (lock)) = 0; \ } while (0) @@ -607,6 +675,20 @@ tas(volatile slock_t *lock) typedef unsigned char slock_t; #endif +/* + * Default implementation of S_UNLOCK() for gcc/icc. + * + * Note that this implementation is unsafe for any platform that can reorder + * a memory access (either load or store) after a following store. That + * happens not to be possible on x86 and most legacy architectures (some are + * single-processor!), but many modern systems have weaker memory ordering. + * Those that do must define their own version of S_UNLOCK() rather than + * relying on this one. + */ +#if !defined(S_UNLOCK) +#define S_UNLOCK(lock) \ + do { __asm__ __volatile__("" : : : "memory"); *(lock) = 0; } while (0) +#endif #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */ @@ -631,7 +713,7 @@ typedef unsigned char slock_t; asm int tas(volatile slock_t *s_lock) { -/* UNIVEL wants %mem in column 1, so we don't pg_indent this file */ +/* UNIVEL wants %mem in column 1, so we don't pgindent this file */ %mem s_lock pushl %ebx movl s_lock, %ebx @@ -644,27 +726,6 @@ tas(volatile slock_t *s_lock) #endif /* defined(USE_UNIVEL_CC) */ -#if defined(__alpha) || defined(__alpha__) /* Tru64 Unix Alpha compiler */ -/* - * The Tru64 compiler doesn't support gcc-style inline asm, but it does - * have some builtin functions that accomplish much the same results. - * For simplicity, slock_t is defined as long (ie, quadword) on Alpha - * regardless of the compiler in use. LOCK_LONG and UNLOCK_LONG only - * operate on an int (ie, longword), but that's OK as long as we define - * S_INIT_LOCK to zero out the whole quadword. - */ -#define HAS_TEST_AND_SET - -typedef unsigned long slock_t; - -#include <alpha/builtins.h> -#define S_INIT_LOCK(lock) (*(lock) = 0) -#define TAS(lock) (__LOCK_LONG_RETRY((lock), 1) == 0) -#define S_UNLOCK(lock) __UNLOCK_LONG(lock) - -#endif /* __alpha || __alpha__ */ - - #if defined(__hppa) || defined(__hppa__) /* HP PA-RISC, GCC and HP compilers */ /* * HP's PA-RISC @@ -701,9 +762,22 @@ tas(volatile slock_t *lock) return (lockval == 0); } -#endif /* __GNUC__ */ +/* + * The hppa implementation doesn't follow the rules of this files and provides + * a gcc specific implementation outside of the above defined(__GNUC__). It + * does so to avoid duplication between the HP compiler and gcc. So undefine + * the generic fallback S_UNLOCK from above. 
+ */ +#ifdef S_UNLOCK +#undef S_UNLOCK +#endif +#define S_UNLOCK(lock) \ + do { \ + __asm__ __volatile__("" : : : "memory"); \ + *TAS_ACTIVE_WORD(lock) = -1; \ + } while (0) -#define S_UNLOCK(lock) (*TAS_ACTIVE_WORD(lock) = -1) +#endif /* __GNUC__ */ #define S_INIT_LOCK(lock) \ do { \ @@ -720,59 +794,31 @@ tas(volatile slock_t *lock) #if defined(__hpux) && defined(__ia64) && !defined(__GNUC__) - -#define HAS_TEST_AND_SET - -typedef unsigned int slock_t; - -#include <ia64/sys/inline.h> -#define TAS(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE) - -#endif /* HPUX on IA64, non gcc */ - - -#if defined(__sgi) /* SGI compiler */ /* - * SGI IRIX 5 - * slock_t is defined as a unsigned long. We use the standard SGI - * mutex API. + * HP-UX on Itanium, non-gcc/icc compiler * - * The following comment is left for historical reasons, but is probably - * not a good idea since the mutex ABI is supported. + * We assume that the compiler enforces strict ordering of loads/stores on + * volatile data (see comments on the gcc-version earlier in this file). + * Note that this assumption does *not* hold if you use the + * +Ovolatile=__unordered option on the HP-UX compiler, so don't do that. * - * This stuff may be supplemented in the future with Masato Kataoka's MIPS-II - * assembly from his NECEWS SVR4 port, but we probably ought to retain this - * for the R3000 chips out there. - */ -#define HAS_TEST_AND_SET - -typedef unsigned long slock_t; - -#include "mutex.h" -#define TAS(lock) (test_and_set(lock,1)) -#define S_UNLOCK(lock) (test_then_and(lock,0)) -#define S_INIT_LOCK(lock) (test_then_and(lock,0)) -#define S_LOCK_FREE(lock) (test_then_add(lock,0) == 0) -#endif /* __sgi */ - - -#if defined(sinix) /* Sinix */ -/* - * SINIX / Reliant UNIX - * slock_t is defined as a struct abilock_t, which has a single unsigned long - * member. (Basically same as SGI) + * See also Implementing Spinlocks on the Intel Itanium Architecture and + * PA-RISC, by Tor Ekqvist and David Graves, for more information. As of + * this writing, version 1.0 of the manual is available at: + * http://h21007.www2.hp.com/portal/download/files/unprot/itanium/spinlocks.pdf */ #define HAS_TEST_AND_SET -#include "abi_mutex.h" -typedef abilock_t slock_t; +typedef unsigned int slock_t; -#define TAS(lock) (!acquire_lock(lock)) -#define S_UNLOCK(lock) release_lock(lock) -#define S_INIT_LOCK(lock) init_lock(lock) -#define S_LOCK_FREE(lock) (stat_lock(lock) == UNLOCKED) -#endif /* sinix */ +#include <ia64/sys/inline.h> +#define TAS(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE) +/* On IA64, it's a win to use a non-locking test before the xchg proper */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) +#define S_UNLOCK(lock) \ + do { _Asm_mf(); (*(lock)) = 0; } while (0) +#endif /* HPUX on IA64, non gcc/icc */ #if defined(_AIX) /* AIX */ /* @@ -789,28 +835,7 @@ typedef int slock_t; #endif /* _AIX */ -#if defined (nextstep) /* Nextstep */ -#define HAS_TEST_AND_SET - -typedef struct mutex slock_t; - -#define S_LOCK(lock) mutex_lock(lock) -#define S_UNLOCK(lock) mutex_unlock(lock) -#define S_INIT_LOCK(lock) mutex_init(lock) -/* For Mach, we have to delve inside the entrails of `struct mutex'. Ick! 
*/ -#define S_LOCK_FREE(alock) ((alock)->lock == 0) -#endif /* nextstep */ - - -/* These are in s_lock.c */ - - -#if defined(sun3) /* Sun3 */ -#define HAS_TEST_AND_SET - -typedef unsigned char slock_t; -#endif - +/* These are in sunstudio_(sparc|x86).s */ #if defined(__SUNPRO_C) && (defined(__i386) || defined(__x86_64__) || defined(__sparc__) || defined(__sparc)) #define HAS_TEST_AND_SET @@ -854,9 +879,15 @@ spin_delay(void) } #endif +#include <intrin.h> +#pragma intrinsic(_ReadWriteBarrier) + +#define S_UNLOCK(lock) \ + do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0) + #endif - + #endif /* !defined(HAS_TEST_AND_SET) */ @@ -874,16 +905,16 @@ spin_delay(void) * to fall foul of kernel limits on number of semaphores, so don't use this * unless you must! The subroutines appear in spin.c. */ -typedef PGSemaphoreData slock_t; +typedef int slock_t; extern bool s_lock_free_sema(volatile slock_t *lock); extern void s_unlock_sema(volatile slock_t *lock); -extern void s_init_lock_sema(volatile slock_t *lock); +extern void s_init_lock_sema(volatile slock_t *lock, bool nested); extern int tas_sema(volatile slock_t *lock); #define S_LOCK_FREE(lock) s_lock_free_sema(lock) #define S_UNLOCK(lock) s_unlock_sema(lock) -#define S_INIT_LOCK(lock) s_init_lock_sema(lock) +#define S_INIT_LOCK(lock) s_init_lock_sema(lock, false) #define TAS(lock) tas_sema(lock) @@ -896,10 +927,7 @@ extern int tas_sema(volatile slock_t *lock); #if !defined(S_LOCK) #define S_LOCK(lock) \ - do { \ - if (TAS(lock)) \ - s_lock((lock), __FILE__, __LINE__); \ - } while (0) + (TAS(lock) ? s_lock((lock), __FILE__, __LINE__, PG_FUNCNAME_MACRO) : 0) #endif /* S_LOCK */ #if !defined(S_LOCK_FREE) @@ -907,7 +935,25 @@ extern int tas_sema(volatile slock_t *lock); #endif /* S_LOCK_FREE */ #if !defined(S_UNLOCK) -#define S_UNLOCK(lock) (*((volatile slock_t *) (lock)) = 0) +/* + * Our default implementation of S_UNLOCK is essentially *(lock) = 0. This + * is unsafe if the platform can reorder a memory access (either load or + * store) after a following store; platforms where this is possible must + * define their own S_UNLOCK. But CPU reordering is not the only concern: + * if we simply defined S_UNLOCK() as an inline macro, the compiler might + * reorder instructions from inside the critical section to occur after the + * lock release. Since the compiler probably can't know what the external + * function s_unlock is doing, putting the same logic there should be adequate. + * A sufficiently-smart globally optimizing compiler could break that + * assumption, though, and the cost of a function call for every spinlock + * release may hurt performance significantly, so we use this implementation + * only for platforms where we don't know of a suitable intrinsic. For the + * most part, those are relatively obscure platform/compiler combinations to + * which the PostgreSQL project does not have access. 
+ */ +#define USE_DEFAULT_S_UNLOCK +extern void s_unlock(volatile slock_t *lock); +#define S_UNLOCK(lock) s_unlock(lock) #endif /* S_UNLOCK */ #if !defined(S_INIT_LOCK) @@ -925,11 +971,16 @@ extern int tas(volatile slock_t *lock); /* in port/.../tas.s, or #define TAS(lock) tas(lock) #endif /* TAS */ +#if !defined(TAS_SPIN) +#define TAS_SPIN(lock) TAS(lock) +#endif /* TAS_SPIN */ + +extern slock_t dummy_spinlock; /* * Platform-independent out-of-line support routines */ -extern void s_lock(volatile slock_t *lock, const char *file, int line); +extern int s_lock(volatile slock_t *lock, const char *file, int line, const char *func); /* Support for dynamic adjustment of spins_per_delay */ #define DEFAULT_SPINS_PER_DELAY 100 @@ -937,4 +988,34 @@ extern void s_lock(volatile slock_t *lock, const char *file, int line); extern void set_spins_per_delay(int shared_spins_per_delay); extern int recompute_spins_per_delay(int shared_spins_per_delay); +/* + * Support for spin delay which is useful in various places where + * spinlock-like procedures take place. + */ +typedef struct +{ + int spins; + int delays; + int cur_delay; + const char *file; + int line; + const char *func; +} SpinDelayStatus; + +static inline void +init_spin_delay(SpinDelayStatus *status, + const char *file, int line, const char *func) +{ + status->spins = 0; + status->delays = 0; + status->cur_delay = 0; + status->file = file; + status->line = line; + status->func = func; +} + +#define init_local_spin_delay(status) init_spin_delay(status, __FILE__, __LINE__, PG_FUNCNAME_MACRO) +void perform_spin_delay(SpinDelayStatus *status); +void finish_spin_delay(SpinDelayStatus *status); + #endif /* S_LOCK_H */ http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/a1603088/src/test/regress/checkinc.py ---------------------------------------------------------------------- diff --git a/src/test/regress/checkinc.py b/src/test/regress/checkinc.py index 59addf7..6f4e006 100755 --- a/src/test/regress/checkinc.py +++ b/src/test/regress/checkinc.py @@ -48,6 +48,7 @@ fileset = { 'ia64/sys/inline.h': [], 'io.h': [], 'ioctl.h': [], + 'intrin.h': [], 'kernel/OS.h': [], 'kernel/image.h': [], 'libintl.h': [],
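A closing editorial sketch tied to the S_UNLOCK() rework above (hypothetical names, not code from the patch): it illustrates why every S_UNLOCK() variant now carries at least a compiler barrier. A plain unlocking store gives the compiler licence to sink the protected store past the release, because only volatile accesses are ordered against each other; the "memory" clobber used by the generic gcc fallback forbids that.

typedef unsigned char demo_slock_t;

static volatile demo_slock_t demo_lock;
static int      demo_shared;

void
demo_release(void)
{
    demo_shared = 42;           /* must become visible before the release */

    /*
     * Unsafe alternative: "demo_lock = 0;" on its own would let the compiler
     * reorder the demo_shared store after the unlocking store.
     */

    /* Safe: the empty asm with a "memory" clobber pins prior accesses. */
    __asm__ __volatile__("" : : : "memory");
    demo_lock = 0;
}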
