On Mon, Oct 07, 2019 at 03:06:35PM -0400, Tom Lane wrote:
> Noah Misch <n...@leadboat.com> writes:
> > [ fetch-add-gcc-xlc-unify-v2.patch ]
> 
> This still fails on Apple's compilers.  The first failure I get is
> 
> ccache gcc -std=gnu99 -Wall -Wmissing-prototypes -Wpointer-arith 
> -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute 
> -Wformat-security -fno-strict-aliasing -fwrapv -g -O2 -I../../../src/include 
> -I/usr/local/include -isysroot /Developer/SDKs/MacOSX10.5.sdk    -c -o 
> nodeHashjoin.o nodeHashjoin.c
> /var/tmp//ccXUM8ep.s:449:Parameter error: r0 not allowed for parameter 2 
> (code as 0 not r0)
> 
> Line 449 of the assembly file is the addi in
> 
> LM87:
>                 sync
>         lwarx   r0,0,r2
>         addi    r11,r0,1
>         stwcx.  r11,0,r2
>         bne             $-12
>         isync
> 
> which I suppose comes out of PG_PPC_FETCH_ADD.  I find this idea of
> constructing assembly code by string-pasting pretty unreadable and am not
> tempted to try to debug it, but I don't immediately see why this doesn't
> work when the existing s_lock.h code does.  I think that the assembler
> error message is probably misleading: while it seems to be saying to
> s/r0/0/ in the addi, gcc consistently uses "rN" syntax for the second
> parameter elsewhere.  I do note that gcc never generates r0 as addi's
> second parameter in several files I checked through, so maybe what it
> means is "you need to use some other register"?  (Which would imply that
> the constraint for this asm argument is too loose.)

Thanks for testing.  That error boils down to "need to use some other
register".  The second operand of addi is one of the ppc instruction operands
that can hold a constant zero or a register number[1], so the proper
constraint is "b".  I've made it so and added a comment.  I should probably
update s_lock.h, too, in a later patch.  I don't know how it has
mostly-avoided this failure mode, but its choice of constraint could explain
https://postgr.es/m/flat/36E70B06-2C5C-11D8-A096-0005024EF27F%40ifrance.com
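
To illustrate the hazard (a minimal sketch, not part of the patch; the
helper names are hypothetical, and it assumes a ppc target with gcc-style
inline asm):

    /*
     * With "r", the register allocator may pick r0.  The ISA reads RA=0 in
     * addi as the constant zero rather than r0, so "addi %0,%1,1" would
     * compute 0+1; assemblers like Apple's reject the "r0" spelling outright.
     */
    static inline int
    inc_fragile(int x)
    {
        int res;
        __asm__("addi %0,%1,1" : "=r"(res) : "r"(x));
        return res;
    }

    /* With "b", the operand is restricted to r1..r31, so RA=0 cannot arise. */
    static inline int
    inc_safe(int x)
    {
        int res;
        __asm__("addi %0,%1,1" : "=r"(res) : "b"(x));
        return res;
    }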

> I'm also wondering why this isn't following s_lock.h's lead as to
> USE_PPC_LWARX_MUTEX_HINT and USE_PPC_LWSYNC.

Most, if not all, of today's pg_atomic_compare_exchange_u32() call sites
lack the property that the mutex hint would signal.
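
(For reference, the hint is lwarx's optional EH operand -- the trailing "1"
in s_lock.h's tas(), roughly as below, with operand numbering illustrative:

    "   lwarx   %0,0,%2,1       \n"     /* EH=1: reserving to acquire a lock */

It tells the processor that the reservation is part of acquiring a mutex,
which few compare-exchange call sites are doing.)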

pg_atomic_compare_exchange_u32() specifies "Full barrier semantics", which
lwsync does not provide.  We might want to relax the specification to make
lwsync acceptable, but that would be a separate, architecture-independent
project.  (generic-gcc.h:pg_atomic_compare_exchange_u32_impl() speculates
along those lines, writing "FIXME: we can probably use a lower consistency
model".)


[1] "Notice that addi and addis use the value 0, not the contents of GPR 0, if
RA=0." -- https://openpowerfoundation.org/?resource_lib=power-isa-version-3-0
diff --git a/configure b/configure
index 7a6bfc2..d6ecb33 100755
--- a/configure
+++ b/configure
@@ -14650,6 +14650,46 @@ $as_echo "$pgac_cv_have_ppc_mutex_hint" >&6; }
 $as_echo "#define HAVE_PPC_LWARX_MUTEX_HINT 1" >>confdefs.h
 
     fi
+    # Check if compiler accepts "i"(x) when __builtin_constant_p(x).
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether __builtin_constant_p(x) implies \"i\"(x) acceptance" >&5
+$as_echo_n "checking whether __builtin_constant_p(x) implies \"i\"(x) acceptance... " >&6; }
+if ${pgac_cv_have_i_constraint__builtin_constant_p+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+static inline int
+     addi(int ra, int si)
+     {
+         int res = 0;
+         if (__builtin_constant_p(si))
+             __asm__ __volatile__(
+                 " addi %0,%1,%2\n" : "=r"(res) : "r"(ra), "i"(si));
+         return res;
+     }
+     int test_adds(int x) { return addi(3, x) + addi(x, 5); }
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  pgac_cv_have_i_constraint__builtin_constant_p=yes
+else
+  pgac_cv_have_i_constraint__builtin_constant_p=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_have_i_constraint__builtin_constant_p" >&5
+$as_echo "$pgac_cv_have_i_constraint__builtin_constant_p" >&6; }
+    if test x"$pgac_cv_have_i_constraint__builtin_constant_p" = xyes ; then
+
+$as_echo "#define HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P 1" >>confdefs.h
+
+    fi
   ;;
 esac
 
diff --git a/configure.in b/configure.in
index dde3eec..510bd76 100644
--- a/configure.in
+++ b/configure.in
@@ -1541,6 +1541,26 @@ case $host_cpu in
     if test x"$pgac_cv_have_ppc_mutex_hint" = xyes ; then
        AC_DEFINE(HAVE_PPC_LWARX_MUTEX_HINT, 1, [Define to 1 if the assembler supports PPC's LWARX mutex hint bit.])
     fi
+    # Check if compiler accepts "i"(x) when __builtin_constant_p(x).
+    AC_CACHE_CHECK([whether __builtin_constant_p(x) implies "i"(x) acceptance],
+                   [pgac_cv_have_i_constraint__builtin_constant_p],
+    [AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+    [static inline int
+     addi(int ra, int si)
+     {
+         int res = 0;
+         if (__builtin_constant_p(si))
+             __asm__ __volatile__(
+                 " addi %0,%1,%2\n" : "=r"(res) : "r"(ra), "i"(si));
+         return res;
+     }
+     int test_adds(int x) { return addi(3, x) + addi(x, 5); }], [])],
+    [pgac_cv_have_i_constraint__builtin_constant_p=yes],
+    [pgac_cv_have_i_constraint__builtin_constant_p=no])])
+    if test x"$pgac_cv_have_i_constraint__builtin_constant_p" = xyes ; then
+      AC_DEFINE(HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P, 1,
+                [Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance.])
+    fi
   ;;
 esac
 
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 512213a..9be84fc 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -332,6 +332,9 @@
 /* Define to 1 if you have isinf(). */
 #undef HAVE_ISINF
 
+/* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */
+#undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+
 /* Define to 1 if you have the <langinfo.h> header file. */
 #undef HAVE_LANGINFO_H
 
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
index f93009e..e4292eb 100644
--- a/src/include/port/atomics.h
+++ b/src/include/port/atomics.h
@@ -87,14 +87,11 @@
  * using compiler intrinsics are a good idea.
  */
 /*
- * Given a gcc-compatible xlc compiler, prefer the xlc implementation.  The
- * ppc64le "IBM XL C/C++ for Linux, V13.1.2" implements both interfaces, but
- * __sync_lock_test_and_set() of one-byte types elicits SIGSEGV.
+ * gcc or compatible, including clang and icc.  Exclude xlc.  The ppc64le "IBM
+ * XL C/C++ for Linux, V13.1.2" emulates gcc, but __sync_lock_test_and_set()
+ * of one-byte types elicits SIGSEGV.
  */
-#if defined(__IBMC__) || defined(__IBMCPP__)
-#include "port/atomics/generic-xlc.h"
-/* gcc or compatible, including clang and icc */
-#elif defined(__GNUC__) || defined(__INTEL_COMPILER)
+#if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && !(defined(__IBMC__) || defined(__IBMCPP__))
 #include "port/atomics/generic-gcc.h"
 #elif defined(_MSC_VER)
 #include "port/atomics/generic-msvc.h"
diff --git a/src/include/port/atomics/arch-ppc.h b/src/include/port/atomics/arch-ppc.h
index 344b394..897f035 100644
--- a/src/include/port/atomics/arch-ppc.h
+++ b/src/include/port/atomics/arch-ppc.h
@@ -25,5 +25,187 @@
 #define pg_write_barrier_impl()               __asm__ __volatile__ ("lwsync" : : : "memory")
 #endif
 
+#define PG_HAVE_ATOMIC_U32_SUPPORT
+typedef struct pg_atomic_uint32
+{
+       volatile uint32 value;
+} pg_atomic_uint32;
+
+/* 64bit atomics are only supported in 64bit mode */
+#ifdef __64BIT__
+#define PG_HAVE_ATOMIC_U64_SUPPORT
+typedef struct pg_atomic_uint64
+{
+       volatile uint64 value pg_attribute_aligned(8);
+} pg_atomic_uint64;
+
+#endif /* __64BIT__ */
+
+/* Macros for building function bodies below. */
+#define PG_PPC_COMPARE_EXCHANGE(load_insn, cmp_insn, store_insn,       \
+                                                               expected_constraint)    \
+       __asm__ __volatile__( \
+               "       sync                                            \n" \
+               "       " load_insn "   %0,0,%5         \n" \
+               "       " cmp_insn "    %0,%3           \n" \
+               "       bne             $+12            \n"             /* branch to isync */ \
+               "       " store_insn "  %4,0,%5         \n" \
+               "       bne             $-16            \n"             /* branch to load_insn */ \
+               "       isync                                           \n" \
+               "       mfcr            %1          \n" \
+:              "=&r"(found), "=r"(condition_register), "+m"(ptr->value) \
+:              expected_constraint(*expected), "r"(newval), "r"(&ptr->value) \
+:              "memory", "cc")
+
+#define PG_PPC_FETCH_ADD(load_insn, add_insn, store_insn,      \
+                                                res_constraint, add_constraint)        \
+       __asm__ __volatile__( \
+               "       sync                                            \n" \
+               "       " load_insn "   %1,0,%4         \n" \
+               "       " add_insn "    %0,%1,%3        \n" \
+               "       " store_insn "  %0,0,%4         \n" \
+               "       bne             $-12            \n"             /* branch to load_insn */ \
+               "       isync                                           \n" \
+:              "=&r"(_t), res_constraint(res), "+m"(ptr->value) \
+:              add_constraint(add_), "r"(&ptr->value) \
+:              "memory", "cc")
+
+/*
+ * This mimics gcc __atomic_compare_exchange_n(..., __ATOMIC_SEQ_CST), but
+ * code generation differs at the end.  __atomic_compare_exchange_n():
+ *  100:       isync
+ *  104:       mfcr    r3
+ *  108:       rlwinm  r3,r3,3,31,31
+ *  10c:       bne     120 <.eb+0x10>
+ *  110:       clrldi  r3,r3,63
+ *  114:       addi    r1,r1,112
+ *  118:       blr
+ *  11c:       nop
+ *  120:       clrldi  r3,r3,63
+ *  124:       stw     r9,0(r4)
+ *  128:       addi    r1,r1,112
+ *  12c:       blr
+ *
+ * This:
+ *   f0:       isync
+ *   f4:       mfcr    r9
+ *   f8:       rldicl. r3,r9,35,63
+ *   fc:       bne     104 <.eb>
+ *  100:       stw     r10,0(r4)
+ *  104:       addi    r1,r1,112
+ *  108:       blr
+ *
+ * This implementation may or may not have materially different performance.
+ * It's not exploiting the fact that cr0 still holds the relevant comparison
+ * bits, set during the __asm__.  One could fix that by moving more code into
+ * the __asm__.  (That would remove the freedom to eliminate dead stores when
+ * the caller ignores "expected", but few callers do.)
+ *
+ * The cmpwi variant may be dead code.  In gcc 7.2.0,
+ * __builtin_constant_p(*expected) always reports false.
+ * __atomic_compare_exchange_n() does use cmpwi when its second argument
+ * points to a constant.  Hence, using this instead of
+ * __atomic_compare_exchange_n() nominally penalizes the generic.h
+ * pg_atomic_test_set_flag_impl().  Modern GCC will use the generic-gcc.h
+ * version, making the penalty theoretical only.
+ *
+ * Recognizing constant "newval" would be superfluous, because there's no
+ * immediate-operand version of stwcx.
+ */
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
+static inline bool
+pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
+                                                                       uint32 *expected, uint32 newval)
+{
+       uint32 found;
+       uint32 condition_register;
+       bool ret;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+       if (__builtin_constant_p(*expected) &&
+               *expected <= PG_INT16_MAX && *expected >= PG_INT16_MIN)
+               PG_PPC_COMPARE_EXCHANGE("lwarx", "cmpwi", "stwcx.", "i");
+       else
+#endif
+               PG_PPC_COMPARE_EXCHANGE("lwarx", "cmpw",  "stwcx.", "r");
+
+       ret = (condition_register >> 29) & 1;   /* test eq bit of cr0 */
+       if (!ret)
+               *expected = found;
+       return ret;
+}
+
+/*
+ * This mirrors gcc __sync_fetch_and_add().
+ *
+ * The second operand of addi can hold a constant zero or a register number,
+ * hence constraint "=&b" to avoid allocating r0.  "b" stands for "address
+ * base register"; most operands having this register-or-zero property are
+ * address bases, e.g. the second operand of lwax.  (s_lock.h:tas() has
+ * survived with "=&r", but that is fragile.)
+ */
+#define PG_HAVE_ATOMIC_FETCH_ADD_U32
+static inline uint32
+pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
+{
+       uint32 _t;
+       uint32 res;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+       if (__builtin_constant_p(add_) &&
+               add_ <= PG_INT16_MAX && add_ >= PG_INT16_MIN)
+               PG_PPC_FETCH_ADD("lwarx", "addi", "stwcx.", "=&b", "i");
+       else
+#endif
+               PG_PPC_FETCH_ADD("lwarx", "add",  "stwcx.", "=&r", "r");
+
+       return res;
+}
+
+#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
+
+#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
+static inline bool
+pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
+                                                                       uint64 *expected, uint64 newval)
+{
+       uint64 found;
+       uint32 condition_register;
+       bool ret;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+       if (__builtin_constant_p(*expected) &&
+               *expected <= PG_INT16_MAX && *expected >= PG_INT16_MIN)
+               PG_PPC_COMPARE_EXCHANGE("ldarx", "cmpdi", "stdcx.", "i");
+       else
+#endif
+               PG_PPC_COMPARE_EXCHANGE("ldarx", "cmpd",  "stdcx.", "r");
+
+       ret = (condition_register >> 29) & 1;   /* test eq bit of cr0 */
+       if (!ret)
+               *expected = found;
+       return ret;
+}
+
+#define PG_HAVE_ATOMIC_FETCH_ADD_U64
+static inline uint64
+pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
+{
+       uint64 _t;
+       uint64 res;
+
+#ifdef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P
+       if (__builtin_constant_p(add_) &&
+               add_ <= PG_INT16_MAX && add_ >= PG_INT16_MIN)
+               PG_PPC_FETCH_ADD("ldarx", "addi", "stdcx.", "=&b", "i");
+       else
+#endif
+               PG_PPC_FETCH_ADD("ldarx", "add",  "stdcx.", "=&r", "r");
+
+       return res;
+}
+
+#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
+
 /* per architecture manual doubleword accesses have single copy atomicity */
 #define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
diff --git a/src/include/port/atomics/generic-xlc.h b/src/include/port/atomics/generic-xlc.h
deleted file mode 100644
index 8b5c732..0000000
--- a/src/include/port/atomics/generic-xlc.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * generic-xlc.h
- *       Atomic operations for IBM's CC
- *
- * Portions Copyright (c) 2013-2019, PostgreSQL Global Development Group
- *
- * NOTES:
- *
- * Documentation:
- * * Synchronization and atomic built-in functions
- *   http://www-01.ibm.com/support/knowledgecenter/SSGH3R_13.1.2/com.ibm.xlcpp131.aix.doc/compiler_ref/bifs_sync_atomic.html
- *
- * src/include/port/atomics/generic-xlc.h
- *
- * -------------------------------------------------------------------------
- */
-
-#if defined(HAVE_ATOMICS)
-
-#define PG_HAVE_ATOMIC_U32_SUPPORT
-typedef struct pg_atomic_uint32
-{
-       volatile uint32 value;
-} pg_atomic_uint32;
-
-
-/* 64bit atomics are only supported in 64bit mode */
-#ifdef __64BIT__
-#define PG_HAVE_ATOMIC_U64_SUPPORT
-typedef struct pg_atomic_uint64
-{
-       volatile uint64 value pg_attribute_aligned(8);
-} pg_atomic_uint64;
-
-#endif /* __64BIT__ */
-
-#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
-static inline bool
-pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
-                                                                       uint32 *expected, uint32 newval)
-{
-       bool            ret;
-
-       /*
-        * atomics.h specifies sequential consistency ("full barrier semantics")
-        * for this interface.  Since "lwsync" provides acquire/release
-        * consistency only, do not use it here.  GCC atomics observe the same
-        * restriction; see its rs6000_pre_atomic_barrier().
-        */
-       __asm__ __volatile__ (" sync \n" ::: "memory");
-
-       /*
-        * XXX: __compare_and_swap is defined to take signed parameters, but that
-        * shouldn't matter since we don't perform any arithmetic operations.
-        */
-       ret = __compare_and_swap((volatile int*)&ptr->value,
-                                                        (int *)expected, (int)newval);
-
-       /*
-        * xlc's documentation tells us:
-        * "If __compare_and_swap is used as a locking primitive, insert a call to
-        * the __isync built-in function at the start of any critical sections."
-        *
-        * The critical section begins immediately after __compare_and_swap().
-        */
-       __isync();
-
-       return ret;
-}
-
-#define PG_HAVE_ATOMIC_FETCH_ADD_U32
-static inline uint32
-pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
-{
-       uint32 _t;
-       uint32 res;
-
-       /*
-        * xlc has a no-longer-documented __fetch_and_add() intrinsic.  In xlc
-        * 12.01.0000.0000, it emits a leading "sync" and trailing "isync".  In
-        * xlc 13.01.0003.0004, it emits neither.  Hence, using the intrinsic
-        * would add redundant syncs on xlc 12.
-        */
-       __asm__ __volatile__(
-               "       sync                            \n"
-               "       lwarx   %1,0,%4         \n"
-               "       add     %0,%1,%3        \n"
-               "       stwcx.  %0,0,%4         \n"
-               "       bne     $-12            \n"             /* branch to lwarx */
-               "       isync                           \n"
-:              "=&r"(_t), "=&r"(res), "+m"(ptr->value)
-:              "r"(add_), "r"(&ptr->value)
-:              "memory", "cc");
-
-       return res;
-}
-
-#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
-
-#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
-static inline bool
-pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
-                                                                       uint64 *expected, uint64 newval)
-{
-       bool            ret;
-
-       __asm__ __volatile__ (" sync \n" ::: "memory");
-
-       ret = __compare_and_swaplp((volatile long*)&ptr->value,
-                                                          (long *)expected, (long)newval);
-
-       __isync();
-
-       return ret;
-}
-
-#define PG_HAVE_ATOMIC_FETCH_ADD_U64
-static inline uint64
-pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
-{
-       uint64 _t;
-       uint64 res;
-
-       /* Like u32, but s/lwarx/ldarx/; s/stwcx/stdcx/ */
-       __asm__ __volatile__(
-               "       sync                            \n"
-               "       ldarx   %1,0,%4         \n"
-               "       add     %0,%1,%3        \n"
-               "       stdcx.  %0,0,%4         \n"
-               "       bne     $-12            \n"             /* branch to ldarx */
-               "       isync                           \n"
-:              "=&r"(_t), "=&r"(res), "+m"(ptr->value)
-:              "r"(add_), "r"(&ptr->value)
-:              "memory", "cc");
-
-       return res;
-}
-
-#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
-
-#endif /* defined(HAVE_ATOMICS) */
diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck
index b2060f3..1a35016 100755
--- a/src/tools/pginclude/cpluspluscheck
+++ b/src/tools/pginclude/cpluspluscheck
@@ -84,7 +84,6 @@ do
        test "$f" = src/include/port/atomics/generic-gcc.h && continue
        test "$f" = src/include/port/atomics/generic-msvc.h && continue
        test "$f" = src/include/port/atomics/generic-sunpro.h && continue
-       test "$f" = src/include/port/atomics/generic-xlc.h && continue
 
        # rusagestub.h is also platform-specific, and will be included
        # by utils/pg_rusage.h if necessary.
