RE: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]

2023-03-17 Thread Kyrylo Tkachov via Gcc-patches



> -----Original Message-----
> From: Wilco Dijkstra 
> Sent: Thursday, March 16, 2023 6:22 PM
> To: GCC Patches 
> Cc: Richard Sandiford ; Kyrylo Tkachov
> 
> Subject: Re: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]
> 
> ping
> 
> 
> From: Wilco Dijkstra
> Sent: 23 February 2023 15:11
> To: GCC Patches 
> Cc: Richard Sandiford ; Kyrylo Tkachov
> 
> Subject: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]
> 
> 
> The LSE2 ifunc for the 16-byte atomic load requires a barrier before the LDP -
> without it, it effectively has Load-AcquirePC semantics similar to LDAPR,
> which is less restrictive than what __ATOMIC_SEQ_CST requires.  This patch
> fixes this and adds comments to make it easier to see which sequence is
> used for each case.  Use a load/store-exclusive loop for the store to make
> it easier to test that the memory ordering is correct (it is slightly
> faster too).
> 
> Passes regress, OK for commit?
> 

Ok.
Thanks,
Kyrill

Re: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]

2023-03-16 Thread Wilco Dijkstra via Gcc-patches
ping


From: Wilco Dijkstra
Sent: 23 February 2023 15:11
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891] 
 


[PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]

2023-02-23 Thread Wilco Dijkstra via Gcc-patches

The LSE2 ifunc for the 16-byte atomic load requires a barrier before the LDP -
without it, it effectively has Load-AcquirePC semantics similar to LDAPR,
which is less restrictive than what __ATOMIC_SEQ_CST requires.  This patch
fixes this and adds comments to make it easier to see which sequence is
used for each case.  Use a load/store-exclusive loop for the store to make
it easier to test that the memory ordering is correct (it is slightly
faster too).
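
For reference (not part of the patch), here is a minimal sketch of the
classic store-buffering case that SEQ_CST must forbid.  On AArch64, GCC
expands 16-byte atomics into libatomic calls (__atomic_store_16 and
__atomic_load_16), which the ifunc resolves to libat_store_16_i1 and
libat_load_16_i1 on LSE2-capable cores; without the barrier before the
LDP, the load could be satisfied before the other thread's Store-Release
became visible, so both threads could in principle read 0.  The shared
variables and function names below are made up for illustration:

/* Hypothetical shared 128-bit variables.  */
static __int128 x, y;

__int128
thread0 (void)
{
  __int128 one = 1, r;
  /* Calls __atomic_store_16 -> libat_store_16_i1 (STLXP loop).  */
  __atomic_store (&x, &one, __ATOMIC_SEQ_CST);
  /* Calls __atomic_load_16 -> libat_load_16_i1 (LDAR; LDP; DMB ISHLD).  */
  __atomic_load (&y, &r, __ATOMIC_SEQ_CST);
  return r;
}

__int128
thread1 (void)
{
  __int128 one = 1, r;
  __atomic_store (&y, &one, __ATOMIC_SEQ_CST);
  __atomic_load (&x, &r, __ATOMIC_SEQ_CST);
  return r;  /* SEQ_CST forbids r == 0 in both threads.  */
}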

Passes regress, OK for commit?

libatomic/
	PR libgcc/108891
	* config/linux/aarch64/atomic_16.S: Fix libat_load_16_i1.
	Add comments describing the memory order.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 732c3534a06678664a252bdbc53652eeab0af506..05439ce394b9653c9bcb582761ff7aaa7c8f9643 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -72,33 +72,38 @@ name:	\
 
 ENTRY (libat_load_16_i1)
 	cbnz	w1, 1f
+
+	/* RELAXED.  */
 	ldp	res0, res1, [x0]
 	ret
 1:
-	cmp	w1, ACQUIRE
-	b.hi	2f
+	cmp	w1, SEQ_CST
+	b.eq	2f
+
+	/* ACQUIRE/CONSUME (Load-AcquirePC semantics).  */
 	ldp	res0, res1, [x0]
 	dmb	ishld
 	ret
-2:
+
+	/* SEQ_CST.  */
+2:	ldar	tmp0, [x0]	/* Block reordering with Store-Release instr.  */
 	ldp	res0, res1, [x0]
-	dmb	ish
+	dmb	ishld
 	ret
 END (libat_load_16_i1)
 
 
 ENTRY (libat_store_16_i1)
 	cbnz	w4, 1f
+
+	/* RELAXED.  */
 	stp	in0, in1, [x0]
 	ret
-1:
-	dmb	ish
-	stp	in0, in1, [x0]
-	cmp	w4, SEQ_CST
-	beq	2f
-	ret
-2:
-	dmb	ish
+
+	/* RELEASE/SEQ_CST.  */
+1:	ldaxp	xzr, tmp0, [x0]
+	stlxp	w4, in0, in1, [x0]
+	cbnz	w4, 1b
 	ret
 END (libat_store_16_i1)
 
@@ -106,29 +111,33 @@ END (libat_store_16_i1)
 ENTRY (libat_exchange_16_i1)
 	mov	x5, x0
 	cbnz	w4, 2f
-1:
-	ldxp	res0, res1, [x5]
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
 	stxp	w4, in0, in1, [x5]
 	cbnz	w4, 1b
 	ret
 2:
 	cmp	w4, ACQUIRE
 	b.hi	4f
-3:
-	ldaxp	res0, res1, [x5]
+
+	/* ACQUIRE/CONSUME.  */
+3:	ldaxp	res0, res1, [x5]
 	stxp	w4, in0, in1, [x5]
 	cbnz	w4, 3b
 	ret
 4:
 	cmp	w4, RELEASE
 	b.ne	6f
-5:
-	ldxp	res0, res1, [x5]
+
+	/* RELEASE.  */
+5:	ldxp	res0, res1, [x5]
 	stlxp	w4, in0, in1, [x5]
 	cbnz	w4, 5b
 	ret
-6:
-	ldaxp	res0, res1, [x5]
+
+	/* ACQ_REL/SEQ_CST.  */
+6:	ldaxp	res0, res1, [x5]
 	stlxp	w4, in0, in1, [x5]
 	cbnz	w4, 6b
 	ret
@@ -142,6 +151,8 @@ ENTRY (libat_compare_exchange_16_i1)
 	cbz	w4, 2f
 	cmp	w4, RELEASE
 	b.hs	3f
+
+	/* ACQUIRE/CONSUME.  */
 	caspa	exp0, exp1, in0, in1, [x0]
 0:
 	cmp	exp0, tmp0
@@ -153,15 +164,18 @@ ENTRY (libat_compare_exchange_16_i1)
 	stp	exp0, exp1, [x1]
 	mov	x0, 0
 	ret
-2:
-	casp	exp0, exp1, in0, in1, [x0]
+
+	/* RELAXED.  */
+2:	casp	exp0, exp1, in0, in1, [x0]
 	b	0b
-3:
-	b.hi	4f
+
+	/* RELEASE.  */
+3:	b.hi	4f
 	caspl	exp0, exp1, in0, in1, [x0]
 	b	0b
-4:
-	caspal	exp0, exp1, in0, in1, [x0]
+
+	/* ACQ_REL/SEQ_CST.  */
+4:	caspal	exp0, exp1, in0, in1, [x0]
 	b	0b
 END (libat_compare_exchange_16_i1)
 
@@ -169,15 +183,17 @@ END (libat_compare_exchange_16_i1)
 ENTRY (libat_fetch_add_16_i1)
 	mov	x5, x0
 	cbnz	w4, 2f
-1:
-	ldxp	res0, res1, [x5]
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
 	adds	tmplo, reslo, inlo
 	adc	tmphi, reshi, inhi
 	stxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 1b
 	ret
-2:
-	ldaxp	res0, res1, [x5]
+
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
 	adds	tmplo, reslo, inlo
 	adc	tmphi, reshi, inhi
 	stlxp	w4, tmp0, tmp1, [x5]
@@ -189,15 +205,17 @@ END (libat_fetch_add_16_i1)
 ENTRY (libat_add_fetch_16_i1)
 	mov	x5, x0
 	cbnz	w4, 2f
-1:
-	ldxp	res0, res1, [x5]
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
 	adds	reslo, reslo, inlo
 	adc	reshi, reshi, inhi
 	stxp	w4, res0, res1, [x5]
 	cbnz	w4, 1b
 	ret
-2:
-	ldaxp	res0, res1, [x5]
+
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
 	adds	reslo, reslo, inlo
 	adc	reshi, reshi, inhi
 	stlxp	w4, res0, res1, [x5]
@@ -209,15 +227,17 @@ END (libat_add_fetch_16_i1)
 ENTRY (libat_fetch_sub_16_i1)
 	mov	x5, x0