RE: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]
> -Original Message- > From: Wilco Dijkstra > Sent: Thursday, March 16, 2023 6:22 PM > To: GCC Patches > Cc: Richard Sandiford ; Kyrylo Tkachov > > Subject: Re: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891] > > ping > > > From: Wilco Dijkstra > Sent: 23 February 2023 15:11 > To: GCC Patches > Cc: Richard Sandiford ; Kyrylo Tkachov > > Subject: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891] > > > The LSE2 ifunc for 16-byte atomic load requires a barrier before the LDP - > without it, it effectively has Load-AcquirePC semantics similar to LDAPR, > which is less restrictive than what __ATOMIC_SEQ_CST requires. This patch > fixes this and adds comments to make it easier to see which sequence is > used for each case. Use a load/store exclusive loop for store to simplify > testing memory ordering is correct (it is slightly faster too). > > Passes regress, OK for commit? > Ok. Thanks, Kyrill > libatomic/ > PR libgcc/108891 > config/linux/aarch64/atomic_16.S: Fix libat_load_16_i1. > Add comments describing the memory order. > > --- > > diff --git a/libatomic/config/linux/aarch64/atomic_16.S > b/libatomic/config/linux/aarch64/atomic_16.S > index > 732c3534a06678664a252bdbc53652eeab0af506..05439ce394b9653c9bcb582 > 761ff7aaa7c8f9643 100644 > --- a/libatomic/config/linux/aarch64/atomic_16.S > +++ b/libatomic/config/linux/aarch64/atomic_16.S > @@ -72,33 +72,38 @@ name: \ > > ENTRY (libat_load_16_i1) > cbnzw1, 1f > + > + /* RELAXED. */ > ldp res0, res1, [x0] > ret > 1: > - cmp w1, ACQUIRE > - b.hi2f > + cmp w1, SEQ_CST > + b.eq2f > + > + /* ACQUIRE/CONSUME (Load-AcquirePC semantics). */ > ldp res0, res1, [x0] > dmb ishld > ret > -2: > + > + /* SEQ_CST. */ > +2: ldartmp0, [x0] /* Block reordering with Store-Release instr. > */ > ldp res0, res1, [x0] > - dmb ish > + dmb ishld > ret > END (libat_load_16_i1) > > > ENTRY (libat_store_16_i1) > cbnzw4, 1f > + > + /* RELAXED. */ > stp in0, in1, [x0] > ret > -1: > - dmb ish > - stp in0, in1, [x0] > - cmp w4, SEQ_CST > - beq 2f > - ret > -2: > - dmb ish > + > + /* RELEASE/SEQ_CST. */ > +1: ldaxp xzr, tmp0, [x0] > + stlxp w4, in0, in1, [x0] > + cbnzw4, 1b > ret > END (libat_store_16_i1) > > @@ -106,29 +111,33 @@ END (libat_store_16_i1) > ENTRY (libat_exchange_16_i1) > mov x5, x0 > cbnzw4, 2f > -1: > - ldxpres0, res1, [x5] > + > + /* RELAXED. */ > +1: ldxpres0, res1, [x5] > stxpw4, in0, in1, [x5] > cbnzw4, 1b > ret > 2: > cmp w4, ACQUIRE > b.hi4f > -3: > - ldaxp res0, res1, [x5] > + > + /* ACQUIRE/CONSUME. */ > +3: ldaxp res0, res1, [x5] > stxpw4, in0, in1, [x5] > cbnzw4, 3b > ret > 4: > cmp w4, RELEASE > b.ne6f > -5: > - ldxpres0, res1, [x5] > + > + /* RELEASE. */ > +5: ldxpres0, res1, [x5] > stlxp w4, in0, in1, [x5] > cbnzw4, 5b > ret > -6: > - ldaxp res0, res1, [x5] > + > + /* ACQ_REL/SEQ_CST. */ > +6: ldaxp res0, res1, [x5] > stlxp w4, in0, in1, [x5] > cbnzw4, 6b > ret > @@ -142,6 +151,8 @@ ENTRY (libat_compare_exchange_16_i1) > cbz w4, 2f > cmp w4, RELEASE > b.hs3f > + > + /* ACQUIRE/CONSUME. */ > caspa exp0, exp1, in0, in1, [x0] > 0: > cmp exp0, tmp0 > @@ -153,15 +164,18 @@ ENTRY (libat_compare_exchange_16_i1) > stp exp0, exp1, [x1] > mov x0, 0 > ret > -2: > - caspexp0, exp1, in0, in1, [x0] > + > + /* RELAXED. */ > +2: caspexp0, exp1, in0, in1, [x0] > b 0b > -3: > - b.hi4f > + > + /* RELEASE. */ > +3: b.hi4f > caspl exp0, exp1, in0, in1, [x0] > b 0b > -4: > - caspal exp0, exp1, in0, in1, [x0] > + > + /* ACQ_REL/SEQ_CST. */ > +4: caspal exp0, exp1, in0, in1, [x0] > b 0b > END (libat_compare_exchange_16_i1)
Re: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]
ping From: Wilco Dijkstra Sent: 23 February 2023 15:11 To: GCC Patches Cc: Richard Sandiford ; Kyrylo Tkachov Subject: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891] The LSE2 ifunc for 16-byte atomic load requires a barrier before the LDP - without it, it effectively has Load-AcquirePC semantics similar to LDAPR, which is less restrictive than what __ATOMIC_SEQ_CST requires. This patch fixes this and adds comments to make it easier to see which sequence is used for each case. Use a load/store exclusive loop for store to simplify testing memory ordering is correct (it is slightly faster too). Passes regress, OK for commit? libatomic/ PR libgcc/108891 config/linux/aarch64/atomic_16.S: Fix libat_load_16_i1. Add comments describing the memory order. --- diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S index 732c3534a06678664a252bdbc53652eeab0af506..05439ce394b9653c9bcb582761ff7aaa7c8f9643 100644 --- a/libatomic/config/linux/aarch64/atomic_16.S +++ b/libatomic/config/linux/aarch64/atomic_16.S @@ -72,33 +72,38 @@ name: \ ENTRY (libat_load_16_i1) cbnz w1, 1f + + /* RELAXED. */ ldp res0, res1, [x0] ret 1: - cmp w1, ACQUIRE - b.hi 2f + cmp w1, SEQ_CST + b.eq 2f + + /* ACQUIRE/CONSUME (Load-AcquirePC semantics). */ ldp res0, res1, [x0] dmb ishld ret -2: + + /* SEQ_CST. */ +2: ldar tmp0, [x0] /* Block reordering with Store-Release instr. */ ldp res0, res1, [x0] - dmb ish + dmb ishld ret END (libat_load_16_i1) ENTRY (libat_store_16_i1) cbnz w4, 1f + + /* RELAXED. */ stp in0, in1, [x0] ret -1: - dmb ish - stp in0, in1, [x0] - cmp w4, SEQ_CST - beq 2f - ret -2: - dmb ish + + /* RELEASE/SEQ_CST. */ +1: ldaxp xzr, tmp0, [x0] + stlxp w4, in0, in1, [x0] + cbnz w4, 1b ret END (libat_store_16_i1) @@ -106,29 +111,33 @@ END (libat_store_16_i1) ENTRY (libat_exchange_16_i1) mov x5, x0 cbnz w4, 2f -1: - ldxp res0, res1, [x5] + + /* RELAXED. */ +1: ldxp res0, res1, [x5] stxp w4, in0, in1, [x5] cbnz w4, 1b ret 2: cmp w4, ACQUIRE b.hi 4f -3: - ldaxp res0, res1, [x5] + + /* ACQUIRE/CONSUME. */ +3: ldaxp res0, res1, [x5] stxp w4, in0, in1, [x5] cbnz w4, 3b ret 4: cmp w4, RELEASE b.ne 6f -5: - ldxp res0, res1, [x5] + + /* RELEASE. */ +5: ldxp res0, res1, [x5] stlxp w4, in0, in1, [x5] cbnz w4, 5b ret -6: - ldaxp res0, res1, [x5] + + /* ACQ_REL/SEQ_CST. */ +6: ldaxp res0, res1, [x5] stlxp w4, in0, in1, [x5] cbnz w4, 6b ret @@ -142,6 +151,8 @@ ENTRY (libat_compare_exchange_16_i1) cbz w4, 2f cmp w4, RELEASE b.hs 3f + + /* ACQUIRE/CONSUME. */ caspa exp0, exp1, in0, in1, [x0] 0: cmp exp0, tmp0 @@ -153,15 +164,18 @@ ENTRY (libat_compare_exchange_16_i1) stp exp0, exp1, [x1] mov x0, 0 ret -2: - casp exp0, exp1, in0, in1, [x0] + + /* RELAXED. */ +2: casp exp0, exp1, in0, in1, [x0] b 0b -3: - b.hi 4f + + /* RELEASE. */ +3: b.hi 4f caspl exp0, exp1, in0, in1, [x0] b 0b -4: - caspal exp0, exp1, in0, in1, [x0] + + /* ACQ_REL/SEQ_CST. */ +4: caspal exp0, exp1, in0, in1, [x0] b 0b END (libat_compare_exchange_16_i1) @@ -169,15 +183,17 @@ END (libat_compare_exchange_16_i1) ENTRY (libat_fetch_add_16_i1) mov x5, x0 cbnz w4, 2f -1: - ldxp res0, res1, [x5] + + /* RELAXED. */ +1: ldxp res0, res1, [x5] adds tmplo, reslo, inlo adc tmphi, reshi, inhi stxp w4, tmp0, tmp1, [x5] cbnz w4, 1b ret -2: - ldaxp res0, res1, [x5] + + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] adds tmplo, reslo, inlo adc tmphi, reshi, inhi stlxp w4, tmp0, tmp1, [x5] @@ -189,15 +205,17 @@ END (libat_fetch_add_16_i1) ENTRY (libat_add_fetch_16_i1) mov x5, x0 cbnz w4, 2f -1: - ldxp res0, res1, [x5] + + /* RELAXED. */ +1: ldxp res0, res1, [x5] adds reslo, reslo, inlo adc reshi, reshi, inhi stxp w4, res0, res1, [x5] cbnz w4, 1b ret -2: - ldaxp res0, res1, [x5] + + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST
[PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]
The LSE2 ifunc for 16-byte atomic load requires a barrier before the LDP - without it, it effectively has Load-AcquirePC semantics similar to LDAPR, which is less restrictive than what __ATOMIC_SEQ_CST requires. This patch fixes this and adds comments to make it easier to see which sequence is used for each case. Use a load/store exclusive loop for store to simplify testing memory ordering is correct (it is slightly faster too). Passes regress, OK for commit? libatomic/ PR libgcc/108891 config/linux/aarch64/atomic_16.S: Fix libat_load_16_i1. Add comments describing the memory order. --- diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S index 732c3534a06678664a252bdbc53652eeab0af506..05439ce394b9653c9bcb582761ff7aaa7c8f9643 100644 --- a/libatomic/config/linux/aarch64/atomic_16.S +++ b/libatomic/config/linux/aarch64/atomic_16.S @@ -72,33 +72,38 @@ name: \ ENTRY (libat_load_16_i1) cbnzw1, 1f + + /* RELAXED. */ ldp res0, res1, [x0] ret 1: - cmp w1, ACQUIRE - b.hi2f + cmp w1, SEQ_CST + b.eq2f + + /* ACQUIRE/CONSUME (Load-AcquirePC semantics). */ ldp res0, res1, [x0] dmb ishld ret -2: + + /* SEQ_CST. */ +2: ldartmp0, [x0] /* Block reordering with Store-Release instr. */ ldp res0, res1, [x0] - dmb ish + dmb ishld ret END (libat_load_16_i1) ENTRY (libat_store_16_i1) cbnzw4, 1f + + /* RELAXED. */ stp in0, in1, [x0] ret -1: - dmb ish - stp in0, in1, [x0] - cmp w4, SEQ_CST - beq 2f - ret -2: - dmb ish + + /* RELEASE/SEQ_CST. */ +1: ldaxp xzr, tmp0, [x0] + stlxp w4, in0, in1, [x0] + cbnzw4, 1b ret END (libat_store_16_i1) @@ -106,29 +111,33 @@ END (libat_store_16_i1) ENTRY (libat_exchange_16_i1) mov x5, x0 cbnzw4, 2f -1: - ldxpres0, res1, [x5] + + /* RELAXED. */ +1: ldxpres0, res1, [x5] stxpw4, in0, in1, [x5] cbnzw4, 1b ret 2: cmp w4, ACQUIRE b.hi4f -3: - ldaxp res0, res1, [x5] + + /* ACQUIRE/CONSUME. */ +3: ldaxp res0, res1, [x5] stxpw4, in0, in1, [x5] cbnzw4, 3b ret 4: cmp w4, RELEASE b.ne6f -5: - ldxpres0, res1, [x5] + + /* RELEASE. */ +5: ldxpres0, res1, [x5] stlxp w4, in0, in1, [x5] cbnzw4, 5b ret -6: - ldaxp res0, res1, [x5] + + /* ACQ_REL/SEQ_CST. */ +6: ldaxp res0, res1, [x5] stlxp w4, in0, in1, [x5] cbnzw4, 6b ret @@ -142,6 +151,8 @@ ENTRY (libat_compare_exchange_16_i1) cbz w4, 2f cmp w4, RELEASE b.hs3f + + /* ACQUIRE/CONSUME. */ caspa exp0, exp1, in0, in1, [x0] 0: cmp exp0, tmp0 @@ -153,15 +164,18 @@ ENTRY (libat_compare_exchange_16_i1) stp exp0, exp1, [x1] mov x0, 0 ret -2: - caspexp0, exp1, in0, in1, [x0] + + /* RELAXED. */ +2: caspexp0, exp1, in0, in1, [x0] b 0b -3: - b.hi4f + + /* RELEASE. */ +3: b.hi4f caspl exp0, exp1, in0, in1, [x0] b 0b -4: - caspal exp0, exp1, in0, in1, [x0] + + /* ACQ_REL/SEQ_CST. */ +4: caspal exp0, exp1, in0, in1, [x0] b 0b END (libat_compare_exchange_16_i1) @@ -169,15 +183,17 @@ END (libat_compare_exchange_16_i1) ENTRY (libat_fetch_add_16_i1) mov x5, x0 cbnzw4, 2f -1: - ldxpres0, res1, [x5] + + /* RELAXED. */ +1: ldxpres0, res1, [x5] addstmplo, reslo, inlo adc tmphi, reshi, inhi stxpw4, tmp0, tmp1, [x5] cbnzw4, 1b ret -2: - ldaxp res0, res1, [x5] + + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] addstmplo, reslo, inlo adc tmphi, reshi, inhi stlxp w4, tmp0, tmp1, [x5] @@ -189,15 +205,17 @@ END (libat_fetch_add_16_i1) ENTRY (libat_add_fetch_16_i1) mov x5, x0 cbnzw4, 2f -1: - ldxpres0, res1, [x5] + + /* RELAXED. */ +1: ldxpres0, res1, [x5] addsreslo, reslo, inlo adc reshi, reshi, inhi stxpw4, res0, res1, [x5] cbnzw4, 1b ret -2: - ldaxp res0, res1, [x5] + + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] addsreslo, reslo, inlo adc reshi, reshi, inhi stlxp w4, res0, res1, [x5] @@ -209,15 +227,17 @@ END (libat_add_fetch_16_i1) ENTRY (libat_fetch_sub_16_i1) mov x5, x0