https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69904

            Bug ID: 69904
           Summary: [6 Regression] shrink-wrapping creates weird atomic
                    compare exchange loop on arm
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
                CC: segher at gcc dot gnu.org
  Target Milestone: ---
            Target: arm

Consider the code:

#include <stdatomic.h>


atomic_uint foo;
atomic_uint bar;
int glob;


int main(void)
{
  glob = atomic_compare_exchange_strong (&foo, &bar, 0);
  return glob;
}

At -O2 -march=armv7-a GCC 5 generates:
main:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        movw    r2, #:lower16:bar
        movw    r3, #:lower16:foo
        movt    r2, #:upper16:bar
        movt    r3, #:upper16:foo
        mov     r0, #0
        str     lr, [sp, #-4]!
        ldr     r1, [r2]
        dmb     sy
.L3:
        ldrex   ip, [r3]
        cmp     ip, r1
        bne     .L4
        strex   lr, r0, [r3]
        cmp     lr, #0
        bne     .L3
.L4:
        movw    r3, #:lower16:glob
        moveq   r1, #1
        movne   r1, r0
        movt    r3, #:upper16:glob
        dmb     sy
        mov     r0, r1
        strne   ip, [r2]
        str     r1, [r3]
        ldr     pc, [sp], #4

GCC 6 generates:
main:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        movw    r2, #:lower16:bar
        movt    r2, #:upper16:bar
        movw    r3, #:lower16:foo
        mov     r0, #0
        ldr     r1, [r2]
        movt    r3, #:upper16:foo
        dmb     ish
        ldrex   ip, [r3]
        cmp     ip, r1
        bne     .L14
        str     lr, [sp, #-4]! // Stack push moved below barrier
.L8:
        strex   lr, r0, [r3]
        cmp     lr, #0
        bne     .L3
.L4:
        movw    r3, #:lower16:glob
        movt    r3, #:upper16:glob
        moveq   r1, #1
        movne   r1, r0
        dmb     ish
        mov     r0, r1
        strne   ip, [r2]
        str     r1, [r3]
        ldr     pc, [sp], #4  //pop stack and return
.L3:
        ldrex   ip, [r3]
        cmp     ip, r1
        beq     .L8
        b       .L4
.L14:
        movw    r3, #:lower16:glob
        movt    r3, #:upper16:glob
        moveq   r1, #1
        movne   r1, r0
        dmb     ish
        mov     r0, r1
        strne   ip, [r2]
        str     r1, [r3]
        bx      lr  // simple return


Notice how the stack push "str lr, [sp, #-4]!" got moved below the barrier "dmb
ish". It turns out that this is not fatal from a correctness perspective because
if that push is executed after the load-exclusive has executed, then in ARMv7-A
it may clear the exclusive monitor (according to the architecture, it is
implementation defined whether it clears the exclusive monitor or not), causing
the store-exclusive after .L8 to fail, which will cause the control flow to
jump to .L3 and replay the load-exclusive/store-exclusive pair without the
intervening stack push. So it all sort of works out in the end, but it is really
suboptimal and not the sequence one would expect for this code.

This occurs during shrink-wrapping. -fno-shrink-wrap "fixes" this.

Reply via email to