https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98435

            Bug ID: 98435
           Summary: [ARM NEON] Missed optimization in expanding vector
                    constructor
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: prathamesh3492 at gcc dot gnu.org
  Target Milestone: ---

For the following test-case:

#include <arm_neon.h>

bfloat16x4_t f1 (bfloat16_t a)
{
  return vdup_n_bf16 (a);
}

bfloat16x4_t f2 (bfloat16_t a)
{
  return (bfloat16x4_t) {a, a, a, a};
}

Compiling for arm-linux-gnueabi with -O3 -mfpu=neon -mfloat-abi=softfp
-march=armv8.2-a+bf16+fp16 results in f2 not being vectorized (no vdup is
emitted for it, unlike f1):

f1:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        vdup.16 d16, r0
        vmov    r0, r1, d16  @ v4bf
        bx      lr


f2:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        mov     r3, r0  @ __bf16
        adr     r1, .L4
        ldrd    r0, [r1]
        mov     r2, r3  @ __bf16
        mov     ip, r3  @ __bf16
        bfi     r1, r2, #0, #16
        bfi     r0, ip, #0, #16
        bfi     r1, r3, #16, #16
        bfi     r0, r2, #16, #16
        bx      lr


The .optimized dump for the two functions shows:
bfloat16x4_t f1 (bfloat16_t a)
{
  __simd64_bfloat16_t _3;

  <bb 2> [local count: 1073741824]:
  _3 = __builtin_neon_vdup_nv4bf (a_2(D)); [tail call]
  return _3;

}

bfloat16x4_t f2 (bfloat16_t a)
{
  bfloat16x4_t _2;

  <bb 2> [local count: 1073741824]:
  _2 = {a_1(D), a_1(D), a_1(D), a_1(D)};
  return _2;
}
  • [Bug target/98435] New: [AR... prathamesh3492 at gcc dot gnu.org via Gcc-bugs

Reply via email to