[Bug target/97906] New: [ARM NEON] Missed optimization in lowering to vcage

prathamesh3492 at gcc dot gnu.org via Gcc-bugs Thu, 19 Nov 2020 03:33:03 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97906


            Bug ID: 97906
           Summary: [ARM NEON] Missed optimization in lowering to vcage
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: prathamesh3492 at gcc dot gnu.org
  Target Milestone: ---

Hi,
Similar to PR97872 and PR97903, for the following test case:

#include <arm_neon.h>

/* Reproducer: takes the absolute value of each operand with vabs_f32 and
   compares the results with the vector >= operator.  Per the GIMPLE dump in
   this report, each lane of the result is -1 where the comparison holds and
   0 otherwise.  The code-gen listing in this report shows this form being
   compiled to per-lane scalar compares instead of a single vacge.f32.  */
uint32x2_t f1(float32x2_t a, float32x2_t b)
{
  return vabs_f32 (a) >= vabs_f32 (b);
}

/* Reference case: calls the __builtin_neon_vcagev2sf builtin directly and
   casts its result to uint32x2_t.  The code-gen listing in this report shows
   it compiling to a single vacge.f32; the report presents this as the
   lowering wanted for f1.  */
uint32x2_t f2(float32x2_t a, float32x2_t b)
{
  return (uint32x2_t) __builtin_neon_vcagev2sf (a, b);
}

Code-gen:

f2:
        vacge.f32  d0, d0, d1
        bx         lr

f1:
        vabs.f32        d0, d0
        vabs.f32        d1, d1
        sub     sp, sp, #8
        vmov.32 r3, d0[0]
        vmov    s13, r3
        vmov.32 r3, d1[0]
        vmov    s12, r3
        vmov.32 r3, d1[1]
        vcmpe.f32       s12, s13
        vmov    s14, r3
        vmov.32 r3, d0[1]
        vmrs    APSR_nzcv, FPSCR
        vmov    s15, r3
        ite     ls
        movls   r3, #-1
        movhi   r3, #0
        vcmpe.f32       s14, s15
        str     r3, [sp]
        vmrs    APSR_nzcv, FPSCR
        ite     ls
        movls   r3, #-1
        movhi   r3, #0
        str     r3, [sp, #4]
        vldr    d0, [sp]
        add     sp, sp, #8
        @ sp needed
        bx      lr

For f1, it is initially lowered to:

f1 (float32x2_t a, float32x2_t b)
{
  vector(2) <signed-boolean:32> _1;
  vector(2) int _2;
  uint32x2_t _6;
  __simd64_float32_t _7;
  __simd64_float32_t _8;

  <bb 2> [local count: 1073741824]:
  _8 = __builtin_neon_vabsv2sf (a_4(D));
  _7 = __builtin_neon_vabsv2sf (b_5(D));
  _1 = _7 <= _8;
  _2 = VEC_COND_EXPR <_1, { -1, -1 }, { 0, 0 }>;
  _6 = VIEW_CONVERT_EXPR<uint32x2_t>(_2);
  return _6;
}

and veclower seems to "scalarize" the cond_expr op:

f1 (float32x2_t a, float32x2_t b)
{
  vector(2) int _2;
  uint32x2_t _6;
  __simd64_float32_t _7;
  __simd64_float32_t _8;
  float _11;
  float _12;
  int _13;
  float _14;
  float _15;
  int _16;

  <bb 2> [local count: 1073741824]:
  _8 = __builtin_neon_vabsv2sf (a_4(D));
  _7 = __builtin_neon_vabsv2sf (b_5(D));
  _11 = BIT_FIELD_REF <_7, 32, 0>;
  _12 = BIT_FIELD_REF <_8, 32, 0>;
  _13 = _11 <= _12 ? -1 : 0;
  _14 = BIT_FIELD_REF <_7, 32, 32>;
  _15 = BIT_FIELD_REF <_8, 32, 32>;
  _16 = _14 <= _15 ? -1 : 0;
  _2 = {_13, _16};
  _6 = VIEW_CONVERT_EXPR<uint32x2_t>(_2);
  return _6;

}

Thanks,
Prathamesh

[Bug target/97906] New: [ARM NEON] Missed optimization in lowering to vcage

Reply via email to