https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88241

            Bug ID: 88241
           Summary: Optimize vector signed integral comparison against 0
                    followed by blend
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jakub at gcc dot gnu.org
  Target Milestone: ---

As reported by Ulrich Drepper privately, with -O2 -msse4 on:

/* GNU vector types, each 16 bytes wide (vector_size (16)), one per element
   type used by the test functions: signed char, int, long long, float and
   double.  */
typedef signed char __v16qi __attribute__((vector_size (16)));
typedef int __v4si __attribute__((vector_size (16)));
typedef long long __v2di __attribute__((vector_size (16)));
typedef float __v4sf __attribute__((vector_size (16)));
typedef double __v2df __attribute__((vector_size (16)));

/* Element-wise select on signed char vectors: where an element of A is
   negative, take the corresponding element of B, otherwise that of C.  */
__v16qi
f1 (__v16qi a, __v16qi b, __v16qi c)
{
  return a < 0 ? b : c;
}

/* Element-wise select on signed char vectors with the inverted condition:
   where an element of A is non-negative, take the corresponding element of
   B, otherwise that of C.  */
__v16qi
f2 (__v16qi a, __v16qi b, __v16qi c)
{
  return a >= 0 ? b : c;
}

/* Element-wise select on int vectors: where an element of A is negative,
   take the corresponding element of B, otherwise that of C.  */
__v4si
f3 (__v4si a, __v4si b, __v4si c)
{
  return a < 0 ? b : c;
}

/* Element-wise select on int vectors with the inverted condition: where an
   element of A is non-negative, take the corresponding element of B,
   otherwise that of C.  */
__v4si
f4 (__v4si a, __v4si b, __v4si c)
{
  return a >= 0 ? b : c;
}

/* Element-wise select on long long vectors: where an element of A is
   negative, take the corresponding element of B, otherwise that of C.  */
__v2di
f5 (__v2di a, __v2di b, __v2di c)
{
  return a < 0 ? b : c;
}

/* Element-wise select on long long vectors with the inverted condition:
   where an element of A is non-negative, take the corresponding element of
   B, otherwise that of C.  */
__v2di
f6 (__v2di a, __v2di b, __v2di c)
{
  return a >= 0 ? b : c;
}

/* Mixed-type select: the condition comes from the int vector A, while the
   selected data are the float vectors B and C.  Where an element of A is
   negative, take the corresponding element of B, otherwise that of C.  */
__v4sf
f7 (__v4si a, __v4sf b, __v4sf c)
{
  return a < 0 ? b : c;
}

/* Mixed-type select with the inverted condition: the condition comes from
   the int vector A, the data from the float vectors B and C.  Where an
   element of A is non-negative, take the corresponding element of B,
   otherwise that of C.  */
__v4sf
f8 (__v4si a, __v4sf b, __v4sf c)
{
  return a >= 0 ? b : c;
}

/* Mixed-type select: the condition comes from the long long vector A, while
   the selected data are the double vectors B and C.  Where an element of A
   is negative, take the corresponding element of B, otherwise that of C.  */
__v2df
f9 (__v2di a, __v2df b, __v2df c)
{
  return a < 0 ? b : c;
}

/* Mixed-type select with the inverted condition: the condition comes from
   the long long vector A, the data from the double vectors B and C.  Where
   an element of A is non-negative, take the corresponding element of B,
   otherwise that of C.  */
__v2df
f10 (__v2di a, __v2df b, __v2df c)
{
  return a >= 0 ? b : c;
}

we emit pcmpgt[bdq] followed by pblendvb or blendvp[sd].
We could optimize those into just pblendvb or blendvp[sd], since those
instructions themselves already test the most significant bit of the
comparison operand.

combine dump shows:
Failed to match this instruction:
(set (reg:V16QI 86)
    (unspec:V16QI [
            (reg:V16QI 91)
            (reg:V16QI 90)
            (lt:V16QI (reg:V16QI 89)
                (const_vector:V16QI [
                        (const_int 0 [0]) repeated x16
                    ]))
        ] UNSPEC_BLENDV))
Failed to match this instruction:
(set (reg:V16QI 89)
    (unspec:V16QI [
            (subreg:V16QI (reg:V4SI 92) 0)
            (subreg:V16QI (reg:V4SI 91) 0)
            (subreg:V16QI (lt:V4SI (reg:V4SI 90)
                    (const_vector:V4SI [
                            (const_int 0 [0]) repeated x4
                        ])) 0)
        ] UNSPEC_BLENDV))
Failed to match this instruction:
(set (reg:V16QI 89)
    (unspec:V16QI [
            (subreg:V16QI (reg:V2DI 91) 0)
            (subreg:V16QI (reg:V2DI 92) 0)
            (subreg:V16QI (lt:V2DI (reg:V2DI 90)
                    (const_vector:V2DI [
                            (const_int 0 [0]) repeated x2
                        ])) 0)
        ] UNSPEC_BLENDV))
Failed to match this instruction:
(set (reg:V4SF 86)
    (unspec:V4SF [
            (reg:V4SF 92)
            (reg:V4SF 91)
            (subreg:V4SF (lt:V4SI (reg:V4SI 90)
                    (const_vector:V4SI [
                            (const_int 0 [0]) repeated x4
                        ])) 0)
        ] UNSPEC_BLENDV))
etc., so I guess adding just a few patterns for combine could solve this.

Reply via email to