https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120378

            Bug ID: 120378
           Summary: Support narrowing clip idiom
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rdapp at gcc dot gnu.org
  Target Milestone: ---
            Target: riscv

x264 contains a variation of the following loop (in hpel_filter):

/* Local definitions of the fixed-width type names used by the reproducer,
   so that no header is needed.  int16_t is not referenced by the code shown
   here.  */
typedef unsigned char uint8_t;
typedef short int16_t;

/* Clip X to the range [0, 255] and return it as a uint8_t.

   `x & (~255)' is non-zero exactly when some bit above the low eight is
   set, i.e. when X is negative or greater than 255.  In that case the
   result comes from `(-x) >> 31'; otherwise X is returned unchanged.

   NOTE(review): the out-of-range branch assumes a 32-bit int and an
   arithmetic right shift of negative values (implementation-defined in
   ISO C).  Under those assumptions it yields -1 (255 after conversion to
   uint8_t) for X > 255 and 0 for X < 0.  X == INT_MIN would overflow in
   `-x' -- confirm callers never pass it.  */
inline
uint8_t
x264_clip_uint8 (int x)
{
  return x & (~255) ? (-x) >> 31 : x;
}

/* For each I in [0, W), store x264_clip_uint8 (X[I]) into RES[I].
   Nothing is written when W is zero or negative.

   The noipa attribute asks GCC not to apply interprocedural optimizations
   to this function; presumably it is here so the loop is compiled as a
   stand-alone function for the purpose of this report.  */
void
__attribute__ ((noipa))
x264_clip_loop (uint8_t *res, int *x, int w)
{
  for (int i = 0; i < w; i++)
    res[i] = x264_clip_uint8 (x[i]);
}

Currently we generate:

 .L4:
        vsetvli a5,a2,e32,m1,ta,mu
        vle32.v v1,0(a1)
        sub     a2,a2,a5
        sh2add  a1,a5,a1
        vmsgtu.vv       v0,v1,v3
        vrsub.vi        v2,v1,0
        vsra.vi v1,v2,31,v0.t
        vsetvli zero,zero,e16,mf2,ta,ma
        vnsrl.wi        v1,v1,0
        vsetvli zero,zero,e8,mf4,ta,ma
        vnsrl.wi        v1,v1,0
        vse8.v  v1,0(a0)
        add     a0,a0,a5
        bne     a2,zero,.L4

That's a literal vectorization of the code and not bad.  However, clang does a
bit better here by making use of vnclipu:

.LBB0_13:                               # =>This Inner Loop Header: Depth=1
        vl2re32.v       v8, (a5)
        vsetvli a3, zero, e32, m2, ta, ma
        vmax.vx v8, v8, zero
        vsetvli zero, zero, e16, m1, ta, ma
        vnclipu.wi      v10, v8, 0
        vsetvli zero, zero, e8, mf2, ta, ma
        vnclipu.wi      v8, v10, 0
        vse8.v  v8, (a4)
        add     a5, a5, t0
        add     a4, a4, a7
        bne     a4, t1, .LBB0_13


The ifcvt'ed code before vect is:

  _4 = *_3;
  x.0_12 = (unsigned int) _4;
  _38 = -x.0_12;
  _15 = (int) _38;
  _16 = _15 >> 31;
  _29 = x.0_12 > 255;
  _17 = _29 ? _16 : _4;
  _18 = (unsigned char) _17;

I guess that's a case for match.pd and vect patterns.  I'm just not sure yet
how to properly recognize the idiom as we need to ensure that _15's sign-bit is
set.

Reply via email to