https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113600

--- Comment #5 from Hongtao Liu <liuhongt at gcc dot gnu.org> ---
It looks like x264_pixel_satd_16x16 consumes more time after my commit. An
extracted case is below; note that there is no __attribute__((always_inline))
on the original x264_pixel_satd_8x4, it is added here to force inlining (under
PGO the function is hot and gets inlined).

typedef unsigned char uint8_t;
typedef unsigned uint32_t;
typedef unsigned short uint16_t;

// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
static inline uint32_t abs2( uint32_t a )
{
    uint32_t s = ((a>>15)&0x10001)*0xffff;
    return (a+s)^s;
}

int
__attribute__((always_inline))
x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
  uint32_t tmp[4][4];
  uint32_t a0, a1, a2, a3;
  int sum = 0;
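  /* horizontal 4-point Hadamard transform of the difference rows; each
     uint32_t packs two 16-bit differences (columns 0-3 in the low halves,
     columns 4-7 in the high halves) */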
  for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
      {
        int t0 = a0 + a1; int t1 = a0 - a1;
        int t2 = a2 + a3; int t3 = a2 - a3;
        tmp[i][0] = t0 + t2; tmp[i][2] = t0 - t2;
        tmp[i][1] = t1 + t3; tmp[i][3] = t1 - t3;
      }
    }
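  /* vertical 4-point Hadamard transform, then accumulate packed absolute values */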
  for( int i = 0; i < 4; i++ )
    {
      {
        int t0 = tmp[0][i] + tmp[1][i]; int t1 = tmp[0][i] - tmp[1][i];
        int t2 = tmp[2][i] + tmp[3][i]; int t3 = tmp[2][i] - tmp[3][i];
        a0 = t0 + t2; a2 = t0 - t2;
        a1 = t1 + t3; a3 = t1 - t3;
      }
      sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
  return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
}

int x264_pixel_satd_16x16( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
  int sum = x264_pixel_satd_8x4( pix1, i_pix1, pix2, i_pix2 )
    + x264_pixel_satd_8x4( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );
  sum+= x264_pixel_satd_8x4( pix1+8, i_pix1, pix2+8, i_pix2 )
    + x264_pixel_satd_8x4( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );
  sum+= x264_pixel_satd_8x4( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )
    + x264_pixel_satd_8x4( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );
  sum+= x264_pixel_satd_8x4( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )
    + x264_pixel_satd_8x4( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );
  return sum;
}
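
For reference, one way to build the extracted case and inspect the vectorizer's
decisions (the file name and the flags here are my assumptions, not taken from
the original x264 build):

  gcc -O3 -march=x86-64-v3 -fopt-info-vec-missed -fdump-tree-slp2-details -S satd.c

The slp2 (or vect) details dump should show where the store group feeding tmp
is built and split.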


After the commit, SLP fails: the group of size 16 (vector(16) int) is split into
smaller groups of 4 + 12, and vectorization is missed for the cases below.

  vect_t2_2445.784_8503 = VIEW_CONVERT_EXPR<vector(4) int>(_8502);
  vect__2457.786_8505 = vect_t0_2441.783_8501 - vect_t2_2445.784_8503;
  vect__2448.785_8504 = vect_t0_2441.783_8501 + vect_t2_2445.784_8503;
  _8506 = VEC_PERM_EXPR <vect__2448.785_8504, vect__2457.786_8505, { 0, 1, 6, 7 }>;
  vect__2449.787_8507 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_8506);
  t3_2447 = (int) _2446;
  _2448 = t0_2441 + t2_2445;
  _2449 = (unsigned int) _2448;
  _2451 = t0_2441 - t2_2445;
  _2452 = (unsigned int) _2451;
  _2454 = t1_2443 + t3_2447;
  _2455 = (unsigned int) _2454;
  _2457 = t1_2443 - t3_2447;
  _2458 = (unsigned int) _2457;
  MEM <vector(4) unsigned int> [(unsigned int *)&tmp + 16B] = vect__2449.787_8507;


The vector store would normally be optimized away together with the later
vector load; in the bad case it is not, so there are STLF (store-to-load
forwarding) issues.
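
Purely as an illustration of that pattern (this sketch and its names are mine,
not taken from the PR; it only pictures the access shapes), the bad case boils
down to a wide vector store into tmp that is then read back with narrower
scalar loads:

#include <stdint.h>
#include <string.h>

typedef uint32_t v4u32 __attribute__((vector_size(16)));

uint32_t stlf_sketch( v4u32 row )
{
  uint32_t tmp[4];

  /* one wide 16-byte store, like the vectorized first loop writing tmp[1][0..3] */
  memcpy( tmp, &row, 16 );

  /* four narrow 4-byte loads of the same bytes, like the scalar second loop;
     store-to-load forwarding typically cannot service a narrow load out of a
     pending wider store, so the loads stall until the store commits */
  return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}

When both loops are vectorized, the store and the matching vector load through
tmp are elided and the forwarding problem never arises.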
