Hi,

The patch expands even/odd permutations using:
"and, and, pack" in the even case
"shift, shift, pack" in the odd case

instead of the current "pshufb, pshufb, or" sequence or a big set of unpack insns.

AVX2/CORE bootstrap and make check passed.
Expensive tests are in progress.

Is it ok for trunk?

Evgeny

2014-11-20  Evgeny Stupachenko  <evstu...@gmail.com>

gcc/testsuite
        PR target/60451
        * gcc.target/i386/pr60451.c: New.

gcc/
        PR target/60451
        * config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
        (expand_vec_perm_even_odd_1): Add new expand for SSE cases,
        replace the existing expand with it for AVX2 cases.
        (ix86_expand_vec_perm_const_1): Add new expand.


+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+   with two "and" and "pack" or two "shift" and "pack" insns.  We should
+   have already failed all two instruction sequences.
+
+   Return true if D describes such a permutation and the insns needed for
+   its mode are available on the target; when D->testing_p is set, nothing
+   is emitted.  Return false otherwise so that the caller can try another
+   expansion strategy.  */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+  rtx op, dop0, dop1, t, rperm[16];
+  unsigned i, odd, c, s, nelt = d->nelt;
+  bool end_perm = false;
+  machine_mode half_mode;
+  rtx (*gen_and) (rtx, rtx, rtx);
+  rtx (*gen_pack) (rtx, rtx, rtx);
+  rtx (*gen_shift) (rtx, rtx, rtx);
+
+  /* The sequence packs two distinct inputs.  */
+  if (d->one_operand_p)
+    return false;
+
+  /* Select the mask C, the shift count S, the double-width element mode and
+     the insn generators.  Each mode checks for the ISA its own insns need
+     instead of sharing one gate, so that the 256-bit modes are never
+     expanded without AVX2.  */
+  switch (d->vmode)
+    {
+    case V8HImode:
+      /* Required for "pack": packusdw is an SSE4.1 insn.  */
+      if (!TARGET_SSE4_1)
+        return false;
+      c = 0xffff;
+      s = 16;
+      half_mode = V4SImode;
+      gen_and = gen_andv4si3;
+      gen_pack = gen_sse4_1_packusdw;
+      gen_shift = gen_lshrv4si3;
+      break;
+
+    case V16QImode:
+      /* No check as all instructions are SSE2.  */
+      c = 0xff;
+      s = 8;
+      half_mode = V8HImode;
+      gen_and = gen_andv8hi3;
+      gen_pack = gen_sse2_packuswb;
+      gen_shift = gen_lshrv8hi3;
+      break;
+
+    case V16HImode:
+      /* 256-bit integer "and", "shift" and "pack" all need AVX2.  */
+      if (!TARGET_AVX2)
+        return false;
+      c = 0xffff;
+      s = 16;
+      half_mode = V8SImode;
+      gen_and = gen_andv8si3;
+      gen_pack = gen_avx2_packusdw;
+      gen_shift = gen_lshrv8si3;
+      end_perm = true;
+      break;
+
+    case V32QImode:
+      /* 256-bit integer "and", "shift" and "pack" all need AVX2.  */
+      if (!TARGET_AVX2)
+        return false;
+      c = 0xff;
+      s = 8;
+      half_mode = V16HImode;
+      gen_and = gen_andv16hi3;
+      gen_pack = gen_avx2_packuswb;
+      gen_shift = gen_lshrv16hi3;
+      end_perm = true;
+      break;
+
+    default:
+      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+         general shuffles.  */
+      return false;
+    }
+
+  /* Check that permutation is even or odd.  */
+  odd = d->perm[0];
+  if (odd > 1)
+    return false;
+
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != 2 * i + odd)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  dop0 = gen_reg_rtx (half_mode);
+  dop1 = gen_reg_rtx (half_mode);
+  if (odd == 0)
+    {
+      /* Extract-even: view the operands in the double-width mode and mask
+         every element with C, so that only its low half survives and the
+         unsigned-saturating pack leaves the value unchanged.  */
+      for (i = 0; i < nelt / 2; i++)
+        rperm[i] = GEN_INT (c);
+      t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+      t = force_reg (half_mode, t);
+      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+    }
+  else
+    {
+      /* Extract-odd: a logical right shift by S moves the high half of
+         every double-width element into its low half and zeroes the rest.  */
+      emit_insn (gen_shift (dop0,
+                            gen_lowpart (half_mode, d->op0),
+                            GEN_INT (s)));
+      emit_insn (gen_shift (dop1,
+                            gen_lowpart (half_mode, d->op1),
+                            GEN_INT (s)));
+    }
+  /* In AVX2 for 256 bit case we need to permute pack result: the 256-bit
+     pack works on each 128-bit lane separately, so the quadwords of its
+     result have to be put back into order 0, 2, 1, 3.  */
+  if (end_perm)
+    {
+      op = gen_reg_rtx (d->vmode);
+      t = gen_reg_rtx (V4DImode);
+      emit_insn (gen_pack (op, dop0, dop1));
+      emit_insn (gen_avx2_permv4di_1 (t,
+                                      gen_lowpart (V4DImode, op),
+                                      const0_rtx,
+                                      const2_rtx,
+                                      const1_rtx,
+                                      GEN_INT (3)));
+      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+    }
+  else
+    emit_insn (gen_pack (d->target, dop0, dop1));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
    and extract-odd permutations.  */

@@ -48393,6 +48503,8 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
       gcc_unreachable ();

     case V8HImode:
+      if (TARGET_SSE4_2)
+       return expand_vec_perm_even_odd_pack (d);
       if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
       else
@@ -48416,6 +48528,8 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
       break;

     case V16QImode:
+      if (TARGET_SSE4_2)
+       return expand_vec_perm_even_odd_pack (d);
       if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
        return expand_vec_perm_pshufb2 (d);
       else
@@ -48441,7 +48555,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)

     case V16HImode:
     case V32QImode:
-      return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+      return expand_vec_perm_even_odd_pack (d);

     case V4DImode:
       if (!TARGET_AVX2)
@@ -48814,6 +48928,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)

   /* Try sequences of three instructions.  */

+  if (expand_vec_perm_even_odd_pack (d))
+    return true;
+
   if (expand_vec_perm_2vperm2f128_vshuf (d))
     return true;

diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c b/gcc/testsuite/gcc.target/i386/pr60451.c
new file mode 100644
index 0000000..29f019d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr60451.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -ftree-vectorize -msse4.2" } */
+
+void
+foo (unsigned char *a, unsigned char *b, unsigned char *c, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)  /* Byte loop to be vectorized.  */
+    a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117);
+}
+
+/* { dg-final { scan-assembler "packuswb|vpunpck" } } */

Reply via email to