From: Andi Kleen <a...@gcc.gnu.org>

[v3 version: Remove unnecessary _mask pattern.
Add extra FAIL case. Remove unnecessary AVX512F check.
Fix changelog.]

[v2 version: Split rotate patterns in V16QI and V32/64QI.
Add various AVX512F checks. Remove some unnecessary
masks. Add cond_ pattern (untested, couldn't trigger it).
Clean up some control flow. Use narrower modes.
Avoid need for weakening predicate check in expand.
Use more general constraints and adjust test cases for more matches.
I didn't change the >> 7 case because it generated the same
instructions as the others.]

The GFNI AVX gf2p8affineqb instruction can be used to implement
vectorized byte shifts or rotates. This patch uses them to implement
shift and rotate patterns to allow the vectorizer to use them.
Previously AVX couldn't do rotates (except with XOP) and had to handle
8 bit shifts with a half throughput 16 bit shift.

This is only implemented for constant shifts. In theory it could
be used with a lookup table for variable shifts, but it's unclear
if it's worth it.

The vectorizer cost model could be improved, but seems to work for now.
It doesn't model the true latencies of the instructions. Also it doesn't
account for the memory loading of the mask, assuming that for a loop
it will be loaded outside the loop.

The instructions would also support more complex patterns
(e.g. arbitrary bit movement or inversions), so some of the tricks
applied to ternlog could be applied here too to collapse
more code. It's trickier because the input patterns
can be much longer since they can apply to every bit individually. I didn't
attempt any of this.

There's currently no test case for the masked/cond_ variants, they seem
to be difficult to trigger with the vectorizer. Suggestions for a test
case for them welcome.

gcc/ChangeLog:

        * config/i386/i386-expand.cc (ix86_vgf2p8affine_shift_matrix):
        New function to compute shift/rotate matrixes for gf2p8affine.
        * config/i386/i386-protos.h (ix86_vgf2p8affine_shift_matrix):
        Declare new function.
        * config/i386/i386.cc (ix86_shift_rotate_cost): Add cost model
        for shift/rotate implemented using gf2p8affine.
        * config/i386/sse.md (VI1_AVX512_3264): New mode iterator.
        (<insn><mode>3): Add GFNI case for shift patterns.
        (cond_<insn><mode>): New pattern.
        (<insn><mode>3<mask_name>): Ditto.
        (<insn>v16qi3): New rotate pattern to handle XOP V16QI case
        and GFNI.
        (rotl<mode>3, rotr<mode>3): Exclude V16QI case.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/shift-gf2p8affine-1.c: New test.
        * gcc.target/i386/shift-gf2p8affine-2.c: New test.
        * gcc.target/i386/shift-gf2p8affine-3.c: New test.
---
 gcc/config/i386/i386-expand.cc                | 103 +++++++++
 gcc/config/i386/i386-protos.h                 |   1 +
 gcc/config/i386/i386.cc                       |  14 ++
 gcc/config/i386/sse.md                        | 102 ++++++++-
 .../gcc.target/i386/shift-gf2p8affine-1.c     |  64 ++++++
 .../gcc.target/i386/shift-gf2p8affine-2.c     | 196 ++++++++++++++++++
 .../gcc.target/i386/shift-gf2p8affine-3.c     |  85 ++++++++
 7 files changed, 559 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 12cec611c7b5..ef6c12cd5697 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -27034,6 +27034,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx 
op1, rtx op2, int idx,
   return target;
 }
 
+/* GF2P8AFFINEQB matrices to implement shift and rotate.
+
+   Each table is indexed by the shift count; slot 0 is a placeholder
+   that ix86_vgf2p8affine_shift_matrix never reads because it asserts
+   that the count is in 1..7.  An entry is one 8x8 bit matrix packed
+   into 64 bits: for result bit i the instruction uses byte 7 - i
+   (byte 0 being the least significant) as the mask of source bits
+   that are XORed together.  */
+
+static const uint64_t matrix_ashift[8] =
+{ /* Left shift: the vacated low result bits select no source bit.  */
+  0,
+  0x0001020408102040, /* 1 l */
+  0x0000010204081020, /* 2 l */
+  0x0000000102040810, /* 3 l */
+  0x0000000001020408, /* 4 l */
+  0x0000000000010204, /* 5 l */
+  0x0000000000000102, /* 6 l */
+  0x0000000000000001  /* 7 l */
+};
+
+static const uint64_t matrix_lshiftrt[8] =
+{ /* Logical right shift: the vacated high result bits are zero.  */
+  0,
+  0x0204081020408000, /* 1 r */
+  0x0408102040800000, /* 2 r */
+  0x0810204080000000, /* 3 r */
+  0x1020408000000000, /* 4 r */
+  0x2040800000000000, /* 5 r */
+  0x4080000000000000, /* 6 r */
+  0x8000000000000000  /* 7 r */
+};
+
+static const uint64_t matrix_ashiftrt[8] =
+{ /* Arithmetic right shift: the 0x80 bytes copy source bit 7 into
+     every vacated high result bit.  */
+  0,
+  0x0204081020408080, /* 1 r */
+  0x0408102040808080, /* 2 r */
+  0x0810204080808080, /* 3 r */
+  0x1020408080808080, /* 4 r */
+  0x2040808080808080, /* 5 r */
+  0x4080808080808080, /* 6 r */
+  0x8080808080808080  /* 7 r */
+};
+
+static const uint64_t matrix_rotate[8] =
+{ /* Rotate left.  */
+  0,
+  0x8001020408102040, /* 1 rol8 */
+  0x4080010204081020, /* 2 rol8 */
+  0x2040800102040810, /* 3 rol8 */
+  0x1020408001020408, /* 4 rol8 */
+  0x0810204080010204, /* 5 rol8 */
+  0x0408102040800102, /* 6 rol8 */
+  0x0204081020408001  /* 7 rol8 */
+};
+
+static const uint64_t matrix_rotatert[8] =
+{ /* Rotate right; the same entries as matrix_rotate in reverse order.  */
+  0,
+  0x0204081020408001, /* 1 ror8 */
+  0x0408102040800102, /* 2 ror8 */
+  0x0810204080010204, /* 3 ror8 */
+  0x1020408001020408, /* 4 ror8 */
+  0x2040800102040810, /* 5 ror8 */
+  0x4080010204081020, /* 6 ror8 */
+  0x8001020408102040  /* 7 ror8 */
+};
+
+/* Return an rtx for a register, in the vector mode of SRC, holding the
+   64-bit GF2P8AFFINEQB GF(2) bit matrix that implements CODE (a shift
+   or rotate) of every byte by the constant COUNT.  The eight matrix
+   bytes are repeated across the whole vector.
+
+   COUNT is reduced modulo 8.  A rotate by a multiple of 8 leaves every
+   byte unchanged and gets the identity matrix.  The shift codes have
+   no matrix for a reduced count of zero, so that case still asserts.  */
+
+rtx
+ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
+{
+  /* Result bit i is taken from matrix byte 7 - i, so the identity has
+     0x01 in the most significant byte and 0x80 in the least.  */
+  const uint64_t matrix_identity = 0x0102040810204080;
+  machine_mode mode = GET_MODE (src);
+  const uint64_t *matrix;
+  bool is_rotate = false;
+  unsigned shift = INTVAL (count) & 7;
+
+  switch (code)
+    {
+    case ASHIFT:
+      matrix = matrix_ashift;
+      break;
+    case ASHIFTRT:
+      matrix = matrix_ashiftrt;
+      break;
+    case LSHIFTRT:
+      matrix = matrix_lshiftrt;
+      break;
+    case ROTATE:
+      matrix = matrix_rotate;
+      is_rotate = true;
+      break;
+    case ROTATERT:
+      matrix = matrix_rotatert;
+      is_rotate = true;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Slot 0 of the tables is a placeholder; only the rotates have a
+     meaningful result for a reduced count of zero.  */
+  gcc_assert (shift > 0 || is_rotate);
+
+  int nelts = GET_MODE_NUNITS (mode);
+  rtvec vec = rtvec_alloc (nelts);
+  uint64_t ma = shift ? matrix[shift] : matrix_identity;
+  /* Element i receives byte i % 8 of the matrix, least significant
+     byte first, so every group of 8 elements holds one full copy.  */
+  for (int i = 0; i < nelts; i++)
+    RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
+
+  return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
+}
+
 /* Trunc a vector to a narrow vector, like v4di -> v4si.  */
 
 void
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index ee6b78b2c77c..bdb8bb963b5d 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap);
 /* In i386-expand.cc.  */
 bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
                                   HOST_WIDE_INT*);
+rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 55c9b16dd388..b4b84b9fecba 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22102,6 +22102,15 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
            }
          /* FALLTHRU */
        case V32QImode:
+         if (TARGET_GFNI && constant_op1)
+           {
+             /* Use vgf2p8affine. One extra load for the mask, but in a loop
+                with enough registers it will be moved out. So for now don't
+                account the constant mask load. This is not quite right
+                for non loop vectorization.  */
+             extra = 0;
+             return ix86_vec_cost (mode, cost->sse_op) + extra;
+           }
          if (TARGET_AVX2)
            /* Use vpbroadcast.  */
            extra = cost->sse_op;
@@ -22136,6 +22145,11 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
            count = 9;
          return ix86_vec_cost (mode, cost->sse_op * count) + extra;
 
+       case V64QImode:
+         /* Ignore the mask load for GF2P8AFFINEQB.  */
+         extra = 0;
+         return ix86_vec_cost (mode, cost->sse_op) + extra;
+
        case V2DImode:
        case V4DImode:
          /* V*DImode arithmetic right shift is emulated.  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ec74f93731d8..4f3adbed295f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -326,6 +326,9 @@ (define_mode_iterator VI12HFBF_AVX512VL
 (define_mode_iterator VI1_AVX512VL
   [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
 
+;; The 64- and 32-element byte vector modes; V32QI only with AVX512VL.
+(define_mode_iterator VI1_AVX512_3264
+  [V64QI (V32QI "TARGET_AVX512VL")])
+
 ;; All vector modes
 (define_mode_iterator V
   [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
@@ -26559,9 +26562,9 @@ (define_insn "xop_pperm_pack_v8hi_v16qi"
 
 ;; XOP packed rotate instructions
 (define_expand "rotl<mode>3"
-  [(set (match_operand:VI_128 0 "register_operand")
-       (rotate:VI_128
-        (match_operand:VI_128 1 "nonimmediate_operand")
+  [(set (match_operand:VI248_128 0 "register_operand")
+       (rotate:VI248_128
+        (match_operand:VI248_128 1 "nonimmediate_operand")
         (match_operand:SI 2 "general_operand")))]
   "TARGET_XOP"
 {
@@ -26590,9 +26593,9 @@ (define_expand "rotl<mode>3"
 })
 
 (define_expand "rotr<mode>3"
-  [(set (match_operand:VI_128 0 "register_operand")
-       (rotatert:VI_128
-        (match_operand:VI_128 1 "nonimmediate_operand")
+  [(set (match_operand:VI248_128 0 "register_operand")
+       (rotatert:VI248_128
+        (match_operand:VI248_128 1 "nonimmediate_operand")
         (match_operand:SI 2 "general_operand")))]
   "TARGET_XOP"
 {
@@ -26984,11 +26987,98 @@ (define_expand "<insn><mode>3"
       gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
       emit_insn (gen (operands[0], operands[1], tmp));
     }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+    {
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+                                                  <CODE>);
+      emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+                                           GEN_INT (0)));
+    }
   else
     ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
 })
 
+;; Masked byte shift by a constant, done with a GF2P8AFFINEQB bit matrix.
+;; Operand 1 is the mask, operand 2 the value to shift, operand 3 the
+;; shift count as a vector and operand 4 the value merged in for the
+;; elements the mask leaves out.
+; not generated by vectorizer?
+(define_expand "cond_<insn><mode>"
+  [(set (match_operand:VI1_AVX512VL 0 "register_operand")
+       (vec_merge:VI1_AVX512VL
+         (any_shift:VI1_AVX512VL
+           (match_operand:VI1_AVX512VL 2 "register_operand")
+           (match_operand:VI1_AVX512VL 3 "nonimmediate_or_const_vec_dup_operand"))
+         (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
+         (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+  "TARGET_GFNI && TARGET_AVX512F"
+{
+  /* A matrix can only be built for a count that is the same integer
+     constant in every element; operand 3 may also be a register or
+     memory, which cannot be handled here.  */
+  rtx count;
+  if (!const_vec_duplicate_p (operands[3], &count) || !CONST_INT_P (count))
+    FAIL;
+
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>);
+  /* The value to shift is operand 2; operand 1 is only the mask.  */
+  emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix,
+             GEN_INT (0), operands[4], operands[1]));
+  DONE;
+})
+
+;; Byte rotate of the 256/512-bit modes by a constant, done with a
+;; GF2P8AFFINEQB bit matrix.
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
+       (any_rotate:VI1_AVX512_3264
+         (match_operand:VI1_AVX512_3264 1 "general_operand")
+         (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_GFNI && TARGET_AVX512F"
+{
+  /* general_operand also admits memory and constants, but the
+     instruction takes the value to transform in a register; only the
+     matrix may come from memory.  */
+  rtx src = force_reg (<MODE>mode, operands[1]);
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+                                               <CODE>);
+  emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], src, matrix,
+             GEN_INT (0)));
+  DONE;
+})
+
+;; Byte rotate of V16QI, either through XOP or with a GF2P8AFFINEQB
+;; bit matrix.  Without XOP only constant counts are supported.
+(define_expand "<insn>v16qi3"
+  [(set (match_operand:V16QI 0 "register_operand")
+     (any_rotate:V16QI
+       (match_operand:V16QI 1 "nonimmediate_operand")
+       (match_operand:SI 2 "general_operand")))]
+  "TARGET_GFNI || TARGET_XOP"
+{
+  /* Handle the V16QI XOP case to avoid a conflict with the other expand.  */
+  if (TARGET_XOP)
+    {
+      /* A constant count in 0..7 needs no preparation: control leaves
+         this block and the pattern above is emitted unchanged.  */
+      if (! const_0_to_7_operand (operands[2], SImode))
+        {
+          /* Any other count is broadcast into a vector of byte counts.  */
+          rtvec vs = rtvec_alloc (16);
+          rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+          rtx reg = gen_reg_rtx (V16QImode);
+          rtx op2 = operands[2];
+          int i;
+
+          if (GET_MODE (op2) != QImode)
+            {
+              op2 = gen_reg_rtx (QImode);
+              convert_move (op2, operands[2], false);
+            }
+
+          for (i = 0; i < 16; i++)
+            RTVEC_ELT (vs, i) = op2;
+
+          emit_insn (gen_vec_initv16qiqi (reg, par));
+          if (<CODE> == ROTATERT)
+            {
+              /* Rotate right as a left rotate by the negated counts.  */
+              rtx neg = gen_reg_rtx (V16QImode);
+              emit_insn (gen_negv16qi2 (neg, reg));
+              reg = neg;
+            }
+          /* Emit the rotate exactly once, for either direction.  */
+          emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg));
+          DONE;
+        }
+    }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+    {
+      /* Operand 1 may be memory, but the instruction takes the value
+         to transform in a register.  */
+      rtx src = force_reg (V16QImode, operands[1]);
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+                                                   <CODE>);
+      emit_insn (gen_vgf2p8affineqb_v16qi (operands[0], src, matrix,
+                 GEN_INT (0)));
+      DONE;
+    }
+  else
+    FAIL;
+})
+
 (define_expand "ashrv2di3"
   [(set (match_operand:V2DI 0 "register_operand")
        (ashiftrt:V2DI
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c 
b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c
new file mode 100644
index 000000000000..e5be3a35538a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 14 } } */
+
+#ifndef N
+#define N 5
+#endif
+
+void
+ubyteshiftl (unsigned char *a, int len)
+{ /* Shift each of the LEN unsigned bytes at A left by the constant N.  */
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] <<= N;
+}
+
+void
+ubyteshiftr (unsigned char *a, int len)
+{ /* Shift each of the LEN unsigned bytes at A right by the constant N.  */
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] >>= N;
+}
+
+void
+ubyteshiftl_mask (unsigned char *a, int len)
+{ /* Shift left by N only those bytes at A whose lowest bit is set.  */
+  int i;
+  for (i = 0; i < len; i++)
+    if (a[i] & 1)
+      a[i] <<= N;
+}
+
+void
+sbyteshiftl (signed char *a, int len)
+{ /* Shift each of the LEN signed bytes at A left by the constant N.  */
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] <<= N;
+}
+
+void
+sbyteshiftr (signed char *a, int len)
+{ /* Shift each of the LEN signed bytes at A right by the constant N.  */
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] >>= N;
+}
+
+void
+ubyteror (unsigned char *a, int len)
+{ /* NOTE: despite the name, the expression below rotates each byte
+     LEFT by N (high bits wrap around into the low end).  */
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] = a[i] << N | a[i] >> (8 - N);
+}
+
+void
+ubyterol (unsigned char *a, int len)
+{ /* NOTE: despite the name, the expression below rotates each byte
+     RIGHT by N (low bits wrap around into the high end).  */
+  int i;
+  for (i = 0; i < len; i++)
+    a[i] = a[i] >> N | a[i] << (8 - N);
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c 
b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c
new file mode 100644
index 000000000000..098361aa9ab3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c
@@ -0,0 +1,196 @@
+/* { dg-do run } */
+/* { dg-options "-mgfni -mavx512vl -mavx512bw -mavx512f -O3 
-Wno-shift-count-negative" } */
+
+#include <string.h>
+
+#ifndef N1
+#define N1 5
+#endif
+
+#ifndef N2
+#define N2 3
+#endif
+
+#ifndef N3
+#define N3 1
+#endif
+
+#ifndef N4
+#define N4 7
+#endif
+
+#ifndef N5
+#define N5 -3
+#endif
+
+#ifndef FILLER
+#define FILLER 0xab
+#endif
+
+#define FUNC(N) /* all loops for count N, plus a novector "ref" twin of each */ \
+  void ubyteshiftl##N(unsigned char *a, int len)               \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] <<= N;                                              \
+  }                                                            \
+                                                               \
+  void ubyteshiftr##N(unsigned char *a, int len)               \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] >>= N;                                              \
+  }                                                            \
+                                                               \
+  void ubyteshiftl_mask##N(unsigned char *a, int len)          \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      if (a[i] & 1)                                            \
+       a[i] <<= N;                                             \
+  }                                                            \
+                                                               \
+  void sbyteshiftl##N(signed char *a, int len)                 \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] <<= N;                                              \
+  }                                                            \
+                                                               \
+  void sbyteshiftr##N(signed char *a, int len)                 \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] >>= N;                                              \
+  }                                                            \
+                                                               \
+  void ubyteror##N(unsigned char *a, int len) /* rotates LEFT */ \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] = a[i] << N | a[i] >> (8-N);                                \
+  }                                                            \
+                                                               \
+  void ubyterol##N(unsigned char *a, int len) /* rotates RIGHT */ \
+  {                                                            \
+    int i;                                                     \
+    for (i = 0; i < len; i++)                                  \
+      a[i] = a[i] >> N | a[i] << (8-N);                                \
+  }                                                            \
+  void ubyteshiftl##N##ref(unsigned char *a, int len)          \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] <<= N;                                             \
+  }                                                            \
+                                                               \
+  void ubyteshiftr##N##ref(unsigned char *a, int len)          \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] >>= N;                                             \
+  }                                                            \
+                                                               \
+  void ubyteshiftl_mask##N##ref(unsigned char *a, int len)     \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       if (a[i] & 1)                                           \
+         a[i] <<= N;                                           \
+  }                                                            \
+                                                               \
+  void sbyteshiftl##N##ref(signed char *a, int len)            \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] <<= N;                                             \
+  }                                                            \
+                                                               \
+  void sbyteshiftr##N##ref(signed char *a, int len)            \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] >>= N;                                             \
+  }                                                            \
+                                                               \
+  void ubyteror##N##ref(unsigned char *a, int len)             \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] = a[i] << N | a[i] >> (8-N);                       \
+  }                                                            \
+                                                               \
+  void ubyterol##N##ref(unsigned char *a, int len)             \
+  {                                                            \
+    int i;                                                     \
+    _Pragma("GCC novector")                                    \
+      for (i = 0; i < len; i++)                                        \
+       a[i] = a[i] >> N | a[i] << (8-N);                       \
+  }
+
+FUNC (N1)
+FUNC (N2)
+FUNC (N3)
+FUNC (N4)
+FUNC (N5)
+
+/* Run func##N and its novector reference func##N##ref on identically
+   filled buffers and abort if the results differ.  Uses the array,
+   array2, filler and len of the invoking scope.  Wrapped in do/while
+   so that the expansion is a single statement.  */
+#define TEST(N, func)                                  \
+  do                                                   \
+    {                                                  \
+      memset (array, filler, len);                     \
+      func##N (array, len);                            \
+      memset (array2, filler, len);                    \
+      func##N##ref (array2, len);                      \
+      if (memcmp (array, array2, len))                 \
+       __builtin_abort ();                             \
+    }                                                  \
+  while (0)
+
+/* Compare every vectorizable loop against its novector reference.
+   The file is built with -mgfni and the AVX-512 F/BW/VL options, so
+   the comparison is skipped unless the CPU supports all of them;
+   checking GFNI alone would fault on CPUs that have GFNI but no
+   AVX-512.  */
+int main ()
+{
+  __builtin_cpu_init ();
+  if (!__builtin_cpu_supports ("gfni")
+      || !__builtin_cpu_supports ("avx512f")
+      || !__builtin_cpu_supports ("avx512bw")
+      || !__builtin_cpu_supports ("avx512vl"))
+    return 0;
+
+  const unsigned long len = 256;
+  char array[len], array2[len];
+  unsigned char filler = FILLER;
+
+  /* NOTE(review): ubyteshiftr is defined by FUNC but never run here,
+     so the logical right shift has no run-time coverage -- confirm
+     whether that is intended.  */
+  TEST (N1, ubyteshiftl);
+  TEST (N1, ubyteshiftl_mask);
+  TEST (N1, sbyteshiftl);
+  TEST (N1, sbyteshiftr);
+  TEST (N1, ubyteror);
+  TEST (N1, ubyterol);
+
+  TEST (N2, ubyteshiftl);
+  TEST (N2, ubyteshiftl_mask);
+  TEST (N2, sbyteshiftl);
+  TEST (N2, sbyteshiftr);
+  TEST (N2, ubyteror);
+  TEST (N2, ubyterol);
+
+  TEST (N3, ubyteshiftl);
+  TEST (N3, ubyteshiftl_mask);
+  TEST (N3, sbyteshiftl);
+  TEST (N3, sbyteshiftr);
+  TEST (N3, ubyteror);
+  TEST (N3, ubyterol);
+
+  TEST (N4, ubyteshiftl);
+  TEST (N4, ubyteshiftl_mask);
+  TEST (N4, sbyteshiftl);
+  TEST (N4, sbyteshiftr);
+  TEST (N4, ubyteror);
+  TEST (N4, ubyterol);
+
+  TEST (N5, ubyteshiftl);
+  TEST (N5, ubyteshiftl_mask);
+  TEST (N5, sbyteshiftl);
+  TEST (N5, sbyteshiftr);
+  TEST (N5, ubyteror);
+  TEST (N5, ubyterol);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c 
b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c
new file mode 100644
index 000000000000..6d882ee664ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-mgfni -mavx512bw -mavx512f -O3" } */
+/* { dg-final { scan-assembler-times "vgf2p8affineqb" 14 } } */
+
+/* Based on a test case from Andrew Pinski */
+
+#ifndef N
+#define N 5
+#endif
+
+void
+ubyteshiftl (unsigned char *restrict a, unsigned char *restrict b, unsigned 
char *restrict c, int len)
+{ /* Two conditional left shifts by N per element: (a | b) where c[i]
+     is nonzero, then (a ^ b) where c[i] is zero, the second reading
+     the a[i] left by the first.  */
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) << N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i];
+    }
+}
+
+void
+ubyteshiftr (unsigned char *restrict a, unsigned char *restrict b, unsigned 
char *restrict c, int len)
+{ /* Two conditional right shifts by N per element: (a | b) where c[i]
+     is nonzero, then (a ^ b) where c[i] is zero, the second reading
+     the a[i] left by the first.  */
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) >> N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i];
+    }
+}
+
+void
+sbyteshiftl (signed char *restrict a, signed char *restrict b, signed char 
*restrict c, int len)
+{ /* Signed-byte variant of the two conditional left shifts by N,
+     selected by whether c[i] is nonzero.  */
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) << N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i];
+    }
+}
+
+void
+sbyteshiftr (signed char *restrict a, signed char *restrict b, signed char 
*restrict c, int len)
+{ /* Signed-byte variant of the two conditional right shifts by N,
+     selected by whether c[i] is nonzero.  */
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? (a[i] | b[i]) >> N : a[i];
+      a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i];
+    }
+}
+
+static inline unsigned char rol8(unsigned char v, int c)
+{ /* NOTE: despite the name, this rotates V RIGHT by C bits.  */
+  return (v >> c) | (v << (8-c));
+}
+
+static inline unsigned char ror8(unsigned char v, int c)
+{ /* NOTE: despite the name, this rotates V LEFT by C bits.  */
+  return (v << c) | (v >> (8-c));
+}
+
+void
+ubyterol (unsigned char *restrict a, unsigned char *restrict b, unsigned char 
*restrict c, int len)
+{ /* Two conditional rol8 calls with count N per element: on (a | b)
+     where c[i] is nonzero, then on (a ^ b) where c[i] is zero.  */
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? rol8(a[i] | b[i], N) : a[i];
+      a[i] = (!c[i]) ? rol8(a[i] ^ b[i], N) : a[i];
+    }
+}
+
+void
+ubyteror (unsigned char *restrict a, unsigned char *restrict b, unsigned char 
*restrict c, int len)
+{ /* Two conditional ror8 calls with count N per element: on (a | b)
+     where c[i] is nonzero, then on (a ^ b) where c[i] is zero.  */
+  int i;
+  for (i = 0; i < len; i++)
+    {
+      a[i] = c[i] ? ror8(a[i] | b[i], N) : a[i];
+      a[i] = (!c[i]) ? ror8(a[i] ^ b[i], N) : a[i];
+    }
+}
-- 
2.49.0

Reply via email to