On Wed, Aug 13, 2025 at 2:30 PM Hongtao Liu <crazy...@gmail.com> wrote: > > On Tue, Aug 5, 2025 at 8:49 AM Andi Kleen <a...@linux.intel.com> wrote: > > > > From: Andi Kleen <a...@gcc.gnu.org> > > > > The GFNI AVX gf2p8affineqb instruction can be used to implement > > vectorized byte shifts or rotates. This patch uses them to implement > > shift and rotate patterns to allow the vectorizer to use them. > > Previously AVX couldn't do rotates (except with XOP) and had to handle > > 8 bit shifts with a half throughput 16 bit shift. > > > > This is only implemented for constant shifts. In theory it could > > be used with a lookup table for variable shifts, but it's unclear > > if it's worth it. > > > > The vectorizer cost model could be improved, but seems to work for now. > > It doesn't model the true latencies of the instructions. Also it doesn't > > accout for the memory loading of the mask, assuming that for a loop > > it will be loaded outside the loop. > > > > The instructions would also support more complex patterns > > (e.g. arbitary bit movement or inversions), so some of the tricks > > applied to ternlog could be applied here too to collapse > > more code. It's trickier because the input patterns > > can be much longer since they can apply to every bit individually. I didn't > > attempt any of this. > > > > There's currently no test case for the masked variants, they seem > > to be difficult to trigger with the vectorizer. > > But two registers. Shorter dependency chains are usually better. > Also this case is faster in my micros using GFNI, although not by much > (1-4%) > > Could you provide your microbench, i'm still not very convinced for that. I do see there's improvement with my microbenchmark. Please just exclude >>7 from you patch. > > + Other review comments below. > > > > > gcc/ChangeLog: > > > > * config/i386/i386-expand.cc (ix86_vgf2p8affine_shift_matrix): > > New function to compute shift/rotate matrixes for gf2p8affine. > > (ix86_expand_multi_arg_builtin): Weaken predicate assert > > check to handle new more general pattern. > > * config/i386/i386-protos.h (ix86_vgf2p8affine_shift_matrix): > > Declare new function. > > * config/i386/i386.cc (ix86_shift_rotate_cost): Add cost model > > for shift/rotate implemented using gf2p8affine. > > * config/i386/sse.md (<insn><mode>3_mask): Add GFNI case. > > (<insn><mode>3<mask_name>): New pattern for GFNI. Handle V16QI > > case for XOP rotate. > > (rotl<mode>3, rotr<mode>3): Exclude V16QI case. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/shift-gf2p8affine-1.c: New test > > * gcc.target/i386/shift-gf2p8affine-2.c: New test > > * gcc.target/i386/shift-gf2p8affine-3.c: New test > > --- > > gcc/config/i386/i386-expand.cc | 113 +++++++++- > > gcc/config/i386/i386-protos.h | 1 + > > gcc/config/i386/i386.cc | 14 ++ > > gcc/config/i386/sse.md | 79 ++++++- > > .../gcc.target/i386/shift-gf2p8affine-1.c | 64 ++++++ > > .../gcc.target/i386/shift-gf2p8affine-2.c | 196 ++++++++++++++++++ > > .../gcc.target/i386/shift-gf2p8affine-3.c | 87 ++++++++ > > 7 files changed, 543 insertions(+), 11 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c > > > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > > index 09aa9b1461cc..ccc058228c74 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -10784,15 +10784,15 @@ ix86_expand_multi_arg_builtin (enum insn_code > > icode, tree exp, rtx target, > > } > > else > > { > > + /* The predicates may not match for the xop_rotlv16qi3 > > + case because the new shared pattern uses a > > + nonimmediate vs a register operand. Since it's a > > super > > + set ignore the mismatch. */ > > gcc_checking_assert > > (nargs == 2 > > && insn_data[new_icode].operand[0].mode == tmode > > && insn_data[new_icode].operand[1].mode == tmode > > - && insn_data[new_icode].operand[2].mode == mode > > - && insn_data[new_icode].operand[0].predicate > > - == insn_data[icode].operand[0].predicate > > - && insn_data[new_icode].operand[1].predicate > > - == insn_data[icode].operand[1].predicate); > > + && insn_data[new_icode].operand[2].mode == mode) > > icode = new_icode; > > goto non_constant; > > } > > @@ -27033,6 +27033,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, > > rtx op1, rtx op2, int idx, > > return target; > > } > > > > +/* GF2P8AFFINEQB matrixes to implement shift and rotate. */ > > + > > +static const uint64_t matrix_ashift[8] = > > +{ > > + 0, > > + 0x0001020408102040, /* 1 l */ > > + 0x0000010204081020, /* 2 l */ > > + 0x0000000102040810, /* 3 l */ > > + 0x0000000001020408, /* 4 l */ > > + 0x0000000000010204, /* 5 l */ > > + 0x0000000000000102, /* 6 l */ > > + 0x0000000000000001 /* 7 l */ > > +}; > > + > > +static const uint64_t matrix_lshiftrt[8] = > > +{ > > + 0, > > + 0x0204081020408000, /* 1 r */ > > + 0x0408102040800000, /* 2 r */ > > + 0x0810204080000000, /* 3 r */ > > + 0x1020408000000000, /* 4 r */ > > + 0x2040800000000000, /* 5 r */ > > + 0x4080000000000000, /* 6 r */ > > + 0x8000000000000000 /* 7 r */ > > +}; > > + > > +static const uint64_t matrix_ashiftrt[8] = > > +{ > > + 0, > > + 0x0204081020408080, /* 1 r */ > > + 0x0408102040808080, /* 2 r */ > > + 0x0810204080808080, /* 3 r */ > > + 0x1020408080808080, /* 4 r */ > > + 0x2040808080808080, /* 5 r */ > > + 0x4080808080808080, /* 6 r */ > > + 0x8080808080808080 /* 7 r */ > > +}; > > + > > +static const uint64_t matrix_rotate[8] = > > +{ > > + 0, > > + 0x8001020408102040, /* 1 rol8 */ > > + 0x4080010204081020, /* 2 rol8 */ > > + 0x2040800102040810, /* 3 rol8 */ > > + 0x1020408001020408, /* 4 rol8 */ > > + 0x0810204080010204, /* 5 rol8 */ > > + 0x0408102040800102, /* 6 rol8 */ > > + 0x0204081020408001 /* 7 rol8 */ > > +}; > > + > > +static const uint64_t matrix_rotatert[8] = > > +{ > > + 0, > > + 0x0204081020408001, /* 1 ror8 */ > > + 0x0408102040800102, /* 2 ror8 */ > > + 0x0810204080010204, /* 3 ror8 */ > > + 0x1020408001020408, /* 4 ror8 */ > > + 0x2040800102040810, /* 5 ror8 */ > > + 0x4080010204081020, /* 6 ror8 */ > > + 0x8001020408102040 /* 7 ror8 */ > > +}; > > + > > +/* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift > > + for CODE and shift count COUNT into register with vector of size of > > SRC. */ > > + > > +rtx > > +ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code) > > +{ > > + machine_mode mode = GET_MODE (src); > > + const uint64_t *matrix; > > + unsigned shift = INTVAL (count) & 7; > > + gcc_assert (shift > 0 && shift < 8); > > + > > + switch (code) > > + { > > + case ASHIFT: > > + matrix = matrix_ashift; > > + break; > > + case ASHIFTRT: > > + matrix = matrix_ashiftrt; > > + break; > > + case LSHIFTRT: > > + matrix = matrix_lshiftrt; > > + break; > > + case ROTATE: > > + matrix = matrix_rotate; > > + break; > > + case ROTATERT: > > + matrix = matrix_rotatert; > > + break; > > + default: > > + gcc_unreachable (); > > + } > > + > > + int nelts = GET_MODE_NUNITS (mode); > > + rtvec vec = rtvec_alloc (nelts); > > + uint64_t ma = matrix[shift]; > > + for (int i = 0; i < nelts; i++) > > + RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, > > QImode); > > + > > + return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec)); > > +} > > + > > /* Trunc a vector to a narrow vector, like v4di -> v4si. */ > > > > void > > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > > index 69bc0ee570dd..279f6ffdf26a 100644 > > --- a/gcc/config/i386/i386-protos.h > > +++ b/gcc/config/i386/i386-protos.h > > @@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap); > > /* In i386-expand.cc. */ > > bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, > > HOST_WIDE_INT*); > > +rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code); > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > > index 65e04d3760d5..38a160709bd4 100644 > > --- a/gcc/config/i386/i386.cc > > +++ b/gcc/config/i386/i386.cc > > @@ -22102,6 +22102,15 @@ ix86_shift_rotate_cost (const struct > > processor_costs *cost, > > } > > /* FALLTHRU */ > > case V32QImode: > > + if (TARGET_GFNI && constant_op1) > > + { > > + /* Use vgf2p8affine. One extra load for the mask, but in a > > loop > > + with enough registers it will be moved out. So for now > > don't > > + account the constant mask load. This is not quite right > > + for non loop vectorization. */ > > + extra = 0; > > + return ix86_vec_cost (mode, cost->sse_op) + extra; > > + } > > if (TARGET_AVX2) > > /* Use vpbroadcast. */ > > extra = cost->sse_op; > > @@ -22136,6 +22145,11 @@ ix86_shift_rotate_cost (const struct > > processor_costs *cost, > > count = 9; > > return ix86_vec_cost (mode, cost->sse_op * count) + extra; > > > > + case V64QImode: > > + /* Ignore the mask load for GF2P8AFFINEQB. */ > > + extra = 0; > > + return ix86_vec_cost (mode, cost->sse_op) + extra; > > + > > case V2DImode: > > case V4DImode: > > /* V*DImode arithmetic right shift is emulated. */ > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > > index ec74f93731d8..01b8d91c5d55 100644 > > --- a/gcc/config/i386/sse.md > > +++ b/gcc/config/i386/sse.md > > @@ -26559,9 +26559,9 @@ (define_insn "xop_pperm_pack_v8hi_v16qi" > > > > ;; XOP packed rotate instructions > > (define_expand "rotl<mode>3" > > - [(set (match_operand:VI_128 0 "register_operand") > > - (rotate:VI_128 > > - (match_operand:VI_128 1 "nonimmediate_operand") > > + [(set (match_operand:VI248_128 0 "register_operand") > > + (rotate:VI248_128 > > + (match_operand:VI248_128 1 "nonimmediate_operand") > > (match_operand:SI 2 "general_operand")))] > > "TARGET_XOP" > > { > > @@ -26590,9 +26590,9 @@ (define_expand "rotl<mode>3" > > }) > > > > (define_expand "rotr<mode>3" > > - [(set (match_operand:VI_128 0 "register_operand") > > - (rotatert:VI_128 > > - (match_operand:VI_128 1 "nonimmediate_operand") > > + [(set (match_operand:VI248_128 0 "register_operand") > > + (rotatert:VI248_128 > > + (match_operand:VI248_128 1 "nonimmediate_operand") > > (match_operand:SI 2 "general_operand")))] > > "TARGET_XOP" > > { > > @@ -26984,11 +26984,78 @@ (define_expand "<insn><mode>3" > > gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3); > > emit_insn (gen (operands[0], operands[1], tmp)); > > } > > + else if (TARGET_GFNI && CONST_INT_P (operands[2])) > > + { > > + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], > > operands[2], > > + <CODE>); > > + emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], > > matrix, > > + GEN_INT (0))); > > + } > > else > > ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]); > > DONE; > > }) > > > > +(define_expand "<insn><mode>3_mask" > I guess you want to support standard pattern names of cond_<insn><mode>? > > > + [(set (match_operand:VI1_AVX512 0 "register_operand") > > + (any_shift:VI1_AVX512 > And since mask operation is only supported in avx512, it should be > VI1_AVX512VL > > + (match_operand:VI1_AVX512 1 "register_operand") > > + (match_operand:SI 2 "const_int_operand"))) > > + (match_operand:VI1_AVX512 3 "nonimm_or_0_operand") > > + (match_operand:<avx512fmaskmode> 4 "register_operand")] > > + "TARGET_GFNI" > and "TARGET_GFNI && TARGET_AVX512F". > > +{ > > + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], > > <CODE>); > > + emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[1], > > matrix, > > + GEN_INT (0), operands[3], operands[4])); > > + DONE; > > +}) > > + > > +(define_expand "<insn><mode>3<mask_name>" > No need for <mask_name> here. > > > + [(set (match_operand:VI1_AVX512 0 "register_operand") > > + (any_rotate:VI1_AVX512 > > + (match_operand:VI1_AVX512 1 "register_operand") > > + (match_operand:SI 2 "const_int_operand")))] > > + "" > it better to be "TARGET_GFNI || (TARGET_XOP && <MODE_SIZE> == 16)"? > Maybe we should split the pattern for the V16QI part and V32QI/V64QI part. > For the former we use general operand for operands[2], so you don't > need to adjust the code in ix86_expand_multi_arg_builtin > And for the later just expand it with gfni instructions. > > +{ > > + /* Handle the V16QI XOP case to avoid a conflict with the other expand. > > */ > > + if (TARGET_XOP && ARRAY_SIZE (operands) <= 3 && <MODE>mode == V16QImode > > + && ! const_0_to_7_operand (operands[2], SImode)) > Just > if (TARGET_XOP && <MODE_SIZE> == 16) > { > if (!const_0_to_7_operand (operands[2], SImode) > { > .... > DONE; > } > } > else > { > rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], > operands[2], <CODE>); > emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix, > GEN_INT (0) )); > DONE; > } > > So we still keep the behavior for TARGET_XOP and V16QImode, and just > do additional optimization for GFNI? > > > > + { > > + rtvec vs = rtvec_alloc (16); > > + rtx par = gen_rtx_PARALLEL (V16QImode, vs); > > + rtx reg = gen_reg_rtx (V16QImode); > > + rtx op2 = operands[2]; > > + int i; > > + > > + if (GET_MODE (op2) != QImode) > > + { > > + op2 = gen_reg_rtx (QImode); > > + convert_move (op2, operands[2], false); > > + } > > + > > + for (i = 0; i < 16; i++) > > + RTVEC_ELT (vs, i) = op2; > > + > > + emit_insn (gen_vec_initv16qiqi (reg, par)); > > + if (<CODE> == ROTATERT) > > + { > > + rtx neg = gen_reg_rtx (<MODE>mode); > > + emit_insn (gen_neg<mode>2 (neg, reg)); > > + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], neg)); > > + reg = neg; > > + } > > + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg)); > > + DONE; > > + } > > + else if (!TARGET_GFNI) > > + FAIL; > > After it's added to the condition, it's not needed here. > > > > + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], > > <CODE>); > > + emit_insn (gen_vgf2p8affineqb_<mode><mask_name> (operands[0], > > operands[1], matrix, > > + GEN_INT (0) <mask_operand_arg34>)); > > + DONE; > > +}) > > + > > (define_expand "ashrv2di3" > > [(set (match_operand:V2DI 0 "register_operand") > > (ashiftrt:V2DI > > diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c > > b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c > > new file mode 100644 > > index 000000000000..39a94eb70b22 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-1.c > > @@ -0,0 +1,64 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-mgfni -mavx512bw -mavx512f -O3" } */ > > +/* { dg-final { scan-assembler-times "vgf2p8affineqb" 13 } } */ > > + > > +#ifndef N > > +#define N 5 > > +#endif > > + > > +void > > +ubyteshiftl (unsigned char *a, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + a[i] <<= N; > > +} > > + > > +void > > +ubyteshiftr (unsigned char *a, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + a[i] >>= N; > > +} > > + > > +void > > +ubyteshiftl_mask (unsigned char *a, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + if (a[i] & 1) > > + a[i] <<= N; > > +} > > + > > +void > > +sbyteshiftl (signed char *a, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + a[i] <<= N; > > +} > > + > > +void > > +sbyteshiftr (signed char *a, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + a[i] >>= N; > > +} > > + > > +void > > +ubyteror (unsigned char *a, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + a[i] = a[i] << N | a[i] >> (8 - N); > > +} > > + > > +void > > +ubyterol (unsigned char *a, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + a[i] = a[i] >> N | a[i] << (8 - N); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c > > b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c > > new file mode 100644 > > index 000000000000..1027d69e2f22 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-2.c > > @@ -0,0 +1,196 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-mgfni -mavx512bw -mavx512f -O3 > > -Wno-shift-count-negative" } */ > > + > > +#include <string.h> > > + > > +#ifndef N1 > > +#define N1 5 > > +#endif > > + > > +#ifndef N2 > > +#define N2 3 > > +#endif > > + > > +#ifndef N3 > > +#define N3 1 > > +#endif > > + > > +#ifndef N4 > > +#define N4 7 > > +#endif > > + > > +#ifndef N5 > > +#define N5 -3 > > +#endif > > + > > +#ifndef FILLER > > +#define FILLER 0xab > > +#endif > > + > > +#define FUNC(N) \ > > + void ubyteshiftl##N(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + for (i = 0; i < len; i++) \ > > + a[i] <<= N; \ > > + } \ > > + \ > > + void ubyteshiftr##N(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + for (i = 0; i < len; i++) \ > > + a[i] >>= N; \ > > + } \ > > + \ > > + void ubyteshiftl_mask##N(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + for (i = 0; i < len; i++) \ > > + if (a[i] & 1) \ > > + a[i] <<= N; \ > > + } \ > > + \ > > + void sbyteshiftl##N(signed char *a, int len) \ > > + { \ > > + int i; \ > > + for (i = 0; i < len; i++) \ > > + a[i] <<= N; \ > > + } \ > > + \ > > + void sbyteshiftr##N(signed char *a, int len) \ > > + { \ > > + int i; \ > > + for (i = 0; i < len; i++) \ > > + a[i] >>= N; \ > > + } \ > > + \ > > + void ubyteror##N(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + for (i = 0; i < len; i++) \ > > + a[i] = a[i] << N | a[i] >> (8-N); \ > > + } \ > > + \ > > + void ubyterol##N(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + for (i = 0; i < len; i++) \ > > + a[i] = a[i] >> N | a[i] << (8-N); \ > > + } \ > > + void ubyteshiftl##N##ref(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + _Pragma("GCC novector") \ > > + for (i = 0; i < len; i++) \ > > + a[i] <<= N; \ > > + } \ > > + \ > > + void ubyteshiftr##N##ref(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + _Pragma("GCC novector") \ > > + for (i = 0; i < len; i++) \ > > + a[i] >>= N; \ > > + } \ > > + \ > > + void ubyteshiftl_mask##N##ref(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + _Pragma("GCC novector") \ > > + for (i = 0; i < len; i++) \ > > + if (a[i] & 1) \ > > + a[i] <<= N; \ > > + } \ > > + \ > > + void sbyteshiftl##N##ref(signed char *a, int len) \ > > + { \ > > + int i; \ > > + _Pragma("GCC novector") \ > > + for (i = 0; i < len; i++) \ > > + a[i] <<= N; \ > > + } \ > > + \ > > + void sbyteshiftr##N##ref(signed char *a, int len) \ > > + { \ > > + int i; \ > > + _Pragma("GCC novector") \ > > + for (i = 0; i < len; i++) \ > > + a[i] >>= N; \ > > + } \ > > + \ > > + void ubyteror##N##ref(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + _Pragma("GCC novector") \ > > + for (i = 0; i < len; i++) \ > > + a[i] = a[i] << N | a[i] >> (8-N); \ > > + } \ > > + \ > > + void ubyterol##N##ref(unsigned char *a, int len) \ > > + { \ > > + int i; \ > > + _Pragma("GCC novector") \ > > + for (i = 0; i < len; i++) \ > > + a[i] = a[i] >> N | a[i] << (8-N); \ > > + } > > + > > +FUNC (N1) > > +FUNC (N2) > > +FUNC (N3) > > +FUNC (N4) > > +FUNC (N5) > > + > > +#define TEST(N, func) \ > > + memset (array, filler, len); \ > > + func##N (array, len); \ > > + memset (array2, filler, len); \ > > + func##N##ref (array2, len); \ > > + if (memcmp (array, array2, len)) __builtin_abort () > > + > > +int main () > > +{ > > + __builtin_cpu_init (); > > + if (!__builtin_cpu_supports ("gfni")) > > + return 0; > > + > > + const unsigned long len = 256; > > + char array[len], array2[len]; > > + unsigned char filler = FILLER; > > + > > + TEST (N1, ubyteshiftl); > > + TEST (N1, ubyteshiftl_mask); > > + TEST (N1, sbyteshiftl); > > + TEST (N1, sbyteshiftr); > > + TEST (N1, ubyteror); > > + TEST (N1, ubyterol); > > + > > + TEST (N2, ubyteshiftl); > > + TEST (N2, ubyteshiftl_mask); > > + TEST (N2, sbyteshiftl); > > + TEST (N2, sbyteshiftr); > > + TEST (N2, ubyteror); > > + TEST (N2, ubyterol); > > + > > + TEST (N3, ubyteshiftl); > > + TEST (N3, ubyteshiftl_mask); > > + TEST (N3, sbyteshiftl); > > + TEST (N3, sbyteshiftr); > > + TEST (N3, ubyteror); > > + TEST (N3, ubyterol); > > + > > + TEST (N4, ubyteshiftl); > > + TEST (N4, ubyteshiftl_mask); > > + TEST (N4, sbyteshiftl); > > + TEST (N4, sbyteshiftr); > > + TEST (N4, ubyteror); > > + TEST (N4, ubyterol); > > + > > + TEST (N5, ubyteshiftl); > > + TEST (N5, ubyteshiftl_mask); > > + TEST (N5, sbyteshiftl); > > + TEST (N5, sbyteshiftr); > > + TEST (N5, ubyteror); > > + TEST (N5, ubyterol); > > + > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c > > b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c > > new file mode 100644 > > index 000000000000..856893d9a5fe > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/shift-gf2p8affine-3.c > > @@ -0,0 +1,87 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-mgfni -mavx512bw -mavx512f -O3" } */ > > +/* { dg-final { scan-assembler-times "vgf2p8affineqb" 12 } } */ > > + > > +/* Based on a test case from Andrew Pinski */ > > + > > +#ifndef N > > +#define N 5 > > +#endif > > + > > +void > > +ubyteshiftl (unsigned char *restrict a, unsigned char *restrict b, > > unsigned char *restrict c, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + { > > + a[i] = c[i] ? (a[i] | b[i]) << N : a[i]; > > + a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i]; > > + } > > +} > > + > > +void > > +ubyteshiftr (unsigned char *restrict a, unsigned char *restrict b, > > unsigned char *restrict c, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + { > > + a[i] = c[i] ? (a[i] | b[i]) >> N : a[i]; > > + a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i]; > > + } > > +} > > + > > +void > > +sbyteshiftl (signed char *restrict a, signed char *restrict b, signed char > > *restrict c, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + { > > + a[i] = c[i] ? (a[i] | b[i]) << N : a[i]; > > + a[i] = (!c[i]) ? (a[i] ^ b[i]) << N : a[i]; > > + } > > +} > > + > > +void > > +sbyteshiftr (signed char *restrict a, signed char *restrict b, signed char > > *restrict c, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + { > > + a[i] = c[i] ? (a[i] | b[i]) >> N : a[i]; > > + a[i] = (!c[i]) ? (a[i] ^ b[i]) >> N : a[i]; > > + } > > +} > > + > > +static inline unsigned char rol8(unsigned char v, int c) > > +{ > > + return (v >> c) | (v << (8-c)); > > +} > > + > > +static inline unsigned char ror8(unsigned char v, int c) > > +{ > > + return (v << c) | (v >> (8-c)); > > +} > > + > > +void > > +ubyterol (unsigned char *restrict a, unsigned char *restrict b, unsigned > > char *restrict c, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + { > > + a[i] = c[i] ? rol8(a[i] | b[i], N) : a[i]; > > + a[i] = (!c[i]) ? rol8(a[i] ^ b[i], N) : a[i]; > > + } > > +} > > + > > +void > > +ubyteror (unsigned char *restrict a, unsigned char *restrict b, unsigned > > char *restrict c, int len) > > +{ > > + int i; > > + for (i = 0; i < len; i++) > > + { > > + a[i] = c[i] ? ror8(a[i] | b[i], N) : a[i]; > > + a[i] = (!c[i]) ? ror8(a[i] ^ b[i], N) : a[i]; > > + } > > +} > > + > > + > > -- > > 2.49.0 > > > > > -- > BR, > Hongtao
-- BR, Hongtao