Hello,

This patch extends vector permutation expansion to the AVX-512* instruction sets. Comments are welcome!

The patch was bootstrapped, and all AVX-512* tests on top of the patch set pass under the simulator.
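The one non-obvious part is V64QImode: AVX-512BW has no byte version of vpermi2, so the patch emulates a variable byte permutation with two vpermi2w operations plus a byte blend. In intrinsics terms, the emitted RTL sequence corresponds roughly to the sketch below (illustrative only, not part of the patch; the function name is made up, and it assumes -mavx512bw):

#include <immintrin.h>

/* Rough intrinsics model of the RTL emitted for V64QImode in
   ix86_expand_vec_perm_vpermi2 (illustrative only).  */
static __m512i
v64qi_perm_sketch (__m512i op0, __m512i op1, __m512i sel)
{
  /* Word index of the byte wanted at each even result byte:
     the low selector byte of each word, divided by two.  */
  __m512i perm_lo = _mm512_srai_epi16 (_mm512_slli_epi16 (sel, 8), 9);
  /* Likewise for each odd result byte (high selector byte).  */
  __m512i perm_hi = _mm512_srai_epi16 (sel, 9);
  __m512i res_lo = _mm512_permutex2var_epi16 (op0, perm_lo, op1);
  __m512i res_hi
    = _mm512_slli_epi16 (_mm512_permutex2var_epi16 (op0, perm_hi, op1), 8);
  /* Even bytes from res_lo, odd bytes from res_hi.  */
  return _mm512_mask_blend_epi8 (0xAAAAAAAAAAAAAAAAULL, res_lo, res_hi);
}

This matches the gen_ashlv32hi3/gen_ashrv32hi3/gen_avx512bw_vpermi2varv32hi3/gen_avx512bw_blendmv64qi sequence in the V64QImode case of ix86_expand_vec_perm_vpermi2 below.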
Is it ok for trunk?

gcc/
        * config/i386/i386.c (ix86_expand_vec_perm_vpermi2): Handle
        V64QImode, V8HImode, V16HImode, V32HImode, V4SImode, V8SImode,
        V4SFmode, V8SFmode, V2DImode, V4DImode, V2DFmode, V4DFmode.
        (ix86_expand_sse_unpack): Handle V64QImode.
        (expand_vec_perm_blend): Update target conditions; handle
        V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode.
        (expand_vec_perm_pshufb): Handle V64QImode.
        (expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode,
        V16SFmode, V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode,
        V4SFmode.
        (ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2.
        (ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode.
        (ix86_expand_vecop_qihi): Handle V64QImode.
        * config/i386/sse.md (define_mode_iterator VI1_AVX2): Add V64QI
        mode.
        (define_mode_iterator VEC_PERM_AVX2): Add V32HI mode.
        (define_mode_iterator VEC_PERM_CONST): Add V64QI and V32HI modes.
        (define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking.

--
Thanks, K

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..d759a45 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
   enum machine_mode mode = GET_MODE (op0);
   switch (mode)
     {
+      /* There is no byte version of vpermi2.  So we use vpermi2w.  */
+    case V64QImode:
+      if (!TARGET_AVX512BW)
+	return false;
+      rtx mask_lowpart, op0_lowpart, op1_lowpart;
+      rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
+
+      mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
+      op0_lowpart = gen_lowpart (V32HImode, op0);
+      op1_lowpart = gen_lowpart (V32HImode, op1);
+      tmp = gen_reg_rtx (V32HImode);
+      tmp2 = gen_reg_rtx (V32HImode);
+      perm_lo = gen_reg_rtx (V32HImode);
+      perm_hi = gen_reg_rtx (V32HImode);
+      res_lo = gen_reg_rtx (V32HImode);
+      res_hi = gen_reg_rtx (V32HImode);
+
+      emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
+      emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
+      emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
+      emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
+						perm_lo, op1_lowpart));
+      emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
+						perm_hi, op1_lowpart));
+      emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
+      emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
+					   gen_lowpart (V64QImode, res_hi),
+					   force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
+      return true;
+    case V8HImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+					       force_reg (V8HImode, mask), op1));
+      return true;
+    case V16HImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+						force_reg (V16HImode, mask), op1));
+      return true;
+    case V32HImode:
+      emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0,
+						force_reg (V32HImode, mask), op1));
+      return true;
+    case V4SImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0,
+					       force_reg (V4SImode, mask), op1));
+      return true;
+    case V8SImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0,
+					       force_reg (V8SImode, mask), op1));
+      return true;
     case V16SImode:
       emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
						force_reg (V16SImode, mask), op1));
       return true;
+    case V4SFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0,
+					       force_reg (V4SImode, mask), op1));
+      return true;
+    case V8SFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0,
+					       force_reg (V8SImode, mask), op1));
+      return true;
     case V16SFmode:
       emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
						force_reg (V16SImode, mask), op1));
       return true;
+    case V2DImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0,
+					       force_reg (V2DImode, mask), op1));
+      return true;
+    case V4DImode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0,
+					       force_reg (V4DImode, mask), op1));
+      return true;
     case V8DImode:
       emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
					       force_reg (V8DImode, mask), op1));
       return true;
+    case V2DFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0,
+					       force_reg (V2DImode, mask), op1));
+      return true;
+    case V4DFmode:
+      if (!TARGET_AVX512VL)
+	return false;
+      emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0,
+					       force_reg (V4DImode, mask), op1));
+      return true;
     case V8DFmode:
       emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
					       force_reg (V8DImode, mask), op1));
@@ -21779,6 +21872,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
 
   switch (imode)
     {
+    case V64QImode:
+      if (unsigned_p)
+	unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+      else
+	unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+      halfmode = V32QImode;
+      extract
+	= high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+      break;
     case V32QImode:
       if (unsigned_p)
	 unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
   if (d->one_operand_p)
     return false;
 
-  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
+      && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+    ;
+  else if (TARGET_AVX512VL)
+    ;
+  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
@@ -42693,12 +42800,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
 
   switch (vmode)
     {
+    case V8DFmode:
+    case V16SFmode:
     case V4DFmode:
     case V8SFmode:
     case V2DFmode:
     case V4SFmode:
     case V8HImode:
     case V8SImode:
+    case V32HImode:
+    case V64QImode:
+    case V16SImode:
+    case V8DImode:
       for (i = 0; i < nelt; ++i)
	 mask |= (d->perm[i] >= nelt) << i;
       break;
@@ -42921,9 +43034,9 @@ static bool
 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 {
   unsigned i, nelt, eltsz, mask;
-  unsigned char perm[32];
+  unsigned char perm[64];
   enum machine_mode vmode = V16QImode;
-  rtx rperm[32], vperm, target, op0, op1;
+  rtx rperm[64], vperm, target, op0, op1;
 
   nelt = d->nelt;
@@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
	      return false;
	    }
	}
+      else if (GET_MODE_SIZE (d->vmode) == 64)
+	{
+	  if (!TARGET_AVX512BW)
+	    return false;
+	  if (vmode == V64QImode)
+	    {
+	      for (i = 0; i < nelt; ++i)
+		if ((d->perm[i] ^ i) & (nelt / 4))
+		  return false;
+	    }
+	}
       else
	 return false;
     }
@@ -43029,6 +43153,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
     mask = 2 * nelt - 1;
   else if (vmode == V16QImode)
     mask = nelt - 1;
+  else if (vmode == V64QImode)
+    mask = nelt / 4 - 1;
   else
     mask = nelt / 2 - 1;
@@ -43054,6 +43180,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
     emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
   else if (vmode == V32QImode)
     emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+  else if (vmode == V64QImode)
+    emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
   else if (vmode == V8SFmode)
     emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
   else
@@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
       rtx (*gen) (rtx, rtx) = NULL;
       switch (d->vmode)
	 {
+	case V64QImode:
+	  if (TARGET_AVX512BW)
+	    gen = gen_avx512bw_vec_dupv64qi;
+	  break;
	 case V32QImode:
	   gen = gen_avx2_pbroadcastv32qi_1;
	   break;
+	case V32HImode:
+	  if (TARGET_AVX512BW)
+	    gen = gen_avx512bw_vec_dupv32hi;
+	  break;
	 case V16HImode:
	   gen = gen_avx2_pbroadcastv16hi_1;
	   break;
+	case V16SImode:
+	  if (TARGET_AVX512F)
+	    gen = gen_avx512f_vec_dupv16si;
+	  break;
	 case V8SImode:
	   gen = gen_avx2_pbroadcastv8si_1;
	   break;
@@ -43124,9 +43264,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
	 case V8HImode:
	   gen = gen_avx2_pbroadcastv8hi;
	   break;
+	case V16SFmode:
+	  if (TARGET_AVX512F)
+	    gen = gen_avx512f_vec_dupv16sf;
+	  break;
	 case V8SFmode:
	   gen = gen_avx2_vec_dupv8sf_1;
	   break;
+	case V8DFmode:
+	  if (TARGET_AVX512F)
+	    gen = gen_avx512f_vec_dupv8df;
+	  break;
+	case V8DImode:
+	  if (TARGET_AVX512F)
+	    gen = gen_avx512f_vec_dupv8di;
+	  break;
	 /* For other modes prefer other shuffles this function creates.  */
	 default: break;
	 }
@@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
	     mode = V8DImode;
	   else if (mode == V16SFmode)
	     mode = V16SImode;
+	  else if (mode == V4DFmode)
+	    mode = V4DImode;
+	  else if (mode == V2DFmode)
+	    mode = V2DImode;
+	  else if (mode == V8SFmode)
+	    mode = V8SImode;
+	  else if (mode == V4SFmode)
+	    mode = V4SImode;
	   for (i = 0; i < nelt; ++i)
	     vec[i] = GEN_INT (d->perm[i]);
	   rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
@@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
     return true;
 
   /* Try sequences of two instructions.  */
 
+  /* ix86_expand_vec_perm_vpermi2 is also called from ix86_expand_vec_perm,
+     so it does not take D as a parameter.  Construct the needed
+     selector here.  */
+  rtx vec[64];
+  int i;
+  for (i = 0; i < d->nelt; ++i)
+    vec[i] = GEN_INT (d->perm[i]);
+  rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
+  if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
+    return true;
   if (expand_vec_perm_pshuflw_pshufhw (d))
     return true;
@@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
   /* Given sufficient ISA support we can just return true here
      for selected vector modes.  */
   if (d.vmode == V16SImode || d.vmode == V16SFmode
-      || d.vmode == V8DFmode || d.vmode == V8DImode)
+      || d.vmode == V8DFmode || d.vmode == V8DImode
+      || d.vmode == V32HImode || d.vmode == V64QImode)
     /* All implementable with a single vpermi2 insn.  */
     return true;
   if (GET_MODE_SIZE (d.vmode) == 16)
@@ -45066,6 +45237,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
       gen_il = gen_avx2_interleave_lowv32qi;
       gen_ih = gen_avx2_interleave_highv32qi;
       break;
+    case V64QImode:
+      himode = V32HImode;
+      gen_il = gen_avx512bw_interleave_lowv64qi;
+      gen_ih = gen_avx512bw_interleave_highv64qi;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -45126,7 +45302,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
     {
       /* For SSE2, we used an full interleave, so the desired
	  results are in the even elements.  */
-      for (i = 0; i < 32; ++i)
+      for (i = 0; i < 64; ++i)
	 d.perm[i] = i * 2;
     }
   else
@@ -45134,7 +45310,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
     {
       /* For AVX, the interleave used above was not cross-lane.  So the
	  extraction is evens but with the second and third quarter swapped.
	  Happily, that is even one insn shorter than even extraction.  */
-      for (i = 0; i < 32; ++i)
+      for (i = 0; i < 64; ++i)
	 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
     }
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..d3e9635 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,7 +298,7 @@
    [V8DI (V4DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI1_AVX2
-  [(V32QI "TARGET_AVX2") V16QI])
+  [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
 
 (define_mode_iterator VI2_AVX2
   [(V16HI "TARGET_AVX2") V8HI])
@@ -10621,7 +10621,8 @@
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
    (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
-   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+   (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")])
 
 (define_expand "vec_perm<mode>"
   [(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10643,8 @@
    (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
    (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
    (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
-   (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+   (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+   (V32HI "TARGET_AVX512BW")])
 
 (define_expand "vec_perm_const<mode>"
   [(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -13559,21 +13561,21 @@
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
-(define_insn "<ssse3_avx2>_pshufb<mode>3"
-  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v")
	 (unspec:VI1_AVX2
-	  [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
-	   (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+	  [(match_operand:VI1_AVX2 1 "register_operand" "0,v")
+	   (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,vm")]
	   UNSPEC_PSHUFB))]
-  "TARGET_SSSE3"
+  "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
   "@
    pshufb\t{%2, %0|%0, %2}
-   vpshufb\t{%2, %1, %0|%0, %1, %2}"
+   vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sselog1")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix" "orig,maybe_evex")
    (set_attr "btver2_decode" "vector,vector")
    (set_attr "mode" "<sseinsnmode>")])
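
For reference, a small example (not part of the patch; the function name is mine) that should now be expanded through the new V64QImode vec_perm path when compiled with -mavx512bw, instead of being scalarized:

typedef char v64qi __attribute__ ((vector_size (64)));

v64qi
shuffle64 (v64qi x, v64qi y, v64qi sel)
{
  /* Variable two-operand byte shuffle; goes through the vec_perm
     optab that this patch extends to V64QImode.  */
  return __builtin_shuffle (x, y, sel);
}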