Hello! Attached patch fixes issues around AVX2 vpermps and vpermd instructions.
1. Changes the second argument of _mm256_permutevar8x32_ps to __m256i type and consequently changes the second argument of __builtin_ia32_permvarsf256 to __v8si type. 2. Changes the avx2_permvarv8sf pattern to accept a v8si mask operand as its 2nd operand 3. Changes the avx2_permvarv8si pattern in a similar way, so it accepts the mask as its 2nd operand 4. Macroizes the avx2_permvarv8sf and avx2_permvarv8si patterns 5. Mechanically updates all calls to these two expanders 6. Fixes testcases accordingly 2012-04-12 Uros Bizjak <ubiz...@gmail.com> PR target/52932 * config/i386/avx2intrin.h (_mm256_permutevar8x32_ps): Change second argument type to __m256i. Update call to __builtin_ia32_permvarsf256. * config/i386/sse.md (UNSPEC_VPERMVAR): New. (UNSPEC_VPERMSI, UNSPEC_VPERMSF): Remove. (avx2_permvarv8sf, avx2_permvarv8si): Switch operands 1 and 2. (avx2_permvar<mode>): Macroize insn from avx2_permvarv8sf and avx2_permvarv8si using VI4F_256 mode iterator. * config/i386/i386.c (bdesc_args) <__builtin_ia32_permvarsf256>: Update builtin type to V8SF_FTYPE_V8SF_V8SI. (ix86_expand_vec_perm): Update calls to gen_avx2_permvarv8si and gen_avx2_permvarv8sf. (expand_vec_perm_pshufb): Ditto. testsuite/ChangeLog: 2012-04-12 Uros Bizjak <ubiz...@gmail.com> PR target/52932 * gcc.target/i386/avx2-vpermps-1.c (avx2_test): Use __m256i type for second function argument. * gcc.target/i386/avx2-vpermps-2.c (init_permps): Update declaration. (calc_permps): Update declaration. Calculate result correctly. (avx2_test): Change src2 type to union256i_d. * gcc.target/i386/avx2-vpermd-2.c (calc_permd): Calculate result correctly. Patch was tested on x86_64-pc-linux-gnu {,-m32}. An earlier version of the patch (without mechanical changes) was also tested on an AVX2 target by Kirill. Patch was committed to mainline SVN, will be committed to 4.7.1 in a few days. Uros.
Index: config/i386/avx2intrin.h =================================================================== --- config/i386/avx2intrin.h (revision 186383) +++ config/i386/avx2intrin.h (working copy) @@ -1034,9 +1034,9 @@ _mm256_permute4x64_pd (__m256d __X, const int __M) extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_permutevar8x32_ps (__m256 __X, __m256 __Y) +_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y) { - return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X,(__v8sf)__Y); + return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y); } #ifdef __OPTIMIZE__ Index: config/i386/sse.md =================================================================== --- config/i386/sse.md (revision 186383) +++ config/i386/sse.md (working copy) @@ -79,8 +79,7 @@ UNSPEC_VCVTPS2PH ;; For AVX2 support - UNSPEC_VPERMSI - UNSPEC_VPERMSF + UNSPEC_VPERMVAR UNSPEC_VPERMTI UNSPEC_GATHER UNSPEC_VSIBADDR @@ -11901,30 +11900,18 @@ (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx2_permvarv8si" - [(set (match_operand:V8SI 0 "register_operand" "=x") - (unspec:V8SI - [(match_operand:V8SI 1 "register_operand" "x") - (match_operand:V8SI 2 "nonimmediate_operand" "xm")] - UNSPEC_VPERMSI))] +(define_insn "avx2_permvar<mode>" + [(set (match_operand:VI4F_256 0 "register_operand" "=x") + (unspec:VI4F_256 + [(match_operand:VI4F_256 1 "nonimmediate_operand" "xm") + (match_operand:V8SI 2 "register_operand" "x")] + UNSPEC_VPERMVAR))] "TARGET_AVX2" - "vpermd\t{%2, %1, %0|%0, %1, %2}" + "vperm<ssemodesuffix>\t{%1, %2, %0|%0, %2, %1}" [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn "avx2_permvarv8sf" - [(set (match_operand:V8SF 0 "register_operand" "=x") - (unspec:V8SF - [(match_operand:V8SF 1 "register_operand" "x") - (match_operand:V8SF 2 "nonimmediate_operand" "xm")] - UNSPEC_VPERMSF))] - "TARGET_AVX2" - "vpermps\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sselog") - 
(set_attr "prefix" "vex") - (set_attr "mode" "OI")]) - (define_expand "avx2_perm<mode>" [(match_operand:VI8F_256 0 "register_operand") (match_operand:VI8F_256 1 "nonimmediate_operand") Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 186383) +++ config/i386/i386.c (working copy) @@ -19937,7 +19937,7 @@ ix86_expand_vec_perm (rtx operands[]) vt = force_reg (maskmode, vt); mask = gen_lowpart (maskmode, mask); if (maskmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (t1, vt, mask)); + emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); else emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); @@ -19971,13 +19971,13 @@ ix86_expand_vec_perm (rtx operands[]) the high bits of the shuffle elements. No need for us to perform an AND ourselves. */ if (one_operand_shuffle) - emit_insn (gen_avx2_permvarv8si (target, mask, op0)); + emit_insn (gen_avx2_permvarv8si (target, op0, mask)); else { t1 = gen_reg_rtx (V8SImode); t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx2_permvarv8si (t1, mask, op0)); - emit_insn (gen_avx2_permvarv8si (t2, mask, op1)); + emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); goto merge_two; } return; @@ -19985,13 +19985,13 @@ ix86_expand_vec_perm (rtx operands[]) case V8SFmode: mask = gen_lowpart (V8SFmode, mask); if (one_operand_shuffle) - emit_insn (gen_avx2_permvarv8sf (target, mask, op0)); + emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); else { t1 = gen_reg_rtx (V8SFmode); t2 = gen_reg_rtx (V8SFmode); - emit_insn (gen_avx2_permvarv8sf (t1, mask, op0)); - emit_insn (gen_avx2_permvarv8sf (t2, mask, op1)); + emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); goto merge_two; } return; @@ -20004,7 +20004,7 @@ ix86_expand_vec_perm (rtx operands[]) t2 = gen_reg_rtx (V8SImode); emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); emit_insn (gen_avx_vec_concatv8si (t2, mask, 
mask)); - emit_insn (gen_avx2_permvarv8si (t1, t2, t1)); + emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); return; @@ -20014,7 +20014,7 @@ ix86_expand_vec_perm (rtx operands[]) mask = gen_lowpart (V4SFmode, mask); emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8sf (t1, t2, t1)); + emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); return; @@ -26948,8 +26948,8 @@ static const struct builtin_description bdesc_args { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, - { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT }, @@ -36126,9 +36126,9 @@ expand_vec_perm_pshufb 
(struct expand_vec_perm_d * else if (vmode == V32QImode) emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); else if (vmode == V8SFmode) - emit_insn (gen_avx2_permvarv8sf (target, vperm, op0)); + emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); else - emit_insn (gen_avx2_permvarv8si (target, vperm, op0)); + emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); } else { Index: testsuite/gcc.target/i386/avx2-vpermd-2.c =================================================================== --- testsuite/gcc.target/i386/avx2-vpermd-2.c (revision 186383) +++ testsuite/gcc.target/i386/avx2-vpermd-2.c (working copy) @@ -29,8 +29,8 @@ calc_permd (int *src1, int *src2, int *dst) memcpy (dst, src1, 32); for (i = 0; i < 8; i++) { - temp = src1[i]; - dst[i] = src2[temp & 7]; + temp = src2[i]; + dst[i] = src1[temp & 7]; } } Index: testsuite/gcc.target/i386/avx2-vpermps-1.c =================================================================== --- testsuite/gcc.target/i386/avx2-vpermps-1.c (revision 186383) +++ testsuite/gcc.target/i386/avx2-vpermps-1.c (working copy) @@ -5,9 +5,10 @@ #include <immintrin.h> __m256 x; +__m256i y; void extern avx2_test (void) { - x = _mm256_permutevar8x32_ps (x, x); + x = _mm256_permutevar8x32_ps (x, y); } Index: testsuite/gcc.target/i386/avx2-vpermps-2.c =================================================================== --- testsuite/gcc.target/i386/avx2-vpermps-2.c (revision 186383) +++ testsuite/gcc.target/i386/avx2-vpermps-2.c (working copy) @@ -8,7 +8,7 @@ #define NUM 10 static void -init_permps (float *src1, float *src2, int seed) +init_permps (float *src1, int *src2, int seed) { int i, sign = 1; @@ -21,24 +21,24 @@ static void } static void -calc_permps (float *src1, float *src2, float *dst) +calc_permps (float *src1, int *src2, float *dst) { int i; unsigned temp; - unsigned *idx = (int *) src1; memcpy (dst, src1, 32); for (i = 0; i < 8; i++) { - temp = idx[i]; - dst[i] = src2[temp & 7]; + temp = src2[i]; + dst[i] = src1[temp & 7]; } 
} static void avx2_test (void) { - union256 src1, src2, dst; + union256 src1, dst; + union256i_d src2; float dst_ref[8]; int i;