[PATCH 1/3 v4] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
Hi, This is the current version. I haven't made any major changes to the original code, so I think it will have less impact on your code. I also think the current API is sufficient to support the mode selection you mentioned; if you have any concerns, please let me know and I can tweak it further. BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. * tree-vect-stmts.cc (vectorizable_conversion): Wrap the indirect convert part. (supportable_indirect_convert_operation): New function. * tree-vectorizer.h (supportable_indirect_convert_operation): Define the new function. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. --- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++ gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 +++ gcc/testsuite/gcc.target/i386/pr107432-7.c | 150 gcc/tree-vect-generic.cc | 34 ++- gcc/tree-vect-stmts.cc | 259 ++--- gcc/tree-vectorizer.h | 4 + 10 files changed, 1013 insertions(+), 95 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi 
mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{
RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> -Original Message- > From: Tamar Christina > Sent: Monday, June 24, 2024 10:12 PM > To: Richard Biener ; Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com > Subject: RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> > int, > float -> float and int <-> float. > > > -Original Message- > > From: Richard Biener > > Sent: Monday, June 24, 2024 1:34 PM > > To: Hu, Lin1 > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > > ubiz...@gmail.com > > Subject: RE: [PATCH 1/3 v3] vect: generate suitable convert insn for > > int -> int, float > > -> float and int <-> float. > > > > On Thu, 20 Jun 2024, Hu, Lin1 wrote: > > > > > > >else if (ret_elt_bits > arg_elt_bits) > > > > > modifier = WIDEN; > > > > > > > > > > + if (supportable_convert_operation (code, ret_type, arg_type, > > > > > )) > > > > > +{ > > > > > + g = gimple_build_assign (lhs, code1, arg); > > > > > + gsi_replace (gsi, g, false); > > > > > + return; > > > > > +} > > > > > > > > Given the API change I suggest below it might make sense to have > > > > supportable_indirect_convert_operation do the above and represent > > > > it as > > single- > > > > step conversion? > > > > > > > > > > OK, if you want to supportable_indirect_convert_operation can do > > > something like supportable_convert_operation, I'll give it a try. > > > This functionality is really the part that this function can cover. > > > But this would require some changes not only the API change, because > > > supportable_indirect_convert_operation originally only supported > > > Float > > > -> Int or Int ->Float. > > > > I think I'd like to see a single API to handle direct and > > (multi-)indirect-level converts that operate on vectors with all the > > same number of lanes. 
> > > > > > > > > > > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > > > > > + int multi_step_cvt = 0; > > > > > + vec interm_types = vNULL; > > > > > + if (supportable_indirect_convert_operation (NULL, > > > > > + code, > > > > > + ret_type, arg_type, > > > > > + , , > > > > > + _step_cvt, > > > > > + _types, arg)) > > > > > +{ > > > > > + new_rhs = make_ssa_name (interm_types[0]); > > > > > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > > > > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > > > > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > > > > > + gsi_replace (gsi, g, false); > > > > > + return; > > > > > +} > > > > > + > > > > >if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > > > > FLOAT_EXPR)) > > > > > { > > > > > - if (supportable_convert_operation (code, ret_type, arg_type, > )) > > > > > - { > > > > > - g = gimple_build_assign (lhs, code1, arg); > > > > > - gsi_replace (gsi, g, false); > > > > > - return; > > > > > - } > > > > >/* Can't use get_compute_type here, as > supportable_convert_operation > > > > >doesn't necessarily use an optab and needs two arguments. */ > > > > >tree vec_compute_type > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > > index 05a169ecb2d..0aa608202ca 100644 > > > > > --- a/gcc/tree-vect-stmts.cc > > > > > +++ b/gcc/tree-vect-stmts.cc > > > > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > > > > >tree scalar_dest; > > > > >tree op0, op1 = NULL_TREE; > > > > >loop_vec_info loop_vinfo = dyn_cast (vinfo); > > > > > - tree_code tc1, tc2; > > > > > + tree_code tc1; > > > > >code_helper code, code1, code2; > > > > >code_helper codecvt1 = ERROR_MARK, c
RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> >else if (ret_elt_bits > arg_elt_bits) > > modifier = WIDEN; > > > > + if (supportable_convert_operation (code, ret_type, arg_type, )) > > +{ > > + g = gimple_build_assign (lhs, code1, arg); > > + gsi_replace (gsi, g, false); > > + return; > > +} > > Given the API change I suggest below it might make sense to have > supportable_indirect_convert_operation do the above and represent it as > single- > step conversion? > OK, if you want to supportable_indirect_convert_operation can do something like supportable_convert_operation, I'll give it a try. This functionality is really the part that this function can cover. But this would require some changes not only the API change, because supportable_indirect_convert_operation originally only supported Float -> Int or Int ->Float. > > > + code_helper code2 = ERROR_MARK, code3 = ERROR_MARK; > > + int multi_step_cvt = 0; > > + vec interm_types = vNULL; > > + if (supportable_indirect_convert_operation (NULL, > > + code, > > + ret_type, arg_type, > > + , , > > + _step_cvt, > > + _types, arg)) > > +{ > > + new_rhs = make_ssa_name (interm_types[0]); > > + g = gimple_build_assign (new_rhs, (tree_code) code3, arg); > > + gsi_insert_before (gsi, g, GSI_SAME_STMT); > > + g = gimple_build_assign (lhs, (tree_code) code2, new_rhs); > > + gsi_replace (gsi, g, false); > > + return; > > +} > > + > >if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == > FLOAT_EXPR)) > > { > > - if (supportable_convert_operation (code, ret_type, arg_type, )) > > - { > > - g = gimple_build_assign (lhs, code1, arg); > > - gsi_replace (gsi, g, false); > > - return; > > - } > >/* Can't use get_compute_type here, as supportable_convert_operation > > doesn't necessarily use an optab and needs two arguments. 
*/ > >tree vec_compute_type > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > > 05a169ecb2d..0aa608202ca 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo, > >tree scalar_dest; > >tree op0, op1 = NULL_TREE; > >loop_vec_info loop_vinfo = dyn_cast (vinfo); > > - tree_code tc1, tc2; > > + tree_code tc1; > >code_helper code, code1, code2; > >code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; > >tree new_temp; > > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo, > > break; > >} > > > > - /* For conversions between float and integer types try whether > > -we can use intermediate signed integer types to support the > > -conversion. */ > > - if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode) > > - && (code == FLOAT_EXPR || > > - (code == FIX_TRUNC_EXPR && !flag_trapping_math))) > > - { > > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE > (lhs_mode); > > - bool float_expr_p = code == FLOAT_EXPR; > > - unsigned short target_size; > > - scalar_mode intermediate_mode; > > - if (demotion) > > - { > > - intermediate_mode = lhs_mode; > > - target_size = GET_MODE_SIZE (rhs_mode); > > - } > > - else > > - { > > - target_size = GET_MODE_SIZE (lhs_mode); > > - if (!int_mode_for_size > > - (GET_MODE_BITSIZE (rhs_mode), 0).exists > (_mode)) > > - goto unsupported; > > - } > > - code1 = float_expr_p ? code : NOP_EXPR; > > - codecvt1 = float_expr_p ? 
NOP_EXPR : code; > > - opt_scalar_mode mode_iter; > > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode) > > - { > > - intermediate_mode = mode_iter.require (); > > - > > - if (GET_MODE_SIZE (intermediate_mode) > target_size) > > - break; > > - > > - scalar_mode cvt_mode; > > - if (!int_mode_for_size > > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists > (_mode)) > > - break; > > - > > - cvt_type = build_nonstandard_integer_type > > - (GET_MODE_BITSIZE (cvt_mode), 0); > > - > > - /* Check if the intermediate type can hold OP0's range. > > -When converting from float to integer this is not necessary > > -because values that do not fit the (smaller) target type are > > -unspecified anyway. */ > > - if (demotion && float_expr_p) > > - { > > - wide_int op_min_value, op_max_value; > > - if (!vect_get_range_info (op0, _min_value, > _max_value)) > > - break; > > - > > - if (cvt_type == NULL_TREE > > - || (wi::min_precision (op_max_value, SIGNED) > > - > TYPE_PRECISION
RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
Ping this thread. BRs, Lin -----Original Message----- From: Hu, Lin1 Sent: Tuesday, June 11, 2024 2:49 PM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao ; ubiz...@gmail.com; rguent...@suse.de Subject: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. I wrapped the indirect conversion code into a separate function. Its API is modeled on supportable_narrowing_operation and supportable_widening_operation. BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. * tree-vect-stmts.cc (vectorizable_conversion): Wrap the indirect convert part. (supportable_indirect_convert_operation): New function. * tree-vectorizer.h (supportable_indirect_convert_operation): Define the new function. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. 
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 + gcc/tree-vect-generic.cc | 33 ++- gcc/tree-vect-stmts.cc | 244 + gcc/tree-vectorizer.h | 9 + 10 files changed, 1011 insertions(+), 92 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2di)a, __v2si); } + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); } + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); } + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); } + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i
[PATCH] i386: Refine all cvtt* instructions with UNSPEC instead of FIX/UNSIGNED_FIX.
Hi all, This patch refines all cvtt* instructions to use UNSPEC instead of FIX/UNSIGNED_FIX, because the intrinsics should behave as documented. Bootstrapped and regtested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: PR target/115161 * config/i386/i386-builtin.def: Change CODE_FOR_* for cvtt*'s builtins. * config/i386/sse.md (unspec_avx512fp16_fix_trunc2): Use UNSPEC instead of FIX/UNSIGNED_FIX. (unspec_avx512fp16_fix_trunc2): Ditto. (unspec_avx512fp16_fix_truncv2di2): Ditto. (unspec_avx512fp16_fix_trunc2): Ditto. (unspec_sse_cvttps2pi): Ditto. (unspec_sse_cvttss2si): Ditto. (unspec_fix_truncv16sfv16si2): Ditto. (unspec_fix_truncv8sfv8si2): Ditto. (unspec_fix_truncv4sfv4si2): Ditto. (unspec_sse2_cvttpd2pi): Ditto. (unspec_fixuns_truncv2dfv2si2): Ditto. (unspec_avx512f_vcvttss2usi): Ditto. (unspec_avx512f_vcvttsd2usi): Ditto. (unspec_sse2_cvttsd2si): Ditto. (unspec_fix_truncv8dfv8si2): Ditto. (*unspec_fixuns_truncv2dfv2si2): Ditto. (unspec_fixuns_truncv2dfv2si2_mask): Ditto. (unspec_fix_truncv4dfv4si2): Ditto. (unspec_fixuns_truncv4dfv4si2): Ditto. (unspec_fix_trunc2): Ditto. (unspec_fix_trunc2): Ditto. (unspec_avx512dq_fix_truncv2sfv2di2): Ditto. (unspec_fixuns_trunc2): Ditto. (unspec_sse2_cvttpd2dq): Ditto. gcc/testsuite/ChangeLog: PR target/115161 * gcc.target/i386/pr115161-1.c: New test. 
--- gcc/config/i386/i386-builtin.def | 128 gcc/config/i386/sse.md | 335 + gcc/testsuite/gcc.target/i386/pr115161-1.c | 65 3 files changed, 464 insertions(+), 64 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr115161-1.c diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 729355230b8..893e2baa006 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -631,9 +631,9 @@ BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF) BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF) BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF) -BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF) +BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX, 0, CODE_FOR_unspec_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF) +BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_unspec_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF) +BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_unspec_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF) BDESC (OPTION_MASK_ISA_SSE, 0, 
CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) @@ -725,19 +725,19 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2p BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF) BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF) BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF) -BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF) -BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF) +BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_unspec_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF) +BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_unspec_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi",
[PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
I wrapped the indirect conversion code into a separate function. Its API is modeled on supportable_narrowing_operation and supportable_widening_operation. BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. * tree-vect-stmts.cc (vectorizable_conversion): Wrap the indirect convert part. (supportable_indirect_convert_operation): New function. * tree-vectorizer.h (supportable_indirect_convert_operation): Define the new function. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. --- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 + gcc/tree-vect-generic.cc | 33 ++- gcc/tree-vect-stmts.cc | 244 + gcc/tree-vectorizer.h | 9 + 10 files changed, 1011 insertions(+), 92 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* 
{ dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return 
(__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} +
RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> -Original Message- > From: Richard Biener > Sent: Monday, June 3, 2024 5:03 PM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, > float > -> float and int <-> float. > > On Mon, 3 Jun 2024, Hu, Lin1 wrote: > > > > -Original Message- > > > From: Richard Biener > > > Sent: Friday, May 31, 2024 8:41 PM > > > To: Hu, Lin1 > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > > > ubiz...@gmail.com > > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for > > > int -> int, float > > > -> float and int <-> float. > > > > > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > > > > > -Original Message- > > > > > From: Richard Biener > > > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > > > To: Hu, Lin1 > > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao > > > > > ; ubiz...@gmail.com > > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn > > > > > for int -> int, float > > > > > -> float and int <-> float. > > > > > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > PR target/107432 > > > > > > * tree-vect-generic.cc > > > > > > (supportable_indirect_narrowing_operation): New function for > > > > > > support indirect narrowing convert. > > > > > > (supportable_indirect_widening_operation): New function for > > > > > > support indirect widening convert. > > > > > > (expand_vector_conversion): Support convert for int -> int, > > > > > > float -> float and int <-> float. > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > PR target/107432 > > > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > > > * gcc.target/i386/pr107432-6.c: Ditto. 
> > > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > > > --- > > > > > > diff --git a/gcc/tree-vect-generic.cc > > > > > > b/gcc/tree-vect-generic.cc index > > > > > > ab640096ca2..0bedb53d9f9 100644 > > > > > > --- a/gcc/tree-vect-generic.cc > > > > > > +++ b/gcc/tree-vect-generic.cc > > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If > > > > > > not see #include "gimple-match.h" > > > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > > > #include "optabs-libfuncs.h" > > > > > > +#include "cfgloop.h" > > > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before > > > > > > GSI. > > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > > > >return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > > > +conversion > > > > > for > > > > > > + float <-> int, like double -> char. */ bool > > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator > > > > > > *gsi, > > > > > > +enum tree_code code, > > > > > > +tree lhs, > > > > > > +tree arg) > > > > > > +{ > > > > > > + gimple *g; > > > > > > + tree ret_type = TREE_TYPE (lhs); > > > > > > + tree arg_type = TREE_TYPE (arg); > >
RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> -Original Message- > From: Richard Biener > Sent: Friday, May 31, 2024 8:41 PM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, > float > -> float and int <-> float. > > On Fri, 31 May 2024, Hu, Lin1 wrote: > > > > -Original Message- > > > From: Richard Biener > > > Sent: Wednesday, May 29, 2024 5:41 PM > > > To: Hu, Lin1 > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > > > ubiz...@gmail.com > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for > > > int -> int, float > > > -> float and int <-> float. > > > > > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > > > > > gcc/ChangeLog: > > > > > > > > PR target/107432 > > > > * tree-vect-generic.cc > > > > (supportable_indirect_narrowing_operation): New function for > > > > support indirect narrowing convert. > > > > (supportable_indirect_widening_operation): New function for > > > > support indirect widening convert. > > > > (expand_vector_conversion): Support convert for int -> int, > > > > float -> float and int <-> float. > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > PR target/107432 > > > > * gcc.target/i386/pr107432-1.c: New test. > > > > * gcc.target/i386/pr107432-2.c: Ditto. > > > > * gcc.target/i386/pr107432-3.c: Ditto. > > > > * gcc.target/i386/pr107432-4.c: Ditto. > > > > * gcc.target/i386/pr107432-5.c: Ditto. > > > > * gcc.target/i386/pr107432-6.c: Ditto. > > > > * gcc.target/i386/pr107432-7.c: Ditto. > > > > --- > > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc > > > > index > > > > ab640096ca2..0bedb53d9f9 100644 > > > > --- a/gcc/tree-vect-generic.cc > > > > +++ b/gcc/tree-vect-generic.cc > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. 
If not > > > > see #include "gimple-match.h" > > > > #include "recog.h" /* FIXME: for insn_data */ > > > > #include "optabs-libfuncs.h" > > > > +#include "cfgloop.h" > > > > +#include "tree-vectorizer.h" > > > > > > > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > > > >return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > > > > > +/* A subroutine of expand_vector_conversion, support indirect > > > > +conversion > > > for > > > > + float <-> int, like double -> char. */ bool > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > > > +enum tree_code code, > > > > +tree lhs, > > > > +tree arg) > > > > +{ > > > > + gimple *g; > > > > + tree ret_type = TREE_TYPE (lhs); > > > > + tree arg_type = TREE_TYPE (arg); > > > > + tree new_rhs; > > > > + > > > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= > > > arg_elt_bits) > > > > +return false; > > > > + > > > > + unsigned short target_size; > > > > + scalar_mode tmp_cvt_mode; > > > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); > > > > + tree cvt_type = NULL_TREE; tmp_cvt_mode = lhs_mode; > > > > + target_size = GET_MODE_SIZE (rhs_mode); > > > > + > > > > + opt_scalar_mode mode_iter; > > > > + enum tree_code tc1, tc2; > > > > + unsigned HOST_WIDE_INT nelts > > > > += constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > > > + > > > > + FOR_EACH_2XWI
RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> -Original Message- > From: Richard Biener > Sent: Wednesday, May 29, 2024 5:41 PM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int, > float > -> float and int <-> float. > > On Thu, 23 May 2024, Hu, Lin1 wrote: > > > gcc/ChangeLog: > > > > PR target/107432 > > * tree-vect-generic.cc > > (supportable_indirect_narrowing_operation): New function for > > support indirect narrowing convert. > > (supportable_indirect_widening_operation): New function for > > support indirect widening convert. > > (expand_vector_conversion): Support convert for int -> int, > > float -> float and int <-> float. > > > > gcc/testsuite/ChangeLog: > > > > PR target/107432 > > * gcc.target/i386/pr107432-1.c: New test. > > * gcc.target/i386/pr107432-2.c: Ditto. > > * gcc.target/i386/pr107432-3.c: Ditto. > > * gcc.target/i386/pr107432-4.c: Ditto. > > * gcc.target/i386/pr107432-5.c: Ditto. > > * gcc.target/i386/pr107432-6.c: Ditto. > > * gcc.target/i386/pr107432-7.c: Ditto. > > --- > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index > > ab640096ca2..0bedb53d9f9 100644 > > --- a/gcc/tree-vect-generic.cc > > +++ b/gcc/tree-vect-generic.cc > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3. If not see > > #include "gimple-match.h" > > #include "recog.h" /* FIXME: for insn_data */ > > #include "optabs-libfuncs.h" > > +#include "cfgloop.h" > > +#include "tree-vectorizer.h" > > > > > > /* Build a ternary operation and gimplify it. Emit code before GSI. > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion > (gimple_stmt_iterator *gsi, tree inner_type, tree a, > >return gimplify_build2 (gsi, code, outer_type, b, c); } > > > > +/* A subroutine of expand_vector_conversion, support indirect conversion > for > > + float <-> int, like double -> char. 
*/ bool > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi, > > +enum tree_code code, > > +tree lhs, > > +tree arg) > > +{ > > + gimple *g; > > + tree ret_type = TREE_TYPE (lhs); > > + tree arg_type = TREE_TYPE (arg); > > + tree new_rhs; > > + > > + unsigned int ret_elt_bits = vector_element_bits (ret_type); > > + unsigned int arg_elt_bits = vector_element_bits (arg_type); if > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >= > arg_elt_bits) > > +return false; > > + > > + unsigned short target_size; > > + scalar_mode tmp_cvt_mode; > > + scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type)); > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type)); tree > > + cvt_type = NULL_TREE; tmp_cvt_mode = lhs_mode; target_size = > > + GET_MODE_SIZE (rhs_mode); > > + > > + opt_scalar_mode mode_iter; > > + enum tree_code tc1, tc2; > > + unsigned HOST_WIDE_INT nelts > > += constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type)); > > + > > + FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode) > > +{ > > + tmp_cvt_mode = mode_iter.require (); > > + > > + if (GET_MODE_SIZE (tmp_cvt_mode) > target_size) > > + break; > > + > > + scalar_mode cvt_mode; > > + int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode); > > + if (!int_mode_for_size (tmp_cvt_size, 0).exists (_mode)) > > + break; > > + > > + int cvt_size = GET_MODE_BITSIZE (cvt_mode); > > + bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED > (arg_type); > > + cvt_type = build_nonstandard_integer_type (cvt_size, > > + isUnsigned); > > + > > + cvt_type = build_vector_type (cvt_type, nelts); > > + if (cvt_type == NULL_TREE > > + || !supportable_convert_operation ((tree_code) NOP_EXPR, > > +ret_type, > > +cvt_type, ) > > + || !supportable_convert_operation ((tree_code) code, > > +cvt_type, > > +arg_type, )) > > + continue; > > + > > + new_rhs = make_ssa_name (cvt_type); > > + g = vect_gimple_build (ne
[PATCH] i386: Handle target of __builtin_ia32_cmp[p|s][s|d] from avx into sse/sse2/avx
Hi all, This patch aims to extend __builtin_ia32_cmp[p|s][s|d] from avx to sse/sse2/avx, where its immediate is in range of [0, 7]. Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk? BRs, Lin gcc/ChangeLog: * config/i386/avxintrin.h: Move cmp[p|s][s|d] to [e|x]mmintrin.h, and move macros to xmmintrin.h * config/i386/emmintrin.h: Add cmp[p|s]d intrins. * config/i386/i386-builtin.def: Modify __builtin_ia32_cmp[p|s][s|d]. * config/i386/i386-expand.cc (ix86_expand_args_builtin): Raise error when imm is in range of [8, 32] without avx. * config/i386/sse.md (avx_cmp<mode>3): Modify define_insn. (avx_vmcmp<mode>3): Ditto. * config/i386/xmmintrin.h (_CMP_EQ_OQ): New macro for sse/sse2. (_CMP_LT_OS): Ditto (_CMP_LE_OS): Ditto (_CMP_UNORD_Q): Ditto (_CMP_NEQ_UQ): Ditto (_CMP_NLT_US): Ditto (_CMP_NLE_US): Ditto (_CMP_ORD_Q): Ditto (_mm_cmp_ps): Move intrin from avxintrin.h to xmmintrin.h (_mm_cmp_ss): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/sse-cmp-1.c: New test. * gcc.target/i386/sse-cmp-2.c: Ditto. * gcc.target/i386/sse-cmp-error.c: Ditto. 
--- gcc/config/i386/avxintrin.h | 56 --- gcc/config/i386/emmintrin.h | 22 + gcc/config/i386/i386-builtin.def | 10 +- gcc/config/i386/i386-expand.cc| 6 ++ gcc/config/i386/predicates.md | 5 + gcc/config/i386/sse.md| 42 gcc/config/i386/xmmintrin.h | 41 gcc/testsuite/gcc.target/i386/sse-cmp-1.c | 20 gcc/testsuite/gcc.target/i386/sse-cmp-2.c | 96 +++ gcc/testsuite/gcc.target/i386/sse-cmp-error.c | 16 10 files changed, 236 insertions(+), 78 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/sse-cmp-1.c create mode 100644 gcc/testsuite/gcc.target/i386/sse-cmp-2.c create mode 100644 gcc/testsuite/gcc.target/i386/sse-cmp-error.c diff --git a/gcc/config/i386/avxintrin.h b/gcc/config/i386/avxintrin.h index 80214540888..ec9b9905b5f 100644 --- a/gcc/config/i386/avxintrin.h +++ b/gcc/config/i386/avxintrin.h @@ -72,22 +72,6 @@ typedef double __m256d_u __attribute__ ((__vector_size__ (32), /* Compare predicates for scalar and packed compare intrinsics. */ -/* Equal (ordered, non-signaling) */ -#define _CMP_EQ_OQ 0x00 -/* Less-than (ordered, signaling) */ -#define _CMP_LT_OS 0x01 -/* Less-than-or-equal (ordered, signaling) */ -#define _CMP_LE_OS 0x02 -/* Unordered (non-signaling) */ -#define _CMP_UNORD_Q 0x03 -/* Not-equal (unordered, non-signaling) */ -#define _CMP_NEQ_UQ0x04 -/* Not-less-than (unordered, signaling) */ -#define _CMP_NLT_US0x05 -/* Not-less-than-or-equal (unordered, signaling) */ -#define _CMP_NLE_US0x06 -/* Ordered (nonsignaling) */ -#define _CMP_ORD_Q 0x07 /* Equal (unordered, non-signaling) */ #define _CMP_EQ_UQ 0x08 /* Not-greater-than-or-equal (unordered, signaling) */ @@ -381,18 +365,6 @@ _mm256_xor_ps (__m256 __A, __m256 __B) } #ifdef __OPTIMIZE__ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P) -{ - return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P); -} - -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) -_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P) -{ - return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P); -} - extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P) { @@ -406,27 +378,7 @@ _mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P) return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, __P); } - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P) -{ - return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P); -} - -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) -{ - return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P); -} #else -#define _mm_cmp_pd(X, Y, P)\ - ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P))) - -#define _mm_cmp_ps(X, Y, P)\ - ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P))) - #define _mm256_cmp_pd(X, Y, P) \ ((__m256d)
[PATCH 3/3 v2] vect: support direct conversion under x86-64-v3.
According to hongtao's suggestion, I support some trunc in mmx.md under x86-64-v3, and optimize ix86_expand_trunc_with_avx2_noavx512f. BRs, Lin gcc/ChangeLog: PR 107432 * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): New function for generate a series of suitable insn. * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): Define new function. * config/i386/sse.md: Extend trunc2 for x86-64-v3. (ssebytemode) Add V8HI. (PMOV_DST_MODE_2_AVX2): New mode iterator. (PMOV_SRC_MODE_3_AVX2): Ditto. * config/i386/mmx.md (trunc2): Ditto. (avx512vl_trunc2): Ditto. (truncv2si2): Ditto. (avx512vl_truncv2si2): Ditto. (mmxbytemode): New mode attr. gcc/testsuite/ChangeLog: PR 107432 * gcc.target/i386/pr107432-8.c: New test. * gcc.target/i386/pr107432-9.c: Ditto. * gcc.target/i386/pr92645-4.c: Modify test. --- gcc/config/i386/i386-expand.cc | 44 ++- gcc/config/i386/i386-protos.h | 3 + gcc/config/i386/mmx.md | 35 +- gcc/config/i386/sse.md | 88 ++ gcc/testsuite/gcc.target/i386/pr107432-8.c | 94 +++ gcc/testsuite/gcc.target/i386/pr107432-9.c | 129 + gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - 7 files changed, 363 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..90705803d29 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) emit_insn (gen_xorv4si3 (value, value, large)); } -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, -machine_mode mode, rtx target, -rtx var, int one_var); - /* Convert an unsigned DImode value into a DFmode, using only SSE. Expects the 64-bit DImode to be supplied in a pair of integral registers. Requires SSE2; will use SSE3 if available. 
For x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, whose ONE_VAR element is VAR, and other elements are zero. Return true if successful. */ -static bool +bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, rtx target, rtx var, int one_var) { @@ -25551,4 +25547,42 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) return ret; } +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ + +void +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode) +{ + machine_mode out_mode = GET_MODE (output); + machine_mode in_mode = GET_MODE (input); + int len = GET_MODE_SIZE (in_mode); + gcc_assert (len == GET_MODE_SIZE (cvt_mode) + && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode) + && (REG_P (input) || SUBREG_P (input))); + scalar_mode inner_out_mode = GET_MODE_INNER (out_mode); + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); + int out_innersize = GET_MODE_SIZE (inner_out_mode); + + struct expand_vec_perm_d d; + d.target = gen_reg_rtx (cvt_mode); + d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode); + d.op1 = d.op0; + d.vmode = cvt_mode; + d.nelt = GET_MODE_NUNITS (cvt_mode); + d.testing_p = false; + d.one_operand_p = true; + + /* Init perm. Put the needed bits of input in order and + fill the rest of bits by default. 
*/ + for (int i = 0; i < d.nelt; ++i) +{ + d.perm[i] = i; + if (i < GET_MODE_NUNITS (out_mode)) + d.perm[i] = i * (in_innersize / out_innersize); +} + + bool ok = ix86_expand_vec_perm_const_1(&d); + gcc_assert (ok); + emit_move_insn (output, gen_lowpart (out_mode, d.target)); +} + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dbc861fb1ea..aa826f4864f 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, bool, rtx_code_label *); extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); +extern void ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx, machine_mode); extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool ix86_memtag_can_tag_addresses (void); @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
[PATCH 2/3 v2] vect: Support v4hi -> v4qi.
Apart from adding TARGET_MMX_WITH_SSE, I merged the two patterns. BRs, Lin gcc/ChangeLog: PR target/107432 * config/i386/mmx.md (VI2_32_64): New mode iterator. (mmxhalfmode): New mode attr. (mmxhalfmodelower): Ditto. (truncv2hiv2qi2): Extend mode v4hi and change name from truncv2hiv2qi2 to trunc<mode><mmxhalfmodelower>2. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: Modify test. * gcc.target/i386/pr107432-6.c: Add test. --- gcc/config/i386/mmx.md | 17 + gcc/testsuite/gcc.target/i386/pr107432-1.c | 13 - gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 --- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 5f342497885..27b080bfeb6 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -67,6 +67,9 @@ (define_mode_iterator V2F_32 [V2HF V2BF]) ;; 4-byte integer vector modes (define_mode_iterator VI_32 [V4QI V2HI]) +;; 8-byte and 4-byte HImode vector modes +(define_mode_iterator VI2_32_64 [(V4HI "TARGET_MMX_WITH_SSE") V2HI]) + ;; 4-byte and 2-byte integer vector modes (define_mode_iterator VI_16_32 [V4QI V2QI V2HI]) @@ -106,6 +109,12 @@ (define_mode_attr mmxinsnmode (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) +(define_mode_attr mmxhalfmode + [(V4HI "V4QI") (V2HI "V2QI")]) + +(define_mode_attr mmxhalfmodelower + [(V4HI "v4qi") (V2HI "v2qi")]) + ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr mmxintvecmode [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI") @@ -4873,10 +4882,10 @@ (define_expand "v2qiv2hi2" DONE; }) -(define_insn "truncv2hiv2qi2" - [(set (match_operand:V2QI 0 "register_operand" "=v") - (truncate:V2QI - (match_operand:V2HI 1 "register_operand" "v")))] +(define_insn "trunc<mode><mmxhalfmodelower>2" + [(set (match_operand:<mmxhalfmode> 0 "register_operand" "=v") + (truncate:<mmxhalfmode> + (match_operand:VI2_32_64 1 "register_operand" "v")))] "TARGET_AVX512VL && TARGET_AVX512BW" "vpmovwb\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") diff --git 
a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c index a4f37447eb4..afdf367afe2 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-1.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -7,7 +7,8 @@ /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */ #include @@ -113,6 +114,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) return __builtin_convertvector((__v2hi)a, __v2qi); } +__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a) +{ + return __builtin_convertvector((__v4hi)a, __v4qi); +} + __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hi)a, __v8qi); @@ -218,6 +224,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) return __builtin_convertvector((__v2hu)a, __v2qu); } +__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a) +{ + return __builtin_convertvector((__v4hu)a, __v4qu); +} + __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hu)a, __v8qu); diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c index 4a68a10b089..7d3717d45bc 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-6.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -8,11 +8,14 @@ /* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! 
ia32 } } } } */ -/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ -/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */ #include @@ -103,6 +106,11 @@ __v2qi
[PATCH] i386: Optimize EQ/NE comparison between avx512 kmask and -1.
Hi all, This patch aims to achieve EQ/NE comparison between avx512 kmask and -1 by using kortest and checking CF. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk? BRs, Lin gcc/ChangeLog: PR target/113609 * config/i386/sse.md (*kortest_cmp_setcc): New define_insn_and_split. (*kortest_cmp_jcc): Ditto. gcc/testsuite/ChangeLog: PR target/113609 * gcc.target/i386/pr113609-1.c: New test. * gcc.target/i386/pr113609-2.c: Ditto. --- gcc/config/i386/sse.md | 67 +++ gcc/testsuite/gcc.target/i386/pr113609-1.c | 194 + gcc/testsuite/gcc.target/i386/pr113609-2.c | 161 + 3 files changed, 422 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-2.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b59c988fc31..34fd2e4afac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2201,6 +2201,73 @@ (define_expand "kortest" UNSPEC_KORTEST))] "TARGET_AVX512F") +;; Optimize cmp + setcc with mask register by kortest + setcc. +(define_insn_and_split "*kortest_cmp_setcc" + [(set (match_operand:QI 0 "nonimmediate_operand" "=qm, qm") +(match_operator:QI 1 "bt_comparison_operator" + [(match_operand:SWI1248_AVX512BWDQ_64 2 "register_operand" "?k, ") +(const_int -1)])) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512BW" + "#" + "&& reload_completed" + [(const_int 0)] +{ + if (MASK_REGNO_P (REGNO (operands[2]))) +{ + emit_insn (gen_kortest_ccc (operands[2], operands[2])); + operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG); +} + else +{ + operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG); + emit_insn (gen_rtx_SET (operands[4], + gen_rtx_COMPARE (CCZmode, + operands[2], + constm1_rtx))); +} + ix86_expand_setcc (operands[0], +GET_CODE (operands[1]), +operands[4], +const0_rtx); + DONE; +}) + +;; Optimize cmp + jcc with mask register by kortest + jcc. 
+(define_insn_and_split "*kortest_cmp_jcc" + [(set (pc) + (if_then_else + (match_operator 0 "bt_comparison_operator" + [(match_operand:SWI1248_AVX512BWDQ_64 1 "register_operand" "?k, ") + (const_int -1)]) + (label_ref (match_operand 2)) + (pc))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512BW" + "#" + "&& reload_completed" + [(const_int 0)] +{ + if (MASK_REGNO_P (REGNO (operands[1]))) +{ + emit_insn (gen_kortest_ccc (operands[1], operands[1])); + operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG); +} + else +{ + operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG); + emit_insn (gen_rtx_SET (operands[4], + gen_rtx_COMPARE (CCZmode, + operands[1], + constm1_rtx))); +} + ix86_expand_branch (GET_CODE (operands[0]), + operands[4], + const0_rtx, + operands[2]); + DONE; +}) + (define_insn "kunpckhi" [(set (match_operand:HI 0 "register_operand" "=k") (ior:HI diff --git a/gcc/testsuite/gcc.target/i386/pr113609-1.c b/gcc/testsuite/gcc.target/i386/pr113609-1.c new file mode 100644 index 000..f0639b8500a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr113609-1.c @@ -0,0 +1,194 @@ +/* PR target/113609 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4" } */ +/* { dg-final { scan-assembler-not "^cmp" } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+sete" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+setne" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+je" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not "\[ \\t\]+jne" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 1 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+setne" 1 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+je" 1 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "kortest" 12 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "kortest" 17 { target { ! ia32 } } } } */ + +#include + +unsigned int +cmp_vector_sete_mask8(__m128i a, __m128i b) +{ +__mmask8 k = _mm_cmpeq_epi16_mask (a, b); +if (k == (__mmask8) -1) + return 1; +else + return 0; +} + +unsigned int +cmp_vector_sete_mask16(__m128i a, __m128i b) +{ +__mmask16 k = _mm_cmpeq_epi8_mask (a, b); +if (k == (__mmask16) -1) + return 1; +else + return 0; +}
RE: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> -Original Message- > From: Hongtao Liu > Sent: Thursday, May 23, 2024 2:42 PM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com; rguent...@suse.de > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. > > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 wrote: > > > > gcc/ChangeLog: > > > > PR 107432 > > * config/i386/i386-expand.cc > > (ix86_expand_trunc_with_avx2_noavx512f): > > New function for generate a series of suitable insn. > > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > > Define new function. > > * config/i386/sse.md: Extend trunc2 for x86-64-v3. > I have some concern for this patch since > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this > patch. OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5). Or we can disable some of the optimization via vpermq. In pr107432-8.c, there are only 5 tests that use vpermq. BRs, Lin > > gcc/testsuite/ChangeLog: > > > > PR 107432 > > * gcc.target/i386/pr107432-8.c: New test. > > * gcc.target/i386/pr107432-9.c: Ditto. > > * gcc.target/i386/pr92645-4.c: Modify test. 
> > --- > > gcc/config/i386/i386-expand.cc | 47 +++- > > gcc/config/i386/i386-protos.h | 3 + > > gcc/config/i386/sse.md | 87 +++ > > gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 + > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 + > > gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - > > 6 files changed, 304 insertions(+), 29 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c > > > > diff --git a/gcc/config/i386/i386-expand.cc > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) > >emit_insn (gen_xorv4si3 (value, value, large)); } > > > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, > > -machine_mode mode, rtx > > target, > > -rtx var, int one_var); > > - > > /* Convert an unsigned DImode value into a DFmode, using only SSE. > > Expects the 64-bit DImode to be supplied in a pair of integral > > registers. Requires SSE2; will use SSE3 if available. For > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool > mmx_ok, machine_mode mode, > > whose ONE_VAR element is VAR, and other elements are zero. Return true > > if successful. */ > > > > -static bool > > +bool > > ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, > > rtx target, rtx var, int one_var) > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) > >return ret; > > } > > > > +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ > > + > > +bool > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) { > > + machine_mode out_mode = GET_MODE (output); > > + machine_mode in_mode = GET_MODE (input); > > + int len = GET_MODE_SIZE (in_mode); > > + gcc_assert (len == 16 || len == 32); > > + machine_mode cvt_mode = (len == 16) ? 
V16QImode : V32QImode; > > + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); > > + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); > > + > > + struct expand_vec_perm_d d; > > + d.target = gen_reg_rtx (cvt_mode); > > + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), > > + in_mode); > > + d.op1 = d.op0; > > + d.vmode = cvt_mode; > > + d.nelt = len; > > + d.testing_p = false; > > + d.one_operand_p = true; > > + > > + /* Init perm. Put the needed bits of input in order and > > + fill the rest of bits by default. */ int tot = 0; for (int i > > + = 0; i < len; ++i) > > +{ > > + d.perm[i] = i; > > + if ((i % in_innersize) < out_innersize) > > + d.perm[tot++] = i; > > +} > > + > > + if (ix86_expand_vec_perm_const_1()) > > +{ > > + emit_move_insn (outp
[PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (supportable_indirect_narrowing_operation): New function for support indirect narrowing convert. (supportable_indirect_widening_operation): New function for support indirect widening convert. (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. --- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ gcc/tree-vect-generic.cc | 157 +- 8 files changed, 968 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { 
scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi 
mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128imm256_cvtepi32_epi16_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v8si)a, __v8hi); +} + +__m256imm512_cvtepi32_epi16_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v16si)a, __v16hi);
[PATCH 3/3] vect: support direct conversion under x86-64-v3.
gcc/ChangeLog: PR 107432 * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): New function for generate a series of suitable insn. * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): Define new function. * config/i386/sse.md: Extend trunc2 for x86-64-v3. gcc/testsuite/ChangeLog: PR 107432 * gcc.target/i386/pr107432-8.c: New test. * gcc.target/i386/pr107432-9.c: Ditto. * gcc.target/i386/pr92645-4.c: Modify test. --- gcc/config/i386/i386-expand.cc | 47 +++- gcc/config/i386/i386-protos.h | 3 + gcc/config/i386/sse.md | 87 +++ gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 + gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 + gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - 6 files changed, 304 insertions(+), 29 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) emit_insn (gen_xorv4si3 (value, value, large)); } -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, -machine_mode mode, rtx target, -rtx var, int one_var); - /* Convert an unsigned DImode value into a DFmode, using only SSE. Expects the 64-bit DImode to be supplied in a pair of integral registers. Requires SSE2; will use SSE3 if available. For x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, whose ONE_VAR element is VAR, and other elements are zero. Return true if successful. */ -static bool +bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, rtx target, rtx var, int one_var) { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) return ret; } +/* Trunc a vector to a narrow vector, like v4di -> v4si. 
*/ + +bool +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) +{ + machine_mode out_mode = GET_MODE (output); + machine_mode in_mode = GET_MODE (input); + int len = GET_MODE_SIZE (in_mode); + gcc_assert (len == 16 || len == 32); + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); + + struct expand_vec_perm_d d; + d.target = gen_reg_rtx (cvt_mode); + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode); + d.op1 = d.op0; + d.vmode = cvt_mode; + d.nelt = len; + d.testing_p = false; + d.one_operand_p = true; + + /* Init perm. Put the needed bits of input in order and + fill the rest of bits by default. */ + int tot = 0; + for (int i = 0; i < len; ++i) +{ + d.perm[i] = i; + if ((i % in_innersize) < out_innersize) + d.perm[tot++] = i; +} + + if (ix86_expand_vec_perm_const_1 (&d)) +{ + emit_move_insn (output, gen_lowpart (out_mode, d.target)); + return true; +} + + return false; +} + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, bool, rtx_code_label *); extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool ix86_memtag_can_tag_addresses (void); @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, rtx); +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx, +rtx, int); extern bool
ix86_extract_perm_from_pool_constant (int*, rtx); /* In i386-c.cc */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f57f36ae380..0b14b3dc1ac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -14373,14 +14373,25 @@ (define_expand "avx512bw_v32hiv32qi2_mask_store" (define_mode_iterator PMOV_DST_MODE_2 [V4SI V8HI (V16QI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_DST_MODE_2_AVX2 + [V4SI V8HI V16QI]) (define_mode_attr pmov_suff_2
[PATCH 2/3] vect: Support v4hi -> v4qi.
gcc/ChangeLog: PR target/107432 * config/i386/mmx.md (truncv4hiv4qi2): New define_insn. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-6.c: Add test. --- gcc/config/i386/mmx.md | 10 ++ gcc/testsuite/gcc.target/i386/pr107432-1.c | 12 +++- gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 --- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 5f342497885..30f0d88af9f 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -4883,6 +4883,16 @@ (define_insn "truncv2hiv2qi2" (set_attr "prefix" "evex") (set_attr "mode" "TI")]) +(define_insn "truncv4hiv4qi2" + [(set (match_operand:V4QI 0 "register_operand" "=v") + (truncate:V4QI + (match_operand:V4HI 1 "register_operand" "v")))] + "TARGET_AVX512VL && TARGET_AVX512BW" + "vpmovwb\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + (define_mode_iterator V2QI_V2HI [V2QI V2HI]) (define_insn "truncv2si2" [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v") diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c index a4f37447eb4..e0c7ffc8e5b 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-1.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -7,7 +7,7 @@ /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 } } */ #include @@ -113,6 +113,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi a) return __builtin_convertvector((__v2hi)a, __v2qi); } +__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a) +{ + return __builtin_convertvector((__v4hi)a, __v4qi); +} + __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hi)a, __v8qi); @@ -218,6 +223,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu a) return __builtin_convertvector((__v2hu)a, __v2qu); } +__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a) +{ + return __builtin_convertvector((__v4hu)a, __v4qu); +} + __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a) { return __builtin_convertvector((__v8hu)a, __v8qu); diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c b/gcc/testsuite/gcc.target/i386/pr107432-6.c index 4a68a10b089..7d3717d45bc 100644 --- a/gcc/testsuite/gcc.target/i386/pr107432-6.c +++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c @@ -8,11 +8,14 @@ /* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */ -/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */ /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! 
ia32 } } } } */ -/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */ #include @@ -103,6 +106,11 @@ __v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a) return __builtin_convertvector((__v2hf)a, __v2qi); } +__v4qi mm64_cvtph_epi8_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector((__v4hf)a, __v4qi); +} + __v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a) { return __builtin_convertvector((__v8hf)a, __v8qi); @@ -123,6 +131,11 @@ __v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a) return __builtin_convertvector((__v2hf)a, __v2qu); } +__v4qu mm64_cvtph_epu8_builtin_convertvector(__v4hf a) +{ + return __builtin_convertvector((__v4hf)a, __v4qu); +} + __v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a) { return __builtin_convertvector((__v8hf)a, __v8qu); -- 2.31.1
[PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and x86-64-v3.
This series of patches improves __builtin_convertvector for x86-64-v4 and x86-64-v3. I modified the first patch according to Richard's suggestion and am sending all three out together to show my complete modification of the function. They have been bootstrapped and regtested on x86_64-pc-linux-gnu. BRs, Lin Hu, Lin1 (3): vect: generate suitable convert insn for int -> int, float -> float and int <-> float. vect: Support v4hi -> v4qi. vect: support direct conversion under x86-64-v3. gcc/config/i386/i386-expand.cc | 47 +++- gcc/config/i386/i386-protos.h | 3 + gcc/config/i386/mmx.md | 10 + gcc/config/i386/sse.md | 87 ++-- gcc/testsuite/gcc.target/i386/pr107432-1.c | 244 + gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 ++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 152 + gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 + gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 ++ gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 ++ gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - gcc/tree-vect-generic.cc | 157 - 15 files changed, 1305 insertions(+), 35 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c -- 2.31.1
RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> -Original Message- > From: Richard Biener > Sent: Tuesday, May 14, 2024 8:23 PM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com > Subject: RE: [PATCH] vect: generate suitable convert insn for int -> int, > float -> > float and int <-> float. > > On Tue, 14 May 2024, Hu, Lin1 wrote: > > > Do you have any advice? > > > > BRs, > > Lin > > > > -Original Message- > > From: Hu, Lin1 > > Sent: Wednesday, May 8, 2024 9:38 AM > > To: gcc-patches@gcc.gnu.org > > Cc: Liu, Hongtao ; ubiz...@gmail.com > > Subject: [PATCH] vect: generate suitable convert insn for int -> int, float > > -> > float and int <-> float. > > > > Hi, all > > > > This patch aims to optimize __builtin_convertvector. We want the function > can generate more efficient insn for some situations. Like v2si -> v2di. > > > > The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK > for trunk? > > I don't like the new code to be in a separate function, not integrated with > the > existing handling. Note the existing handling should get, say, V8DF -> V8SI > correct for SSE by splitting the operation into smaller vectors but your code > seems to just handle the cases the vectors are already properly sized. > Yes, my code only handles some cases, but the others are handled by the core part of tree-vect-generic.cc. I just take care of some special cases up front. So, V8DF -> V8SI is still split into smaller vectors for SSE. And for SSE, I have another patch to expand the available direct optab environment with ix86_expand_vec_perm_const_1 (...). That patch hasn't been sent yet. I will send it out together with this one after I revise this patch, to give an overall view of my changes to this function. > > Without checking it seems you are basing the code on what the vectorizer does?
> Maybe we should have some common code that computes intermediate > conversion steps supported by the HW unifying what for example > supportable_widening_operation or supportable_narrowing_operation can do > to also cover int <-> float conversions. > Yes, my code is based on vectorizable_conversion(...). I will consider splitting the function and defining some new functions, as you advise, to make my code more general. BRs, Lin > > That said, if you don't want to do that please still think about the core > part of > tree-vect-generic.cc which is breaking down large emulated vectors into small > supported vectors. > > Richard. > > > BRs, > > Lin > > > > gcc/ChangeLog: > > > > PR target/107432 > > * tree-vect-generic.cc (expand_vector_conversion): Support > > convert for int -> int, float -> float and int <-> float. > > (expand_vector_conversion_no_vec_pack): Check if can convert > > int <-> int, float <-> float and int <-> float, directly. > > Support indirect convert, when direct optab is not supported. > > > > gcc/testsuite/ChangeLog: > > > > PR target/107432 > > * gcc.target/i386/pr107432-1.c: New test. > > * gcc.target/i386/pr107432-2.c: Ditto. > > * gcc.target/i386/pr107432-3.c: Ditto. > > * gcc.target/i386/pr107432-4.c: Ditto. > > * gcc.target/i386/pr107432-5.c: Ditto. > > * gcc.target/i386/pr107432-6.c: Ditto. > > * gcc.target/i386/pr107432-7.c: Ditto.
> > --- > > gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + > gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + > gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + > gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + > gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ > gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 > gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ > > gcc/tree-vect-generic.cc | 107 +- > > 8 files changed, 918 insertions(+), 6 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c > > b/gcc/testsuite/gcc.target/i386/pr107432-1.c > > new file mode 100644 > &g
RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
Do you have any advice? BRs, Lin -Original Message- From: Hu, Lin1 Sent: Wednesday, May 8, 2024 9:38 AM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao ; ubiz...@gmail.com Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. Hi, all This patch aims to optimize __builtin_convertvector. We want the function can generate more efficient insn for some situations. Like v2si -> v2di. The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk? BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. (expand_vector_conversion_no_vec_pack): Check if can convert int <-> int, float <-> float and int <-> float, directly. Support indirect convert, when direct optab is not supported. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. 
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ gcc/tree-vect-generic.cc | 107 +- 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2di)a, __v2si); } + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); } + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); } + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); } + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v
RE: [committed] testsuite: Fix up pr84508* tests [PR84508]
> -Original Message- > From: Jakub Jelinek > Sent: Friday, May 10, 2024 3:04 AM > To: Hongtao Liu > Cc: Hu, Lin1 ; gcc-patches@gcc.gnu.org; Liu, Hongtao > ; ubiz...@gmail.com > Subject: [committed] testsuite: Fix up pr84508* tests [PR84508] > > On Thu, May 09, 2024 at 12:45:42PM +0800, Hongtao Liu wrote: > > > PR target/84508 > > > * gcc.target/i386/pr84508-1.c: New test. > > > * gcc.target/i386/pr84508-2.c: Ditto. > > The tests FAIL on x86_64-linux with > /usr/bin/ld: cannot find -lubsan > collect2: error: ld returned 1 exit status compiler exited with status 1 > FAIL: gcc.target/i386/pr84508-1.c (test for excess errors) Excess errors: > /usr/bin/ld: cannot find -lubsan > > The problem is that only *.dg/ubsan/ubsan.exp calls ubsan_init which adds the > needed search paths to libubsan library. > So, link/run tests for -fsanitize=undefined need to go into gcc.dg/ubsan/ or > g++.dg/ubsan/, even when they are target specific. > Oh, I get it, thanks. > > Tested on x86_64-linux with > make check-gcc RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} > i386.exp=pr84508* ubsan.exp=pr84508*' > and committed to trunk as obvious. > > 2024-05-09 Jakub Jelinek > > PR target/84508 > * gcc.target/i386/pr84508-1.c: Move to ... > * gcc.dg/ubsan/pr84508-1.c: ... here. Restrict to i?86/x86_64 > non-ia32 targets. > * gcc.target/i386/pr84508-2.c: Move to ... > * gcc.dg/ubsan/pr84508-2.c: ... here. Restrict to i?86/x86_64 > non-ia32 targets. > > diff --git a/gcc/testsuite/gcc.target/i386/pr84508-1.c > b/gcc/testsuite/gcc.dg/ubsan/pr84508-1.c > similarity index 74% > rename from gcc/testsuite/gcc.target/i386/pr84508-1.c > rename to gcc/testsuite/gcc.dg/ubsan/pr84508-1.c > index bb3e28d017e..d781e01 100644 > --- a/gcc/testsuite/gcc.target/i386/pr84508-1.c > +++ b/gcc/testsuite/gcc.dg/ubsan/pr84508-1.c > @@ -1,5 +1,6 @@ > -/* { dg-do run { target { ! ia32 } } } */ > +/* { dg-do run { target { { i?86-*-* x86_64-*-* } && { ! 
ia32 } } } } > +*/ > /* { dg-options "-fsanitize=undefined" } */ > + > #include > > int main() > diff --git a/gcc/testsuite/gcc.target/i386/pr84508-2.c > b/gcc/testsuite/gcc.dg/ubsan/pr84508-2.c > similarity index 73% > rename from gcc/testsuite/gcc.target/i386/pr84508-2.c > rename to gcc/testsuite/gcc.dg/ubsan/pr84508-2.c > index 32a8f20a536..cf9c7db1d15 100644 > --- a/gcc/testsuite/gcc.target/i386/pr84508-2.c > +++ b/gcc/testsuite/gcc.dg/ubsan/pr84508-2.c > @@ -1,5 +1,6 @@ > -/* { dg-do run { target { ! ia32 } } } */ > +/* { dg-do run { target { { i?86-*-* x86_64-*-* } && { ! ia32 } } } } > +*/ > /* { dg-options "-fsanitize=undefined" } */ > + > #include > > int main() > > Jakub
[PATCH] i386: Fix some intrinsics without alignment requirements.
Hi all, This patch fixes a problem where some intrinsics that have no alignment requirement nevertheless raise a runtime error. Bootstrapped and tested on x86_64-linux-gnu, OK for trunk? BRs, Lin gcc/ChangeLog: PR target/84508 * config/i386/emmintrin.h (_mm_load_sd): Remove alignment requirement. (_mm_store_sd): Ditto. (_mm_loadh_pd): Ditto. (_mm_loadl_pd): Ditto. (_mm_storel_pd): Add alignment requirement. * config/i386/xmmintrin.h (_mm_loadh_pi): Remove alignment requirement. (_mm_loadl_pi): Ditto. (_mm_load_ss): Ditto. (_mm_store_ss): Ditto. gcc/testsuite/ChangeLog: PR target/84508 * gcc.target/i386/pr84508-1.c: New test. * gcc.target/i386/pr84508-2.c: Ditto. --- gcc/config/i386/emmintrin.h | 11 ++- gcc/config/i386/xmmintrin.h | 9 + gcc/testsuite/gcc.target/i386/pr84508-1.c | 11 +++ gcc/testsuite/gcc.target/i386/pr84508-2.c | 11 +++ 4 files changed, 33 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr84508-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr84508-2.c diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h index 915a5234c38..d7fc1af9687 100644 --- a/gcc/config/i386/emmintrin.h +++ b/gcc/config/i386/emmintrin.h @@ -56,6 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); /* Unaligned version of the same types. */ typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1))); /* Create a selector for use with the SHUFPD instruction.
*/ #define _MM_SHUFFLE2(fp1,fp0) \ @@ -145,7 +146,7 @@ _mm_load1_pd (double const *__P) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_sd (double const *__P) { - return _mm_set_sd (*__P); + return __extension__ (__m128d){ *(double_u *)__P, 0.0 }; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -180,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_sd (double *__P, __m128d __A) { - *__P = ((__v2df)__A)[0]; + *(double_u *)__P = ((__v2df)__A)[0] ; } extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -192,7 +193,7 @@ _mm_cvtsd_f64 (__m128d __A) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storel_pd (double *__P, __m128d __A) { - _mm_store_sd (__P, __A); + *__P = ((__v2df)__A)[0]; } /* Stores the upper DPFP value. 
*/ @@ -973,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadh_pd (__m128d __A, double const *__B) { - return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); + return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B }; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadl_pd (__m128d __A, double const *__B) { - return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B); + return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] }; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h index 71b9955b843..9e20f262839 100644 --- a/gcc/config/i386/xmmintrin.h +++ b/gcc/config/i386/xmmintrin.h @@ -73,6 +73,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); /* Unaligned version of the same type. */ typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1))); /* Internal data types for implementing the intrinsics. */ typedef float __v4sf __attribute__ ((__vector_size__ (16))); @@ -774,7 +775,7 @@ _mm_unpacklo_ps (__m128 __A, __m128 __B) /* Sets the upper two SPFP values with 64-bits of data loaded from P; the lower two values are passed through from A. */ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadh_pi (__m128 __A, __m64 const *__P) +_mm_loadh_pi (__m128 __A, __m64_u const *__P) { return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P); } @@ -803,7 +804,7 @@ _mm_movelh_ps (__m128 __A, __m128 __B) /* Sets the lower two SPFP values with 64-bits of data loaded from P; the upper two values are passed through from A. 
*/ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_loadl_pi (__m128
[PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
Hi, all This patch aims to optimize __builtin_convertvector. We want the function to generate more efficient instructions in some situations, such as v2si -> v2di. The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk? BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. (expand_vector_conversion_no_vec_pack): Check if can convert int <-> int, float <-> float and int <-> float, directly. Support indirect convert, when direct optab is not supported. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. --- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ gcc/tree-vect-generic.cc | 107 +- 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@
+/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + 
return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128i
[PATCH] i386: Fix CPUID of USER_MSR.
Hi, all This patch aims to fix the wrong CPUID of USER_MSR: its correct CPUID is (0x7, 0x1).EDX[15], but I set it as (0x7, 0x0).EDX[15]. The patch also modifies the testcase to give the user a better example. It has been bootstrapped and regtested on x86-64-pc-linux-gnu, OK for trunk? BR, Lin gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_available_features): Move USER_MSR to the correct location. gcc/testsuite/ChangeLog: * gcc.target/i386/user_msr-1.c: Correct the MSR index to give the user a proper example. --- gcc/common/config/i386/cpuinfo.h | 4 ++-- gcc/testsuite/gcc.target/i386/user_msr-1.c | 9 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index f90fb4d56a2..a1eb285daed 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -861,8 +861,6 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_IBT); if (edx & bit_UINTR) set_feature (FEATURE_UINTR); - if (edx & bit_USER_MSR) - set_feature (FEATURE_USER_MSR); if (amx_usable) { if (edx & bit_AMX_TILE) @@ -921,6 +919,8 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_PREFETCHI); if (eax & bit_RAOINT) set_feature (FEATURE_RAOINT); + if (edx & bit_USER_MSR) + set_feature (FEATURE_USER_MSR); if (avx_usable) { if (eax & bit_AVXVNNI) diff --git a/gcc/testsuite/gcc.target/i386/user_msr-1.c b/gcc/testsuite/gcc.target/i386/user_msr-1.c index 447852306df..f315016d088 100644 --- a/gcc/testsuite/gcc.target/i386/user_msr-1.c +++ b/gcc/testsuite/gcc.target/i386/user_msr-1.c @@ -1,9 +1,9 @@ /* { dg-do compile { target { ! 
ia32 } } } */ /* { dg-options "-musermsr -O2" } */ /* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\%r\[a-z\]x, \\%r\[a-z\]x" 1 } } */ -/* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\\$121" 1 } } */ +/* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\\$6912" 1 } } */ /* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, \\%r\[a-z\]x" 1 } } */ -/* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, \\\$121" 1 } } */ +/* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, \\\$6912" 1 } } */ #include @@ -13,8 +13,9 @@ volatile unsigned long long y; void extern user_msr_test (void) { + y = 6913; x = _urdmsr(y); - x = _urdmsr(121); + x = _urdmsr(6912); _uwrmsr(y, x); - _uwrmsr(121, x); + _uwrmsr(6912, x); } -- 2.31.1
RE: [PATCH] Avoid generate vblendps with ymm16+
On Saturday, November 11, 2023 4:11 AM, Jakub Jelinek wrote: > On Thu, Nov 09, 2023 at 03:27:11PM +0800, Hongtao Liu wrote: > > On Thu, Nov 9, 2023 at 3:15 PM Hu, Lin1 wrote: > > > > > > This patch aims to avoid generate vblendps with ymm16+, And have > > > bootstrapped and tested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk? > > > > > > gcc/ChangeLog: > > > > > > PR target/112435 > > > * config/i386/sse.md: Adding constraints to restrict the > > > generation of > > > vblendps. > > It should be "Don't output vblendps when evex sse reg or gpr32 is involved." > > Others LGTM. > > I've missed this patch, so wrote my own today, and am wondering > > 1) if it isn't better to use separate alternative instead of >x86_evex_reg_mentioned_p, like in the patch below > 2) why do you need the last two hunks in sse.md, both avx2_permv2ti and >*avx_vperm2f128_nozero insns only use x in constraints, never v, >so x86_evex_reg_mentioned_p ought to be always false there > Yes, I think your method is better. For the second problem, I didn't focus on the constraints when I solved this problem. I did learn a good thought. Feel free to upstream this patch. BRs, Lin > > Here is the untested patch, of course you have more testcases (though, I > think it > is better to test dg-do assemble with avx512vl target rather than dg-do > compile > and scan the assembler, after all, the problem was that it didn't assemble). > > 2023-11-10 Jakub Jelinek > > PR target/112435 > * config/i386/sse.md > (avx512vl_shuf_32x4_1, > avx512dq_shuf_64x2_1): > Add > alternative with just x instead of v constraints and use vblendps > as optimization only with that alternative. > > * gcc.target/i386/avx512vl-pr112435.c: New test. 
> > --- gcc/config/i386/sse.md.jj 2023-11-09 09:04:18.616543403 +0100 > +++ gcc/config/i386/sse.md2023-11-10 15:56:44.138499931 +0100 > @@ -19235,11 +19235,11 @@ (define_expand "avx512dq_shuf_ }) > > (define_insn > "avx512dq_shuf_64x2_1" > - [(set (match_operand:VI8F_256 0 "register_operand" "=v") > + [(set (match_operand:VI8F_256 0 "register_operand" "=x,v") > (vec_select:VI8F_256 > (vec_concat: > - (match_operand:VI8F_256 1 "register_operand" "v") > - (match_operand:VI8F_256 2 "nonimmediate_operand" "vm")) > + (match_operand:VI8F_256 1 "register_operand" "x,v") > + (match_operand:VI8F_256 2 "nonimmediate_operand" "xm,vm")) > (parallel [(match_operand 3 "const_0_to_3_operand") >(match_operand 4 "const_0_to_3_operand") >(match_operand 5 "const_4_to_7_operand") @@ -19254,7 > +19254,7 @@ (define_insn "avx512dq_shu >mask = INTVAL (operands[3]) / 2; >mask |= (INTVAL (operands[5]) - 4) / 2 << 1; >operands[3] = GEN_INT (mask); > - if (INTVAL (operands[3]) == 2 && !) > + if (INTVAL (operands[3]) == 2 && ! && which_alternative > + == 0) > return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; >return > "vshuf64x2\t{%3, %2, %1, %0|%0 d7>, %1, %2, %3}"; } @@ -19386,11 +19386,11 @@ (define_expand > "avx512vl_shuf_ }) > > (define_insn "avx512vl_shuf_32x4_1" > - [(set (match_operand:VI4F_256 0 "register_operand" "=v") > + [(set (match_operand:VI4F_256 0 "register_operand" "=x,v") > (vec_select:VI4F_256 > (vec_concat: > - (match_operand:VI4F_256 1 "register_operand" "v") > - (match_operand:VI4F_256 2 "nonimmediate_operand" "vm")) > + (match_operand:VI4F_256 1 "register_operand" "x,v") > + (match_operand:VI4F_256 2 "nonimmediate_operand" "xm,vm")) > (parallel [(match_operand 3 "const_0_to_7_operand") >(match_operand 4 "const_0_to_7_operand") >(match_operand 5 "const_0_to_7_operand") @@ -19414,7 > +19414,7 @@ (define_insn "avx512vl_shuf_mask |= (INTVAL (operands[7]) - 8) / 4 << 1; >operands[3] = GEN_INT (mask); > > - if (INTVAL (operands[3]) == 2 && !) 
> + if (INTVAL (operands[3]) == 2 && ! && which_alternative > + == 0) > return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; > >return > "vshuf32x4\t{%3, %2, %1, %0|%0 nd11>, %1, %2, %3}"; > --- gcc/testsuite/gcc.target/i386/avx512vl-pr112435.c.jj 2023-11-10 > 16:04:21.708046771 +0100 > +++ gcc/testsuite/gcc.target/i386/avx512vl-pr112435.c 2023-11-10 > 16:03:51.053479094 +0100 > @@ -0,0 +1,13 @@ > +/* PR target/112435 */ > +/* { dg-do assemble { target { avx512vl && { ! ia32 } } } } */ > +/* { dg-options "-mavx512vl -O2" } */ > + > +#include > + > +__m256i > +foo (__m256i a, __m256i b) > +{ > + register __m256i c __asm__("ymm16") = a; > + asm ("" : "+v" (c)); > + return _mm256_shuffle_i32x4 (c, b, 2); } > > Jakub
[PATCH] Avoid generate vblendps with ymm16+
This patch aims to avoid generating vblendps with ymm16+. It has been bootstrapped and tested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk? gcc/ChangeLog: PR target/112435 * config/i386/sse.md: Add constraints to restrict the generation of vblendps. gcc/testsuite/ChangeLog: PR target/112435 * gcc.target/i386/pr112435-1.c: New test. * gcc.target/i386/pr112435-2.c: Ditto. * gcc.target/i386/pr112435-3.c: Ditto. --- gcc/config/i386/sse.md | 28 +--- gcc/testsuite/gcc.target/i386/pr112435-1.c | 14 gcc/testsuite/gcc.target/i386/pr112435-2.c | 64 ++ gcc/testsuite/gcc.target/i386/pr112435-3.c | 79 ++ 4 files changed, 175 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-3.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 33198756bb0..666f931c88d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19254,7 +19254,8 @@ mask = INTVAL (operands[3]) / 2; mask |= (INTVAL (operands[5]) - 4) / 2 << 1; operands[3] = GEN_INT (mask); - if (INTVAL (operands[3]) == 2 && !) + if (INTVAL (operands[3]) == 2 && ! + && !x86_evex_reg_mentioned_p (operands, 3)) return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; return "vshuf64x2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } @@ -19414,7 +19415,8 @@ mask |= (INTVAL (operands[7]) - 8) / 4 << 1; operands[3] = GEN_INT (mask); - if (INTVAL (operands[3]) == 2 && !) + if (INTVAL (operands[3]) == 2 && ! 
+ && !x86_evex_reg_mentioned_p (operands, 3)) return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; return "vshuf32x4\t{%3, %2, %1, %0|%0, %1, %2, %3}"; @@ -26776,10 +26778,13 @@ else return "vmovaps\t{%2, %0|%0, %2}"; } -if ((mask & 0xbb) == 18) - return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; -if ((mask & 0xbb) == 48) - return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; +if (!x86_evex_reg_mentioned_p (operands, 3)) + { + if ((mask & 0xbb) == 18) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 48) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + } return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -27433,10 +27438,13 @@ && avx_vperm2f128_parallel (operands[3], mode)" { int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; - if ((mask & 0xbb) == 0x12) -return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; - if ((mask & 0xbb) == 0x30) -return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + if (!x86_evex_reg_mentioned_p (operands, 3)) +{ + if ((mask & 0xbb) == 0x12) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 0x30) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; +} if ((mask & 0xbb) == 0x20) return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; operands[3] = GEN_INT (mask); diff --git a/gcc/testsuite/gcc.target/i386/pr112435-1.c b/gcc/testsuite/gcc.target/i386/pr112435-1.c new file mode 100644 index 000..ff56523b4e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr112435-1.c @@ -0,0 +1,14 @@ +/* PR target/112435 */ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-not "vblendps" } } */ + +#include + +__m256i +f(__m256i a, __m256i b) +{ + register __m256i t __asm__("ymm17") = a; + asm("":"+v"(t)); + return _mm256_shuffle_i32x4 (t, b, 2); +} diff --git a/gcc/testsuite/gcc.target/i386/pr112435-2.c b/gcc/testsuite/gcc.target/i386/pr112435-2.c new file mode 100644 index 000..27ba80b1e68 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr112435-2.c @@ -0,0 +1,64 @@ +/* PR target/112435 */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-Ofast -march=sapphirerapids" } */ +/* { dg-final { scan-assembler-not "vblendps.*ymm17\$" } } */ + +#include + +/* Vpermi128/Vpermf128 */ +__m256i +perm0 (__m256i a, __m256i b) +{ + register __m256i t __asm__("ymm17") = a; + asm("":"+v"(t)); + return _mm256_permute2x128_si256 (t, b, 50); +} + +__m256i +perm1 (__m256i a, __m256i b) +{ + register __m256i t __asm__("ymm17") = a; + asm("":"+v"(t)); + return _mm256_permute2x128_si256 (t, b, 18); +} + +__m256i +perm2 (__m256i a, __m256i b) +{ + register __m256i t __asm__("ymm17") = a; + asm("":"+v"(t)); + return _mm256_permute2x128_si256 (t, b, 48); +} + +/* vshuf{i,f}{32x4,64x2} ymm .*/ +__m256i +shuff0 (__m256i a, __m256i b) +{ + register __m256i t __asm__("ymm17") = a; + asm("":"+v"(t)); + return _mm256_shuffle_i32x4(t, b, 2); +} + +__m256 +shuff1 (__m256 a, __m256 b) +{ + register __m256 t __asm__("ymm17") = a; +
[PATCH] Fix testcases that are raised by support -mevex512
Hi, all This patch aims to fix some scan-assembler failures in pr89229-{5,6,7}b.c, which occur because we emit scalar vmov{s,d} there when trying to use x/ymm16+ without avx512vl but with avx512f+evex512. If nobody objects to this change in behavior, we intend to resolve these failures by modifying the testcases. BRs, Lin gcc/testsuite/ChangeLog: * gcc.target/i386/pr89229-5b.c: Modify test. * gcc.target/i386/pr89229-6b.c: Ditto. * gcc.target/i386/pr89229-7b.c: Ditto. --- gcc/testsuite/gcc.target/i386/pr89229-5b.c | 2 +- gcc/testsuite/gcc.target/i386/pr89229-6b.c | 2 +- gcc/testsuite/gcc.target/i386/pr89229-7b.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.target/i386/pr89229-5b.c b/gcc/testsuite/gcc.target/i386/pr89229-5b.c index 261f2e12e8d..8a81585e790 100644 --- a/gcc/testsuite/gcc.target/i386/pr89229-5b.c +++ b/gcc/testsuite/gcc.target/i386/pr89229-5b.c @@ -3,4 +3,4 @@ #include "pr89229-5a.c" -/* { dg-final { scan-assembler-times "vmovdqa32\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */ +/* { dg-final { scan-assembler-times "vmovsd\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr89229-6b.c b/gcc/testsuite/gcc.target/i386/pr89229-6b.c index a74f7169e6e..0c27daa4f74 100644 --- a/gcc/testsuite/gcc.target/i386/pr89229-6b.c +++ b/gcc/testsuite/gcc.target/i386/pr89229-6b.c @@ -3,4 +3,4 @@ #include "pr89229-6a.c" -/* { dg-final { scan-assembler-times "vmovaps\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr89229-7b.c b/gcc/testsuite/gcc.target/i386/pr89229-7b.c index d3a56e6e2b7..baba99ec775 100644 --- a/gcc/testsuite/gcc.target/i386/pr89229-7b.c +++ b/gcc/testsuite/gcc.target/i386/pr89229-7b.c @@ -3,4 +3,4 @@ #include "pr89229-7a.c" -/* { dg-final { scan-assembler-times "vmovdqa32\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } 
} */ +/* { dg-final { scan-assembler-times "vmovss\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */ -- 2.31.1
RE: [PATCH] Support Intel USER_MSR
There are some typos In /gcc/doc/extend.texi and /gcc/doc/invoke.texi. They should be USER_MSR, not UMSR. I have modified them in my branch. -Original Message- From: Hu, Lin1 Sent: Tuesday, October 10, 2023 3:47 PM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao ; ubiz...@gmail.com Subject: [PATCH] Support Intel USER_MSR This patch aims to support Intel USER_MSR. gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_available_features): Detect USER_MSR. * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_USER_MSR_SET): New. (OPTION_MASK_ISA2_USER_MSR_UNSET): Ditto. (ix86_handle_option): Handle -musermsr. * common/config/i386/i386-cpuinfo.h (enum processor_features): Add FEATURE_USER_MSR. * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for usermsr. * config.gcc: Add usermsrintrin.h * config/i386/cpuid.h (bit_USER_MSR): New. * config/i386/i386-builtin-types.def: Add DEF_FUNCTION_TYPE (VOID, UINT64, UINT64). * config/i386/i386-builtins.cc (ix86_init_mmx_sse_builtins): Add __builtin_urdmsr and __builtin_uwrmsr. * config/i386/i386-builtins.h (ix86_builtins): Add IX86_BUILTIN_URDMSR and IX86_BUILTIN_UWRMSR. * config/i386/i386-c.cc (ix86_target_macros_internal): Define __USER_MSR__. * config/i386/i386-expand.cc (ix86_expand_builtin): Handle new builtins. * config/i386/i386-isa.def (USER_MSR): Add DEF_PTA(USER_MSR). * config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p): Handle usermsr. * config/i386/i386.md (urdmsr): New define_insn. (uwrmsr): Ditto. * config/i386/i386.opt: Add option -musermsr. * config/i386/x86gprintrin.h: Include usermsrintrin.h * doc/extend.texi: Document usermsr. * doc/invoke.texi: Document -musermsr. * doc/sourcebuild.texi: Document target usermsr. * config/i386/usermsrintrin.h: New file. gcc/testsuite/ChangeLog: * gcc.target/i386/funcspec-56.inc: Add new target attribute. * gcc.target/i386/x86gprintrin-1.c: Add -musermsr for 64bit target. * gcc.target/i386/x86gprintrin-2.c: Ditto. * gcc.target/i386/x86gprintrin-3.c: Ditto. 
* gcc.target/i386/x86gprintrin-4.c: Add musermsr for 64bit target. * gcc.target/i386/x86gprintrin-5.c: Ditto * gcc.target/i386/usermsr-1.c: New test. * gcc.target/i386/usermsr-2.c: Ditto. --- gcc/common/config/i386/cpuinfo.h | 2 + gcc/common/config/i386/i386-common.cc | 15 + gcc/common/config/i386/i386-cpuinfo.h | 1 + gcc/common/config/i386/i386-isas.h| 1 + gcc/config.gcc| 3 +- gcc/config/i386/cpuid.h | 1 + gcc/config/i386/i386-builtin-types.def| 3 + gcc/config/i386/i386-builtins.cc | 8 +++ gcc/config/i386/i386-builtins.h | 2 + gcc/config/i386/i386-c.cc | 2 + gcc/config/i386/i386-expand.cc| 35 +++ gcc/config/i386/i386-isa.def | 1 + gcc/config/i386/i386-options.cc | 4 +- gcc/config/i386/i386.md | 24 gcc/config/i386/i386.opt | 4 ++ gcc/config/i386/usermsrintrin.h | 60 +++ gcc/config/i386/x86gprintrin.h| 2 + gcc/doc/extend.texi | 5 ++ gcc/doc/invoke.texi | 6 +- gcc/doc/sourcebuild.texi | 3 + gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + gcc/testsuite/gcc.target/i386/user_msr-1.c| 20 +++ gcc/testsuite/gcc.target/i386/user_msr-2.c| 16 + .../gcc.target/i386/x86gprintrin-1.c | 2 +- .../gcc.target/i386/x86gprintrin-2.c | 6 +- .../gcc.target/i386/x86gprintrin-3.c | 28 - .../gcc.target/i386/x86gprintrin-4.c | 32 +- .../gcc.target/i386/x86gprintrin-5.c | 6 +- 28 files changed, 286 insertions(+), 8 deletions(-) create mode 100644 gcc/config/i386/usermsrintrin.h create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-1.c create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-2.c diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 141d3743316..0f86b44730b 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -838,6 +838,8 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_IBT); if (edx & bit_UINTR) set_feature (FEATURE_UINTR); + if (edx & bit_USER_MSR) + set_feature (FEATURE_USER_MSR); if (amx_usable) { if (edx & bit_AMX_TILE) diff --git 
a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 684b0451bb3..13e423deceb 1006
[PATCH] Support Intel USER_MSR
This patch aims to support Intel USER_MSR. gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_available_features): Detect USER_MSR. * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_USER_MSR_SET): New. (OPTION_MASK_ISA2_USER_MSR_UNSET): Ditto. (ix86_handle_option): Handle -musermsr. * common/config/i386/i386-cpuinfo.h (enum processor_features): Add FEATURE_USER_MSR. * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for usermsr. * config.gcc: Add usermsrintrin.h * config/i386/cpuid.h (bit_USER_MSR): New. * config/i386/i386-builtin-types.def: Add DEF_FUNCTION_TYPE (VOID, UINT64, UINT64). * config/i386/i386-builtins.cc (ix86_init_mmx_sse_builtins): Add __builtin_urdmsr and __builtin_uwrmsr. * config/i386/i386-builtins.h (ix86_builtins): Add IX86_BUILTIN_URDMSR and IX86_BUILTIN_UWRMSR. * config/i386/i386-c.cc (ix86_target_macros_internal): Define __USER_MSR__. * config/i386/i386-expand.cc (ix86_expand_builtin): Handle new builtins. * config/i386/i386-isa.def (USER_MSR): Add DEF_PTA(USER_MSR). * config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p): Handle usermsr. * config/i386/i386.md (urdmsr): New define_insn. (uwrmsr): Ditto. * config/i386/i386.opt: Add option -musermsr. * config/i386/x86gprintrin.h: Include usermsrintrin.h * doc/extend.texi: Document usermsr. * doc/invoke.texi: Document -musermsr. * doc/sourcebuild.texi: Document target usermsr. * config/i386/usermsrintrin.h: New file. gcc/testsuite/ChangeLog: * gcc.target/i386/funcspec-56.inc: Add new target attribute. * gcc.target/i386/x86gprintrin-1.c: Add -musermsr for 64bit target. * gcc.target/i386/x86gprintrin-2.c: Ditto. * gcc.target/i386/x86gprintrin-3.c: Ditto. * gcc.target/i386/x86gprintrin-4.c: Add musermsr for 64bit target. * gcc.target/i386/x86gprintrin-5.c: Ditto * gcc.target/i386/usermsr-1.c: New test. * gcc.target/i386/usermsr-2.c: Ditto. 
--- gcc/common/config/i386/cpuinfo.h | 2 + gcc/common/config/i386/i386-common.cc | 15 + gcc/common/config/i386/i386-cpuinfo.h | 1 + gcc/common/config/i386/i386-isas.h| 1 + gcc/config.gcc| 3 +- gcc/config/i386/cpuid.h | 1 + gcc/config/i386/i386-builtin-types.def| 3 + gcc/config/i386/i386-builtins.cc | 8 +++ gcc/config/i386/i386-builtins.h | 2 + gcc/config/i386/i386-c.cc | 2 + gcc/config/i386/i386-expand.cc| 35 +++ gcc/config/i386/i386-isa.def | 1 + gcc/config/i386/i386-options.cc | 4 +- gcc/config/i386/i386.md | 24 gcc/config/i386/i386.opt | 4 ++ gcc/config/i386/usermsrintrin.h | 60 +++ gcc/config/i386/x86gprintrin.h| 2 + gcc/doc/extend.texi | 5 ++ gcc/doc/invoke.texi | 6 +- gcc/doc/sourcebuild.texi | 3 + gcc/testsuite/gcc.target/i386/funcspec-56.inc | 2 + gcc/testsuite/gcc.target/i386/user_msr-1.c| 20 +++ gcc/testsuite/gcc.target/i386/user_msr-2.c| 16 + .../gcc.target/i386/x86gprintrin-1.c | 2 +- .../gcc.target/i386/x86gprintrin-2.c | 6 +- .../gcc.target/i386/x86gprintrin-3.c | 28 - .../gcc.target/i386/x86gprintrin-4.c | 32 +- .../gcc.target/i386/x86gprintrin-5.c | 6 +- 28 files changed, 286 insertions(+), 8 deletions(-) create mode 100644 gcc/config/i386/usermsrintrin.h create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-1.c create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-2.c diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 141d3743316..0f86b44730b 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -838,6 +838,8 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_IBT); if (edx & bit_UINTR) set_feature (FEATURE_UINTR); + if (edx & bit_USER_MSR) + set_feature (FEATURE_USER_MSR); if (amx_usable) { if (edx & bit_AMX_TILE) diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 684b0451bb3..13e423deceb 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -125,6 
+125,7 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA2_SM4_SET OPTION_MASK_ISA2_SM4 #define OPTION_MASK_ISA2_APX_F_SET OPTION_MASK_ISA2_APX_F #define OPTION_MASK_ISA2_EVEX512_SET OPTION_MASK_ISA2_EVEX512 +#define
RE: [PATCH 00/18] Support -mevex512 for AVX512
Hi, Thanks for your reply. I'd like to verify that our understanding of your requirements is correct, and that __EVEX256__ can be considered a default macro to determine whether the compiler supports the __EVEX***__ series of switches. For example: I have a segment of code like: #if defined(__EVEX512__): __mm512.*__; #else __mm256.*__; #endif But __EVEX512__ being undefined doesn't mean I only need 256 bit; maybe I am using gcc-13, so I can still use 512 bit. So the code should be: #if defined(__EVEX512__): __mm512.*__; #elif defined(__EVEX256__): __mm256.*__; #else __mm512.*__; #endif If we understand correctly, we'll consider the request. But since we're about to have a vacation, follow-up replies may be a bit slower. BRs, Lin -Original Message- From: ZiNgA BuRgA Sent: Thursday, September 28, 2023 8:32 AM To: Hu, Lin1 ; gcc-patches@gcc.gnu.org Subject: Re: [PATCH 00/18] Support -mevex512 for AVX512 Thanks for the new patch! I see that there's a new __EVEX512__ define. Will there be some __EVEX256__ (or maybe some max EVEX width) define, so that code can detect whether the compiler supports AVX10.1/256 without resorting to version checks?
[PATCH 11/18] [PATCH 5/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-builtin.def (BDESC): Add OPTION_MASK_ISA2_EVEX512. --- gcc/config/i386/i386-builtin.def | 156 +++ 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 8250e2998cd..b90d5ccc969 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -1568,9 +1568,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_copysignv8df3 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF) BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) V32HF_FTYPE_V32HF_ROUND) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) V32HF_FTYPE_V32HF_ROUND) +BDESC (0, 
OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND) BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) V16SF_FTYPE_V16SF_ROUND) BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) V16SF_FTYPE_V16SF_ROUND) BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) V16SF_FTYPE_V16SF_ROUND) @@ -2874,40 +2874,40 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, "__builtin_ia32_cvtbf2sf /* AVX512FP16. */ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv16hf3_mask, "__builtin_ia32_addph256_mask", IX86_BUILTIN_ADDPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, "__builtin_ia32_addph512_mask", IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, CODE_FOR_addv32hf3_mask, "__builtin_ia32_addph512_mask", IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv8hf3_mask, "__builtin_ia32_subph128_mask", IX86_BUILTIN_SUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv16hf3_mask, 
"__builtin_ia32_subph256_mask", IX86_BUILTIN_SUBPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, "__builtin_ia32_subph512_mask", IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) +BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, CODE_FOR_subv32hf3_mask, "__builtin_ia32_subph512_mask", IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv8hf3_mask, "__builtin_ia32_mulph128_mask", IX86_BUILTIN_MULPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_mulv16hf3_mask, "__builtin_ia32_mulph256_mask", IX86_BUILTIN_MULPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI) -BDESC (0, OPTION_MASK_ISA2_AVX512FP16,
[PATCH 12/18] Disable zmm register and 512 bit libmvec call when !TARGET_EVEX512
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_broadcast_from_constant): Disable zmm broadcast for !TARGET_EVEX512. * config/i386/i386-options.cc (ix86_option_override_internal): Do not use PVW_512 when no-evex512. (ix86_simd_clone_adjust): Add evex512 target into string. * config/i386/i386.cc (type_natural_mode): Report ABI warning when using zmm register w/o evex512. (ix86_return_in_memory): Do not allow zmm when !TARGET_EVEX512. (ix86_hard_regno_mode_ok): Ditto. (ix86_set_reg_reg_cost): Ditto. (ix86_rtx_costs): Ditto. (ix86_vector_mode_supported_p): Ditto. (ix86_preferred_simd_mode): Ditto. (ix86_get_mask_mode): Ditto. (ix86_simd_clone_compute_vecsize_and_simdlen): Disable 512 bit libmvec call when !TARGET_EVEX512. (ix86_simd_clone_usable): Ditto. * config/i386/i386.h (BIGGEST_ALIGNMENT): Disable 512 alignment when !TARGET_EVEX512 (MOVE_MAX): Do not use PVW_512 when !TARGET_EVEX512. (STORE_MAX_PIECES): Ditto. --- gcc/config/i386/i386-expand.cc | 1 + gcc/config/i386/i386-options.cc | 14 + gcc/config/i386/i386.cc | 53 ++--- gcc/config/i386/i386.h | 7 +++-- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index e42ff27c6ef..6eedcb384c0 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -611,6 +611,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op) avx512 embed broadcast is available. 
*/ if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT && (!TARGET_AVX512F + || (GET_MODE_SIZE (mode) == 64 && !TARGET_EVEX512) || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) return nullptr; diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index a1a7a92da9f..e2a90d7d9e2 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -2845,7 +2845,8 @@ ix86_option_override_internal (bool main_args_p, opts->x_ix86_move_max = opts->x_prefer_vector_width_type; if (opts_set->x_ix86_move_max == PVW_NONE) { - if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags) + && TARGET_EVEX512_P (opts->x_ix86_isa_flags2)) opts->x_ix86_move_max = PVW_AVX512; else opts->x_ix86_move_max = PVW_AVX128; @@ -2866,7 +2867,8 @@ ix86_option_override_internal (bool main_args_p, opts->x_ix86_store_max = opts->x_prefer_vector_width_type; if (opts_set->x_ix86_store_max == PVW_NONE) { - if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags) + && TARGET_EVEX512_P (opts->x_ix86_isa_flags2)) opts->x_ix86_store_max = PVW_AVX512; else opts->x_ix86_store_max = PVW_AVX128; @@ -3145,13 +3147,13 @@ ix86_simd_clone_adjust (struct cgraph_node *node) case 'e': if (TARGET_PREFER_AVX256) { - if (!TARGET_AVX512F) - str = "avx512f,prefer-vector-width=512"; + if (!TARGET_AVX512F || !TARGET_EVEX512) + str = "avx512f,evex512,prefer-vector-width=512"; else str = "prefer-vector-width=512"; } - else if (!TARGET_AVX512F) - str = "avx512f"; + else if (!TARGET_AVX512F || !TARGET_EVEX512) + str = "avx512f,evex512"; break; default: gcc_unreachable (); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 477e6cecc38..0df3bf10547 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -1924,7 +1924,8 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) && GET_MODE_INNER (mode) == 
innermode) { - if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) + if (size == 64 && (!TARGET_AVX512F || !TARGET_EVEX512) + && !TARGET_IAMCU) { static bool warnedavx512f; static bool warnedavx512f_ret; @@ -4347,7 +4348,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) /* AVX512F values are returned in ZMM0 if available. */ if (size == 64) - return !TARGET_AVX512F; + return !TARGET_AVX512F || !TARGET_EVEX512; } if (mode == XFmode) @@ -20286,7 +20287,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) - any of 512-bit wide vector mode - any scalar mode. */ if (TARGET_AVX512F - && (VALID_AVX512F_REG_OR_XI_MODE (mode) +
[PATCH 13/18] Support -mevex512 for AVX512F intrins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-builtins.cc (ix86_vectorize_builtin_gather): Disable 512 bit gather when !TARGET_EVEX512. * config/i386/i386-expand.cc (ix86_valid_mask_cmp_mode): Add TARGET_EVEX512. (ix86_expand_int_sse_cmp): Ditto. (ix86_expand_vector_init_one_nonzero): Disable subroutine when !TARGET_EVEX512. (ix86_emit_swsqrtsf): Add TARGET_EVEX512. (ix86_vectorize_vec_perm_const): Disable subroutine when !TARGET_EVEX512. * config/i386/i386.cc (standard_sse_constant_p): Add TARGET_EVEX512. (standard_sse_constant_opcode): Ditto. (ix86_get_ssemov): Ditto. (ix86_legitimate_constant_p): Ditto. (ix86_vectorize_builtin_scatter): Disable 512 bit scatter when !TARGET_EVEX512. * config/i386/i386.md (avx512f_512): New. (movxi): Add TARGET_EVEX512. (*movxi_internal_avx512f): Ditto. (*movdi_internal): Change alternative 12 to ?Yv. Adjust mode for alternative 13. (*movsi_internal): Change alternative 8 to ?Yv. Adjust mode for alternative 9. (*movhi_internal): Change alternative 11 to *Yv. (*movdf_internal): Change alternative 12 to Yv. (*movsf_internal): Change alternative 5 to Yv. Adjust mode for alternative 5 and 6. (*mov_internal): Change alternative 4 to Yv. (define_split for convert SF to DF): Add TARGET_EVEX512. (extendbfsf2_1): Ditto. * config/i386/predicates.md (bcst_mem_operand): Disable predicate for 512 bit when !TARGET_EVEX512. * config/i386/sse.md (VMOVE): Add TARGET_EVEX512. (V48_AVX512VL): Ditto. (V48_256_512_AVX512VL): Ditto. (V48H_AVX512VL): Ditto. (VI12_AVX512VL): Ditto. (V): Ditto. (V_512): Ditto. (V_256_512): Ditto. (VF): Ditto. (VF1_VF2_AVX512DQ): Ditto. (VFH): Ditto. (VFB): Ditto. (VF1): Ditto. (VF1_AVX2): Ditto. (VF2): Ditto. (VF2H): Ditto. (VF2_512_256): Ditto. (VF2_512_256VL): Ditto. (VF_512): Ditto. (VFB_512): Ditto. (VI48_AVX512VL): Ditto. (VI1248_AVX512VLBW): Ditto. (VF_AVX512VL): Ditto. (VFH_AVX512VL): Ditto. (VF1_AVX512VL): Ditto. (VI): Ditto. (VIHFBF): Ditto. (VI_AVX2): Ditto. (VI8): Ditto. (VI8_AVX512VL): Ditto. 
(VI2_AVX512F): Ditto. (VI4_AVX512F): Ditto. (VI4_AVX512VL): Ditto. (VI48_AVX512F_AVX512VL): Ditto. (VI8_AVX2_AVX512F): Ditto. (VI8_AVX_AVX512F): Ditto. (V8FI): Ditto. (V16FI): Ditto. (VI124_AVX2_24_AVX512F_1_AVX512BW): Ditto. (VI248_AVX512VLBW): Ditto. (VI248_AVX2_8_AVX512F_24_AVX512BW): Ditto. (VI248_AVX512BW): Ditto. (VI248_AVX512BW_AVX512VL): Ditto. (VI48_AVX512F): Ditto. (VI48_AVX_AVX512F): Ditto. (VI12_AVX_AVX512F): Ditto. (VI148_512): Ditto. (VI124_256_AVX512F_AVX512BW): Ditto. (VI48_512): Ditto. (VI_AVX512BW): Ditto. (VIHFBF_AVX512BW): Ditto. (VI4F_256_512): Ditto. (VI48F_256_512): Ditto. (VI48F): Ditto. (VI12_VI48F_AVX512VL): Ditto. (V32_512): Ditto. (AVX512MODE2P): Ditto. (STORENT_MODE): Ditto. (REDUC_PLUS_MODE): Ditto. (REDUC_SMINMAX_MODE): Ditto. (*andnot3): Change isa attribute to avx512f_512. (*andnot3): Ditto. (3): Ditto. (tf3): Ditto. (FMAMODEM): Add TARGET_EVEX512. (FMAMODE_AVX512): Ditto. (VFH_SF_AVX512VL): Ditto. (avx512f_fix_notruncv16sfv16si): Ditto. (fix_truncv16sfv16si2): Ditto. (avx512f_cvtdq2pd512_2): Ditto. (avx512f_cvtpd2dq512): Ditto. (fix_truncv8dfv8si2): Ditto. (avx512f_cvtpd2ps512): Ditto. (vec_unpacks_lo_v16sf): Ditto. (vec_unpacks_hi_v16sf): Ditto. (vec_unpacks_float_hi_v16si): Ditto. (vec_unpacks_float_lo_v16si): Ditto. (vec_unpacku_float_hi_v16si): Ditto. (vec_unpacku_float_lo_v16si): Ditto. (vec_pack_sfix_trunc_v8df): Ditto. (avx512f_vec_pack_sfix_v8df): Ditto. (avx512f_unpckhps512): Ditto. (avx512f_unpcklps512): Ditto. (avx512f_movshdup512): Ditto. (avx512f_movsldup512): Ditto. (AVX512_VEC): Ditto. (AVX512_VEC_2): Ditto. (vec_extract_lo_v64qi): Ditto. (vec_extract_hi_v64qi): Ditto. (VEC_EXTRACT_MODE): Ditto. (avx512f_unpckhpd512): Ditto. (avx512f_movddup512): Ditto. (avx512f_unpcklpd512): Ditto. (*_vternlog_all): Ditto. (*_vpternlog_1): Ditto. (*_vpternlog_2): Ditto. (*_vpternlog_3): Ditto. (avx512f_shufps512_mask): Ditto. (avx512f_shufps512_1): Ditto.
[PATCH 04/18] [PATCH 3/5] Push evex512 target for 512 bit intrins
From: Haochen Jiang gcc/ChangeLog: * config/i386/avx512bwintrin.h: Add evex512 target for 512 bit intrins. --- gcc/config/i386/avx512bwintrin.h | 291 --- 1 file changed, 153 insertions(+), 138 deletions(-) diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h index d1cd549ce18..925bae1457c 100644 --- a/gcc/config/i386/avx512bwintrin.h +++ b/gcc/config/i386/avx512bwintrin.h @@ -34,16 +34,6 @@ #define __DISABLE_AVX512BW__ #endif /* __AVX512BW__ */ -/* Internal data types for implementing the intrinsics. */ -typedef short __v32hi __attribute__ ((__vector_size__ (64))); -typedef short __v32hi_u __attribute__ ((__vector_size__ (64), \ - __may_alias__, __aligned__ (1))); -typedef char __v64qi __attribute__ ((__vector_size__ (64))); -typedef char __v64qi_u __attribute__ ((__vector_size__ (64), \ - __may_alias__, __aligned__ (1))); - -typedef unsigned long long __mmask64; - extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) @@ -54,229 +44,292 @@ _ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B) { - *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B); - return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); } extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B) { - return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); + return (unsigned char) __builtin_ia32_ktestcsi (__A, __B); } extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
-_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) { - return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); + *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); } extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B) { - return (unsigned char) __builtin_ia32_ktestcsi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); } extern __inline unsigned char __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B) { - return (unsigned char) __builtin_ia32_ktestcdi (__A, __B); + return (unsigned char) __builtin_ia32_kortestcsi (__A, __B); } -extern __inline unsigned char +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) +_kadd_mask32 (__mmask32 __A, __mmask32 __B) { - *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B); - return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); + return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B); } -extern __inline unsigned char +extern __inline unsigned int __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +_cvtmask32_u32 (__mmask32 __A) { - *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B); - return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); + return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A); } -extern __inline unsigned char +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestz_mask32_u8 
(__mmask32 __A, __mmask32 __B) +_cvtu32_mask32 (unsigned int __A) { - return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); + return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A); } -extern __inline unsigned char +extern __inline __mmask32 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +_load_mask32 (__mmask32 *__A) { - return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); + return (__mmask32) __builtin_ia32_kmovd (*__A); } -extern __inline unsigned char +extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +_store_mask32 (__mmask32 *__A, __mmask32 __B) { - return (unsigned char)
[PATCH 15/18] Support -mevex512 for AVX512BW intrins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate): Make sure there is EVEX512 enabled. (ix86_expand_vecop_qihi2): Refuse V32QI->V32HI when no EVEX512. * config/i386/i386.cc (ix86_hard_regno_mode_ok): Disable 64 bit mask when !TARGET_EVEX512. * config/i386/i386.md (avx512bw_512): New. (SWI1248_AVX512BWDQ_64): Add TARGET_EVEX512. (*zero_extendsidi2): Change isa to avx512bw_512. (kmov_isa): Ditto. (*anddi_1): Ditto. (*andn_1): Change isa to kmov_isa. (*_1): Ditto. (*notxor_1): Ditto. (*one_cmpl2_1): Ditto. (*one_cmplsi2_1_zext): Change isa to avx512bw_512. (*ashl3_1): Change isa to kmov_isa. (*lshr3_1): Ditto. * config/i386/sse.md (VI12HFBF_AVX512VL): Add TARGET_EVEX512. (VI1248_AVX512VLBW): Ditto. (VHFBF_AVX512VL): Ditto. (VI): Ditto. (VIHFBF): Ditto. (VI_AVX2): Ditto. (VI1_AVX512): Ditto. (VI12_256_512_AVX512VL): Ditto. (VI2_AVX2_AVX512BW): Ditto. (VI2_AVX512VNNIBW): Ditto. (VI2_AVX512VL): Ditto. (VI2HFBF_AVX512VL): Ditto. (VI8_AVX2_AVX512BW): Ditto. (VIMAX_AVX2_AVX512BW): Ditto. (VIMAX_AVX512VL): Ditto. (VI12_AVX2_AVX512BW): Ditto. (VI124_AVX2_24_AVX512F_1_AVX512BW): Ditto. (VI248_AVX512VL): Ditto. (VI248_AVX512VLBW): Ditto. (VI248_AVX2_8_AVX512F_24_AVX512BW): Ditto. (VI248_AVX512BW): Ditto. (VI248_AVX512BW_AVX512VL): Ditto. (VI248_512): Ditto. (VI124_256_AVX512F_AVX512BW): Ditto. (VI_AVX512BW): Ditto. (VIHFBF_AVX512BW): Ditto. (SWI1248_AVX512BWDQ): Ditto. (SWI1248_AVX512BW): Ditto. (SWI1248_AVX512BWDQ2): Ditto. (*knotsi_1_zext): Ditto. (define_split for zero_extend + not): Ditto. (kunpckdi): Ditto. (REDUC_SMINMAX_MODE): Ditto. (VEC_EXTRACT_MODE): Ditto. (*avx512bw_permvar_truncv16siv16hi_1): Ditto. (*avx512bw_permvar_truncv16siv16hi_1_hf): Ditto. (truncv32hiv32qi2): Ditto. (avx512bw_v32hiv32qi2): Ditto. (avx512bw_v32hiv32qi2_mask): Ditto. (avx512bw_v32hiv32qi2_mask_store): Ditto. (usadv64qi): Ditto. (VEC_PERM_AVX2): Ditto. (AVX512ZEXTMASK): Ditto. (SWI24_MASK): New. (vec_pack_trunc_): Change iterator to SWI24_MASK. 
(avx512bw_packsswb): Add TARGET_EVEX512. (avx512bw_packssdw): Ditto. (avx512bw_interleave_highv64qi): Ditto. (avx512bw_interleave_lowv64qi): Ditto. (avx512bw_pshuflwv32hi): Ditto. (avx512bw_pshufhwv32hi): Ditto. (vec_unpacks_lo_di): Ditto. (SWI48x_MASK): New. (vec_unpacks_hi_): Change iterator to SWI48x_MASK. (avx512bw_umulhrswv32hi3): Add TARGET_EVEX512. (VI1248_AVX512VL_AVX512BW): Ditto. (avx512bw_v32qiv32hi2): Ditto. (*avx512bw_zero_extendv32qiv32hi2_1): Ditto. (*avx512bw_zero_extendv32qiv32hi2_2): Ditto. (v32qiv32hi2): Ditto. (pbroadcast_evex_isa): Change isa attribute to avx512bw_512. (VPERMI2): Add TARGET_EVEX512. (VPERMI2I): Ditto. --- gcc/config/i386/i386-expand.cc | 3 +- gcc/config/i386/i386.cc| 4 +- gcc/config/i386/i386.md| 54 - gcc/config/i386/sse.md | 193 ++--- 4 files changed, 128 insertions(+), 126 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 063561e1265..ff2423f91ed 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -15617,6 +15617,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, case E_V32HFmode: case E_V32BFmode: case E_V64QImode: + gcc_assert (TARGET_EVEX512); if (TARGET_AVX512BW) return ix86_vector_duplicate_value (mode, target, val); else @@ -23512,7 +23513,7 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2) bool uns_p = code != ASHIFTRT; if ((qimode == V16QImode && !TARGET_AVX2) - || (qimode == V32QImode && !TARGET_AVX512BW) + || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512)) /* There are no V64HImode instructions. 
*/ || qimode == V64QImode) return false; diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 589b29a324d..03c96ff048d 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -20308,8 +20308,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) return MASK_PAIR_REGNO_P(regno); return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode)) - || (TARGET_AVX512BW - && VALID_MASK_AVX512BW_MODE (mode))); + || (TARGET_AVX512BW && mode == SImode) + || (TARGET_AVX512BW && TARGET_EVEX512
[PATCH 10/18] [PATCH 4/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-builtin.def (BDESC): Add OPTION_MASK_ISA2_EVEX512. --- gcc/config/i386/i386-builtin.def | 188 +++ 1 file changed, 94 insertions(+), 94 deletions(-) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 167d530a537..8250e2998cd 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -299,8 +299,8 @@ BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_sto BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI) /* AVX512VP2INTERSECT */ -BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) VOID_FTYPE_PUHI_PUHI_V16SI_V16SI) -BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectq512", IX86_BUILTIN_2INTERSECTQ512, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8DI_V8DI) +BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, CODE_FOR_nothing, "__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) VOID_FTYPE_PUHI_PUHI_V16SI_V16SI) +BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, CODE_FOR_nothing, "__builtin_ia32_2intersectq512", IX86_BUILTIN_2INTERSECTQ512, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8DI_V8DI) BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectd256", IX86_BUILTIN_2INTERSECTD256, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8SI_V8SI) BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectq256", IX86_BUILTIN_2INTERSECTQ256, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V4DI_V4DI) BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectd128", IX86_BUILTIN_2INTERSECTD128, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V4SI_V4SI) @@ -430,17 +430,17 @@ BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru, "__builtin_ia32_rdpkru", IX86_B BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru, "__builtin_ia32_wrpkru", IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED) /* VBMI2 */ -BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev64qi_mask, "__builtin_ia32_compressstoreuqi512_mask", IX86_BUILTIN_PCOMPRESSBSTORE512, UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI) -BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev32hi_mask, "__builtin_ia32_compressstoreuhi512_mask", IX86_BUILTIN_PCOMPRESSWSTORE512, UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI) +BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, CODE_FOR_compressstorev64qi_mask, "__builtin_ia32_compressstoreuqi512_mask", IX86_BUILTIN_PCOMPRESSBSTORE512, UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI) +BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, CODE_FOR_compressstorev32hi_mask, "__builtin_ia32_compressstoreuhi512_mask", IX86_BUILTIN_PCOMPRESSWSTORE512, UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI) BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev32qi_mask, "__builtin_ia32_compressstoreuqi256_mask", IX86_BUILTIN_PCOMPRESSBSTORE256, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32QI_USI) BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev16qi_mask, "__builtin_ia32_compressstoreuqi128_mask", IX86_BUILTIN_PCOMPRESSBSTORE128, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16QI_UHI) BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev16hi_mask, "__builtin_ia32_compressstoreuhi256_mask", IX86_BUILTIN_PCOMPRESSWSTORE256, UNKNOWN, (int) VOID_FTYPE_PV16HI_V16HI_UHI) BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_compressstorev8hi_mask, "__builtin_ia32_compressstoreuhi128_mask", IX86_BUILTIN_PCOMPRESSWSTORE128, UNKNOWN, (int) VOID_FTYPE_PV8HI_V8HI_UQI) -BDESC 
(OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandloadqi512_mask", IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V64QI_UDI) -BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_maskz, "__builtin_ia32_expandloadqi512_maskz", IX86_BUILTIN_PEXPANDBLOAD512Z, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V64QI_UDI) -BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_mask, "__builtin_ia32_expandloadhi512_mask", IX86_BUILTIN_PEXPANDWLOAD512, UNKNOWN, (int) V32HI_FTYPE_PCV32HI_V32HI_USI) -BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_maskz, "__builtin_ia32_expandloadhi512_maskz", IX86_BUILTIN_PEXPANDWLOAD512Z, UNKNOWN, (int) V32HI_FTYPE_PCV32HI_V32HI_USI) +BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandloadqi512_mask", IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, (int)
[PATCH 05/18] [PATCH 4/5] Push evex512 target for 512 bit intrins
From: Haochen Jiang gcc/ChangeLog: * config.gcc: Add avx512bitalgvlintrin.h. * config/i386/avx5124fmapsintrin.h: Add evex512 target for 512 bit intrins. * config/i386/avx5124vnniwintrin.h: Ditto. * config/i386/avx512bf16intrin.h: Ditto. * config/i386/avx512bitalgintrin.h: Add evex512 target for 512 bit intrins. Split 128/256 bit intrins to avx512bitalgvlintrin.h. * config/i386/avx512erintrin.h: Add evex512 target for 512 bit intrins. * config/i386/avx512ifmaintrin.h: Ditto. * config/i386/avx512pfintrin.h: Ditto. * config/i386/avx512vbmi2intrin.h: Ditto. * config/i386/avx512vbmiintrin.h: Ditto. * config/i386/avx512vnniintrin.h: Ditto. * config/i386/avx512vp2intersectintrin.h: Ditto. * config/i386/avx512vpopcntdqintrin.h: Ditto. * config/i386/gfniintrin.h: Ditto. * config/i386/immintrin.h: Add avx512bitalgvlintrin.h. * config/i386/vaesintrin.h: Add evex512 target for 512 bit intrins. * config/i386/vpclmulqdqintrin.h: Ditto. * config/i386/avx512bitalgvlintrin.h: New. --- gcc/config.gcc | 19 +-- gcc/config/i386/avx5124fmapsintrin.h | 2 +- gcc/config/i386/avx5124vnniwintrin.h | 2 +- gcc/config/i386/avx512bf16intrin.h | 31 ++-- gcc/config/i386/avx512bitalgintrin.h | 155 +- gcc/config/i386/avx512bitalgvlintrin.h | 180 + gcc/config/i386/avx512erintrin.h | 2 +- gcc/config/i386/avx512ifmaintrin.h | 4 +- gcc/config/i386/avx512pfintrin.h | 2 +- gcc/config/i386/avx512vbmi2intrin.h| 4 +- gcc/config/i386/avx512vbmiintrin.h | 4 +- gcc/config/i386/avx512vnniintrin.h | 4 +- gcc/config/i386/avx512vp2intersectintrin.h | 4 +- gcc/config/i386/avx512vpopcntdqintrin.h| 4 +- gcc/config/i386/gfniintrin.h | 76 + gcc/config/i386/immintrin.h| 2 + gcc/config/i386/vaesintrin.h | 4 +- gcc/config/i386/vpclmulqdqintrin.h | 4 +- 18 files changed, 282 insertions(+), 221 deletions(-) create mode 100644 gcc/config/i386/avx512bitalgvlintrin.h diff --git a/gcc/config.gcc b/gcc/config.gcc index ce5def08e2e..e47e6893e1d 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -425,15 +425,16 @@ i[34567]86-*-* | 
x86_64-*-*) avx512vbmi2vlintrin.h avx512vnniintrin.h avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h avx512vpopcntdqvlintrin.h avx512bitalgintrin.h - pconfigintrin.h wbnoinvdintrin.h movdirintrin.h - waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h - avx512bf16intrin.h enqcmdintrin.h serializeintrin.h - avx512vp2intersectintrin.h avx512vp2intersectvlintrin.h - tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h - amxbf16intrin.h x86gprintrin.h uintrintrin.h - hresetintrin.h keylockerintrin.h avxvnniintrin.h - mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h - avxifmaintrin.h avxvnniint8intrin.h avxneconvertintrin.h + avx512bitalgvlintrin.h pconfigintrin.h wbnoinvdintrin.h + movdirintrin.h waitpkgintrin.h cldemoteintrin.h + avx512bf16vlintrin.h avx512bf16intrin.h enqcmdintrin.h + serializeintrin.h avx512vp2intersectintrin.h + avx512vp2intersectvlintrin.h tsxldtrkintrin.h + amxtileintrin.h amxint8intrin.h amxbf16intrin.h + x86gprintrin.h uintrintrin.h hresetintrin.h + keylockerintrin.h avxvnniintrin.h mwaitintrin.h + avx512fp16intrin.h avx512fp16vlintrin.h avxifmaintrin.h + avxvnniint8intrin.h avxneconvertintrin.h cmpccxaddintrin.h amxfp16intrin.h prfchiintrin.h raointintrin.h amxcomplexintrin.h avxvnniint16intrin.h sm3intrin.h sha512intrin.h sm4intrin.h" diff --git a/gcc/config/i386/avx5124fmapsintrin.h b/gcc/config/i386/avx5124fmapsintrin.h index 97dd77c9235..4c884a5c203 100644 --- a/gcc/config/i386/avx5124fmapsintrin.h +++ b/gcc/config/i386/avx5124fmapsintrin.h @@ -30,7 +30,7 @@ #ifndef __AVX5124FMAPS__ #pragma GCC push_options -#pragma GCC target("avx5124fmaps") +#pragma GCC target("avx5124fmaps,evex512") #define __DISABLE_AVX5124FMAPS__ #endif /* __AVX5124FMAPS__ */ diff --git a/gcc/config/i386/avx5124vnniwintrin.h b/gcc/config/i386/avx5124vnniwintrin.h index fd129589798..795e4814f28 100644 --- a/gcc/config/i386/avx5124vnniwintrin.h +++ b/gcc/config/i386/avx5124vnniwintrin.h @@ -30,7 +30,7 @@ #ifndef __AVX5124VNNIW__ #pragma GCC push_options -#pragma 
GCC
[PATCH 07/18] [PATCH 1/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-builtin.def (BDESC): Add OPTION_MASK_ISA2_EVEX512. * config/i386/i386-builtins.cc (ix86_init_mmx_sse_builtins): Ditto. --- gcc/config/i386/i386-builtin.def | 648 +++ gcc/config/i386/i386-builtins.cc | 72 ++-- 2 files changed, 372 insertions(+), 348 deletions(-) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 8738b3b6a8a..0cc526383db 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -200,53 +200,53 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskstored256, "__builtin_ia32_mas BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI) /* AVX512F */ -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16sf_mask, "__builtin_ia32_compressstoresf512_mask", IX86_BUILTIN_COMPRESSPSSTORE512, UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16si_mask, "__builtin_ia32_compressstoresi512_mask", IX86_BUILTIN_PCOMPRESSDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8df_mask, "__builtin_ia32_compressstoredf512_mask", IX86_BUILTIN_COMPRESSPDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8di_mask, "__builtin_ia32_compressstoredi512_mask", IX86_BUILTIN_PCOMPRESSQSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16sf_mask, "__builtin_ia32_expandloadsf512_mask", IX86_BUILTIN_EXPANDPSLOAD512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16sf_maskz, "__builtin_ia32_expandloadsf512_maskz", IX86_BUILTIN_EXPANDPSLOAD512Z, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, 
CODE_FOR_expandv16si_mask, "__builtin_ia32_expandloadsi512_mask", IX86_BUILTIN_PEXPANDDLOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16si_maskz, "__builtin_ia32_expandloadsi512_maskz", IX86_BUILTIN_PEXPANDDLOAD512Z, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8df_mask, "__builtin_ia32_expandloaddf512_mask", IX86_BUILTIN_EXPANDPDLOAD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expandloaddf512_maskz", IX86_BUILTIN_EXPANDPDLOAD512Z, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8di_mask, "__builtin_ia32_expandloaddi512_mask", IX86_BUILTIN_PEXPANDQLOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8di_maskz, "__builtin_ia32_expandloaddi512_maskz", IX86_BUILTIN_PEXPANDQLOAD512Z, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16si_mask, "__builtin_ia32_loaddqusi512_mask", IX86_BUILTIN_LOADDQUSI512, UNKNOWN, (int) V16SI_FTYPE_PCINT_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8di_mask, "__builtin_ia32_loaddqudi512_mask", IX86_BUILTIN_LOADDQUDI512, UNKNOWN, (int) V8DI_FTYPE_PCINT64_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8df_mask, "__builtin_ia32_loadupd512_mask", IX86_BUILTIN_LOADUPD512, UNKNOWN, (int) V8DF_FTYPE_PCDOUBLE_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16sf_mask, "__builtin_ia32_loadups512_mask", IX86_BUILTIN_LOADUPS512, UNKNOWN, (int) V16SF_FTYPE_PCFLOAT_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16sf_mask, "__builtin_ia32_loadaps512_mask", IX86_BUILTIN_LOADAPS512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16si_mask, 
"__builtin_ia32_movdqa32load512_mask", IX86_BUILTIN_MOVDQA32LOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8df_mask, "__builtin_ia32_loadapd512_mask", IX86_BUILTIN_LOADAPD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8di_mask, "__builtin_ia32_movdqa64load512_mask", IX86_BUILTIN_MOVDQA64LOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv16sf, "__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V16SF) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv8df, "__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V8DF) -BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv8di, "__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN,
[PATCH 17/18] Support -mevex512 for AVX512FP16 intrins
From: Haochen Jiang gcc/ChangeLog: * config/i386/sse.md (V48H_AVX512VL): Add TARGET_EVEX512. (VFH): Ditto. (VF2H): Ditto. (VFH_AVX512VL): Ditto. (VHFBF): Ditto. (VHF_AVX512VL): Ditto. (VI2H_AVX512VL): Ditto. (VI2F_256_512): Ditto. (VF48_I1248): Remove unused iterator. (VF48H_AVX512VL): Add TARGET_EVEX512. (VF_AVX512): Remove unused iterator. (REDUC_PLUS_MODE): Add TARGET_EVEX512. (REDUC_SMINMAX_MODE): Ditto. (FMAMODEM): Ditto. (VFH_SF_AVX512VL): Ditto. (VEC_PERM_AVX2): Ditto. Co-authored-by: Hu, Lin1 --- gcc/config/i386/sse.md | 44 -- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a5a95b9de66..25d53e15dce 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -280,7 +280,7 @@ (define_mode_iterator V48H_AVX512VL [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") (V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL") - (V32HF "TARGET_AVX512FP16") + (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") @@ -355,7 +355,7 @@ (V2DF "TARGET_AVX512DQ && TARGET_AVX512VL")]) (define_mode_iterator VFH - [(V32HF "TARGET_AVX512FP16") + [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF @@ -401,7 +401,7 @@ ;; All DFmode & HFmode vector float modes (define_mode_iterator VF2H - [(V32HF "TARGET_AVX512FP16") + [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX") V2DF]) @@ -463,7 +463,7 @@ [(V16SF "TARGET_AVX512ER") (V8SF "TARGET_AVX") V4SF]) (define_mode_iterator VFH_AVX512VL - [(V32HF 
"TARGET_AVX512FP16") + [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") @@ -475,12 +475,14 @@ (define_mode_iterator VF1_AVX512VL [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")]) -(define_mode_iterator VHFBF [V32HF V16HF V8HF V32BF V16BF V8BF]) +(define_mode_iterator VHFBF + [(V32HF "TARGET_EVEX512") V16HF V8HF + (V32BF "TARGET_EVEX512") V16BF V8BF]) (define_mode_iterator VHFBF_256 [V16HF V16BF]) (define_mode_iterator VHFBF_128 [V8HF V8BF]) (define_mode_iterator VHF_AVX512VL - [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")]) + [(V32HF "TARGET_EVEX512") (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")]) (define_mode_iterator VHFBF_AVX512VL [(V32HF "TARGET_EVEX512") (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL") @@ -594,9 +596,9 @@ (V8BF "TARGET_AVX512VL") (V16BF "TARGET_AVX512VL") (V32BF "TARGET_EVEX512")]) (define_mode_iterator VI2H_AVX512VL - [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI - (V8SI "TARGET_AVX512VL") V16SI - V8DI ]) + [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") (V32HI "TARGET_EVEX512") + (V8SI "TARGET_AVX512VL") (V16SI "TARGET_EVEX512") + (V8DI "TARGET_EVEX512")]) (define_mode_iterator VI1_AVX512VL_F [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F && TARGET_EVEX512")]) @@ -883,7 +885,10 @@ (V32BF "TARGET_AVX512BW && TARGET_EVEX512")]) ;; Int-float size matches -(define_mode_iterator VI2F_256_512 [V16HI V32HI V16HF V32HF V16BF V32BF]) +(define_mode_iterator VI2F_256_512 + [V16HI (V32HI "TARGET_EVEX512") + V16HF (V32HF "TARGET_EVEX512") + V16BF (V32BF "TARGET_EVEX512")]) (define_mode_iterator VI4F_128 [V4SI V4SF]) (define_mode_iterator VI8F_128 [V2DI V2DF]) (define_mode_iterator VI4F_256 [V8SI V8SF]) @@ -899,10 +90
[PATCH 03/18] [PATCH 2/5] Push evex512 target for 512 bit intrins
From: Haochen Jiang gcc/ChangeLog: * config/i386/avx512dqintrin.h: Add evex512 target for 512 bit intrins. --- gcc/config/i386/avx512dqintrin.h | 1840 +++--- 1 file changed, 926 insertions(+), 914 deletions(-) diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h index 93900a0b5c7..b6a1d499e25 100644 --- a/gcc/config/i386/avx512dqintrin.h +++ b/gcc/config/i386/avx512dqintrin.h @@ -184,1275 +184,1426 @@ _kandn_mask8 (__mmask8 __A, __mmask8 __B) return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512d -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_f64x2 (__m128d __A) -{ - return (__m512d) -__builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, -_mm512_undefined_pd (), -(__mmask8) -1); -} - -extern __inline __m512d +#ifdef __OPTIMIZE__ +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A) +_kshiftli_mask8 (__mmask8 __A, unsigned int __B) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) - __A, - (__v8df) - __O, __M); + return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512d +extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +_kshiftri_mask8 (__mmask8 __A, unsigned int __B) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) - __A, - (__v8df) - _mm512_setzero_ps (), - __M); + return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_i64x2 (__m128i __A) +_mm_reduce_sd (__m128d __A, __m128d __B, int __C) { - return (__m512i) -__builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, -_mm512_undefined_epi32 (), + return 
(__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, +(__v2df) __B, __C, +(__v2df) _mm_setzero_pd (), (__mmask8) -1); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A) +_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) - __A, - (__v8di) - __O, __M); + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); } -extern __inline __m512i +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +_mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) - __A, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, +(__v2df) __B, __C, +(__v2df) __W, +(__mmask8) __U); } -extern __inline __m512 +extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_broadcast_f32x2 (__m128 __A) +_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C, const int __R) { - return
[PATCH 18/18] Allow -mno-evex512 usage
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386.opt: Allow -mno-evex512. gcc/testsuite/ChangeLog: * gcc.target/i386/noevex512-1.c: New test. * gcc.target/i386/noevex512-2.c: Ditto. * gcc.target/i386/noevex512-3.c: Ditto. --- gcc/config/i386/i386.opt| 2 +- gcc/testsuite/gcc.target/i386/noevex512-1.c | 13 + gcc/testsuite/gcc.target/i386/noevex512-2.c | 13 + gcc/testsuite/gcc.target/i386/noevex512-3.c | 13 + 4 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-1.c create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-2.c create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-3.c diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 6d8601b1f75..34fc167af82 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1312,5 +1312,5 @@ Target Alias(mtune-ctrl=, use_scatter, ^use_scatter) Enable vectorization for scatter instruction. mevex512 -Target RejectNegative Mask(ISA2_EVEX512) Var(ix86_isa_flags2) Save +Target Mask(ISA2_EVEX512) Var(ix86_isa_flags2) Save Support 512 bit vector built-in functions and code generation. 
diff --git a/gcc/testsuite/gcc.target/i386/noevex512-1.c b/gcc/testsuite/gcc.target/i386/noevex512-1.c new file mode 100644 index 000..7fd45f15be6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/noevex512-1.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -march=x86-64 -mavx512f -mno-evex512 -Wno-psabi" } */ +/* { dg-final { scan-assembler-not ".%zmm" } } */ + +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +__m512d +foo () +{ + __m512d a, b; + a = a + b; + return a; +} diff --git a/gcc/testsuite/gcc.target/i386/noevex512-2.c b/gcc/testsuite/gcc.target/i386/noevex512-2.c new file mode 100644 index 000..1c206e385d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/noevex512-2.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mavx512bw -mno-evex512" } */ + +#include + +long long +foo (long long c) +{ + register long long a __asm ("k7") = c; + long long b = foo (a); + asm volatile ("" : "+k" (b)); /* { dg-error "inconsistent operand constraints in an 'asm'" } */ + return b; +} diff --git a/gcc/testsuite/gcc.target/i386/noevex512-3.c b/gcc/testsuite/gcc.target/i386/noevex512-3.c new file mode 100644 index 000..10e00c2d61c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/noevex512-3.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -Wno-psabi -mavx512f" } */ +/* { dg-final { scan-assembler-not ".%zmm" } } */ + +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +__attribute__ ((target ("no-evex512"))) __m512d +foo () +{ + __m512d a, b; + a = a + b; + return a; +} -- 2.31.1
[PATCH 14/18] Support -mevex512 for AVX512DQ intrins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_expand_sse2_mulvxdi3): Add TARGET_EVEX512 for 512 bit usage. * config/i386/i386.cc (standard_sse_constant_opcode): Ditto. * config/i386/sse.md (VF1_VF2_AVX512DQ): Ditto. (VF1_128_256VL): Ditto. (VF2_AVX512VL): Ditto. (VI8_256_512): Ditto. (fixuns_trunc2): Ditto. (AVX512_VEC): Ditto. (AVX512_VEC_2): Ditto. (VI4F_BRCST32x2): Ditto. (VI8F_BRCST64x2): Ditto. --- gcc/config/i386/i386-expand.cc | 2 +- gcc/config/i386/i386.cc| 22 -- gcc/config/i386/sse.md | 24 ++-- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 0705e08d38c..063561e1265 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -24008,7 +24008,7 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) machine_mode mode = GET_MODE (op0); rtx t1, t2, t3, t4, t5, t6; - if (TARGET_AVX512DQ && mode == V8DImode) + if (TARGET_AVX512DQ && TARGET_EVEX512 && mode == V8DImode) emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 635dd85e764..589b29a324d 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -5332,9 +5332,14 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) if (EXT_REX_SSE_REG_P (operands[0])) { if (TARGET_AVX512DQ) - return (TARGET_AVX512VL - ? "vxorpd\t%x0, %x0, %x0" - : "vxorpd\t%g0, %g0, %g0"); + { + if (TARGET_AVX512VL) + return "vxorpd\t%x0, %x0, %x0"; + else if (TARGET_EVEX512) + return "vxorpd\t%g0, %g0, %g0"; + else + gcc_unreachable (); + } else { if (TARGET_AVX512VL) @@ -5356,9 +5361,14 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) if (EXT_REX_SSE_REG_P (operands[0])) { if (TARGET_AVX512DQ) - return (TARGET_AVX512VL - ? 
"vxorps\t%x0, %x0, %x0" - : "vxorps\t%g0, %g0, %g0"); + { + if (TARGET_AVX512VL) + return "vxorps\t%x0, %x0, %x0"; + else if (TARGET_EVEX512) + return "vxorps\t%g0, %g0, %g0"; + else + gcc_unreachable (); + } else { if (TARGET_AVX512VL) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 8d1b75b43e0..a8f93ceddc5 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -350,7 +350,8 @@ (define_mode_iterator VF1_VF2_AVX512DQ [(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF - (V8DF "TARGET_AVX512DQ") (V4DF "TARGET_AVX512DQ && TARGET_AVX512VL") + (V8DF "TARGET_AVX512DQ && TARGET_EVEX512") + (V4DF "TARGET_AVX512DQ && TARGET_AVX512VL") (V2DF "TARGET_AVX512DQ && TARGET_AVX512VL")]) (define_mode_iterator VFH @@ -392,7 +393,7 @@ [(V8SF "TARGET_AVX") V4SF]) (define_mode_iterator VF1_128_256VL - [V8SF (V4SF "TARGET_AVX512VL")]) + [(V8SF "TARGET_EVEX512") (V4SF "TARGET_AVX512VL")]) ;; All DFmode vector float modes (define_mode_iterator VF2 @@ -467,7 +468,7 @@ (V8DF "TARGET_EVEX512") (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) (define_mode_iterator VF2_AVX512VL - [V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) + [(V8DF "TARGET_EVEX512") (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) (define_mode_iterator VF1_AVX512VL [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")]) @@ -534,7 +535,7 @@ [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) (define_mode_iterator VI8_256_512 - [V8DI (V4DI "TARGET_AVX512VL")]) + [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL")]) (define_mode_iterator VI1_AVX2 [(V32QI "TARGET_AVX2") V16QI]) @@ -9075,7 +9076,7 @@ (define_insn "fixuns_trunc2" [(set (match_operand: 0 "register_operand" "=v") (unsigned_fix: - (match_operand:VF1_128_256VL 1 "nonimmediate_operand" "vm")))] + (match_operand:VF1_128_256 1 "nonimmediate_operand" "vm")))] "TARGET_AVX512VL" "vcvttps2udq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -11466,7 +11467,8 @@ 
(V8SF "32x4") (V8SI "32x4") (V4DF "64x2") (V4DI "64x2")]) (define_mode_iterator AVX512_VEC - [(V8DF "TARGET_AVX512DQ") (V8DI "TARGET_AVX512DQ") + [(V8DF "TARGET_AVX512DQ && TARGET_EVEX512") + (V8DI "TARGET_AVX512DQ &&
[PATCH 09/18] [PATCH 3/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-builtin.def (BDESC): Add OPTION_MASK_ISA2_EVEX512. --- gcc/config/i386/i386-builtin.def | 226 +++ 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 7a0dec9bc8b..167d530a537 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -293,10 +293,10 @@ BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_CMPCCXADD, CODE_FOR_cmpccxadd_si, BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_CMPCCXADD, CODE_FOR_cmpccxadd_di, "__builtin_ia32_cmpccxadd64", IX86_BUILTIN_CMPCCXADD64, UNKNOWN, (int) LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT) /* AVX512BW */ -BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_loadv32hi_mask, "__builtin_ia32_loaddquhi512_mask", IX86_BUILTIN_LOADDQUHI512_MASK, UNKNOWN, (int) V32HI_FTYPE_PCSHORT_V32HI_USI) -BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_loadv64qi_mask, "__builtin_ia32_loaddquqi512_mask", IX86_BUILTIN_LOADDQUQI512_MASK, UNKNOWN, (int) V64QI_FTYPE_PCCHAR_V64QI_UDI) -BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev32hi_mask, "__builtin_ia32_storedquhi512_mask", IX86_BUILTIN_STOREDQUHI512_MASK, UNKNOWN, (int) VOID_FTYPE_PSHORT_V32HI_USI) -BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI) +BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_loadv32hi_mask, "__builtin_ia32_loaddquhi512_mask", IX86_BUILTIN_LOADDQUHI512_MASK, UNKNOWN, (int) V32HI_FTYPE_PCSHORT_V32HI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_loadv64qi_mask, "__builtin_ia32_loaddquqi512_mask", IX86_BUILTIN_LOADDQUQI512_MASK, UNKNOWN, (int) V64QI_FTYPE_PCCHAR_V64QI_UDI) +BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_storev32hi_mask, 
"__builtin_ia32_storedquhi512_mask", IX86_BUILTIN_STOREDQUHI512_MASK, UNKNOWN, (int) VOID_FTYPE_PSHORT_V32HI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI) /* AVX512VP2INTERSECT */ BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, "__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) VOID_FTYPE_PUHI_PUHI_V16SI_V16SI) @@ -407,9 +407,9 @@ BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_ss_truncatev16hiv16qi2_mask_store, "__builtin_ia32_pmovswb256mem_mask", IX86_BUILTIN_PMOVSWB256_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16HI_UHI) BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_us_truncatev8hiv8qi2_mask_store_2, "__builtin_ia32_pmovuswb128mem_mask", IX86_BUILTIN_PMOVUSWB128_MEM, UNKNOWN, (int) VOID_FTYPE_PUDI_V8HI_UQI) BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_us_truncatev16hiv16qi2_mask_store, "__builtin_ia32_pmovuswb256mem_mask", IX86_BUILTIN_PMOVUSWB256_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16HI_UHI) -BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovuswb512mem_mask", IX86_BUILTIN_PMOVUSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) -BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) -BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovuswb512mem_mask", IX86_BUILTIN_PMOVUSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) +BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, "__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32HI_USI) /* AVX512FP16 */ BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, "__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) V8HF_FTYPE_PCFLOAT16_V8HF_UQI) @@ -1590,61 +1590,61 @@ BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_round BDESC (OPTION_MASK_ISA_AVX512DQ, 0,
[PATCH 16/18] Support -mevex512 for AVX512{IFMA, VBMI, VNNI, BF16, VPOPCNTDQ, VBMI2, BITALG, VP2INTERSECT}, VAES, GFNI, VPCLMULQDQ intrins
From: Haochen Jiang gcc/ChangeLog: * config/i386/sse.md (VI1_AVX512VL): Add TARGET_EVEX512. (VI8_FVL): Ditto. (VI1_AVX512F): Ditto. (VI1_AVX512VNNI): Ditto. (VI1_AVX512VL_F): Ditto. (VI12_VI48F_AVX512VL): Ditto. (*avx512f_permvar_truncv32hiv32qi_1): Ditto. (sdot_prod): Ditto. (VEC_PERM_AVX2): Ditto. (VPERMI2): Ditto. (VPERMI2I): Ditto. (vpmadd52v8di): Ditto. (usdot_prod): Ditto. (vpdpbusd_v16si): Ditto. (vpdpbusds_v16si): Ditto. (vpdpwssd_v16si): Ditto. (vpdpwssds_v16si): Ditto. (VI48_AVX512VP2VL): Ditto. (avx512vp2intersect_2intersectv16si): Ditto. (VF_AVX512BF16VL): Ditto. (VF1_AVX512_256): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr90096.c: Adjust error message. Co-authored-by: Hu, Lin1 --- gcc/config/i386/sse.md | 56 + gcc/testsuite/gcc.target/i386/pr90096.c | 2 +- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index e59f6bf4410..a5a95b9de66 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -298,7 +298,7 @@ (V32BF "TARGET_EVEX512") (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")]) (define_mode_iterator VI1_AVX512VL - [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")]) + [(V64QI "TARGET_EVEX512") (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")]) ;; All vector modes (define_mode_iterator V @@ -531,7 +531,7 @@ [(V8DI "TARGET_AVX512F && TARGET_EVEX512") (V4DI "TARGET_AVX") V2DI]) (define_mode_iterator VI8_FVL - [(V8DI "TARGET_AVX512F") V4DI (V2DI "TARGET_AVX512VL")]) + [(V8DI "TARGET_AVX512F && TARGET_EVEX512") V4DI (V2DI "TARGET_AVX512VL")]) (define_mode_iterator VI8_AVX512VL [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) @@ -546,10 +546,10 @@ [(V64QI "TARGET_AVX512BW && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI]) (define_mode_iterator VI1_AVX512F - [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI]) + [(V64QI "TARGET_AVX512F && TARGET_EVEX512") (V32QI "TARGET_AVX") V16QI]) (define_mode_iterator VI1_AVX512VNNI - [(V64QI 
"TARGET_AVX512VNNI") (V32QI "TARGET_AVX2") V16QI]) + [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI]) (define_mode_iterator VI12_256_512_AVX512VL [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL") @@ -599,7 +599,7 @@ V8DI ]) (define_mode_iterator VI1_AVX512VL_F - [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F")]) + [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F && TARGET_EVEX512")]) (define_mode_iterator VI8_AVX2_AVX512BW [(V8DI "TARGET_AVX512BW && TARGET_EVEX512") (V4DI "TARGET_AVX2") V2DI]) @@ -923,8 +923,8 @@ (V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL") - V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") - V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")]) + (V64QI "TARGET_EVEX512") (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL") + (V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")]) (define_mode_iterator VI48F_256 [V8SI V8SF V4DI V4DF]) @@ -14217,7 +14217,7 @@ (const_int 26) (const_int 27) (const_int 28) (const_int 29) (const_int 30) (const_int 31)])))] - "TARGET_AVX512VBMI && ix86_pre_reload_split ()" + "TARGET_AVX512VBMI && TARGET_EVEX512 && ix86_pre_reload_split ()" "#" "&& 1" [(set (match_dup 0) @@ -16040,7 +16040,7 @@ "TARGET_SSE2" { /* Try with vnni instructions. */ - if (( == 64 && TARGET_AVX512VNNI) + if (( == 64 && TARGET_AVX512VNNI && TARGET_EVEX512) || ( < 64 && ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI))) { @@ -17320,7 +17320,8 @@ (V8DF "TARGET_AVX512F && TARGET_EVEX512") (V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8DI "TARGET_AVX512F && TARGET_EVEX512") - (V32HI "TARGET_AVX512BW && TARGET_EVEX512") (V64QI "TARGET_AVX512VBMI") +
[PATCH 08/18] [PATCH 2/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
From: Haochen Jiang gcc/ChangeLog: * config/i386/i386-builtin.def (BDESC): Add OPTION_MASK_ISA2_EVEX512. --- gcc/config/i386/i386-builtin.def | 94 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 0cc526383db..7a0dec9bc8b 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2408,37 +2408,37 @@ BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_cmpv2df3_mask, "__builtin_ BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_cmpv4sf3_mask, "__builtin_ia32_cmpps128_mask", IX86_BUILTIN_CMPPS128_MASK, UNKNOWN, (int) UQI_FTYPE_V4SF_V4SF_INT_UQI) /* AVX512DQ. */ -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16sf_mask, "__builtin_ia32_broadcastf32x2_512_mask", IX86_BUILTIN_BROADCASTF32x2_512, UNKNOWN, (int) V16SF_FTYPE_V4SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16si_mask, "__builtin_ia32_broadcasti32x2_512_mask", IX86_BUILTIN_BROADCASTI32x2_512, UNKNOWN, (int) V16SI_FTYPE_V4SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv8df_mask_1, "__builtin_ia32_broadcastf64x2_512_mask", IX86_BUILTIN_BROADCASTF64X2_512, UNKNOWN, (int) V8DF_FTYPE_V2DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv8di_mask_1, "__builtin_ia32_broadcasti64x2_512_mask", IX86_BUILTIN_BROADCASTI64X2_512, UNKNOWN, (int) V8DI_FTYPE_V2DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16sf_mask_1, "__builtin_ia32_broadcastf32x8_512_mask", IX86_BUILTIN_BROADCASTF32X8_512, UNKNOWN, (int) V16SF_FTYPE_V8SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16si_mask_1, "__builtin_ia32_broadcasti32x8_512_mask", IX86_BUILTIN_BROADCASTI32X8_512, UNKNOWN, (int) V16SI_FTYPE_V8SI_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextractf64x2_mask, "__builtin_ia32_extractf64x2_512_mask", IX86_BUILTIN_EXTRACTF64X2_512, 
UNKNOWN, (int) V2DF_FTYPE_V8DF_INT_V2DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextractf32x8_mask, "__builtin_ia32_extractf32x8_mask", IX86_BUILTIN_EXTRACTF32X8, UNKNOWN, (int) V8SF_FTYPE_V16SF_INT_V8SF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextracti64x2_mask, "__builtin_ia32_extracti64x2_512_mask", IX86_BUILTIN_EXTRACTI64X2_512, UNKNOWN, (int) V2DI_FTYPE_V8DI_INT_V2DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextracti32x8_mask, "__builtin_ia32_extracti32x8_mask", IX86_BUILTIN_EXTRACTI32X8, UNKNOWN, (int) V8SI_FTYPE_V16SI_INT_V8SI_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv8df_mask, "__builtin_ia32_reducepd512_mask", IX86_BUILTIN_REDUCEPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv16sf_mask, "__builtin_ia32_reduceps512_mask", IX86_BUILTIN_REDUCEPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_mulv8di3_mask, "__builtin_ia32_pmullq512_mask", IX86_BUILTIN_PMULLQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_xorv8df3_mask, "__builtin_ia32_xorpd512_mask", IX86_BUILTIN_XORPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_xorv16sf3_mask, "__builtin_ia32_xorps512_mask", IX86_BUILTIN_XORPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_iorv8df3_mask, "__builtin_ia32_orpd512_mask", IX86_BUILTIN_ORPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_iorv16sf3_mask, "__builtin_ia32_orps512_mask", IX86_BUILTIN_ORPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_andv8df3_mask, "__builtin_ia32_andpd512_mask", IX86_BUILTIN_ANDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, 
CODE_FOR_andv16sf3_mask, "__builtin_ia32_andps512_mask", IX86_BUILTIN_ANDPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512f_andnotv8df3_mask, "__builtin_ia32_andnpd512_mask", IX86_BUILTIN_ANDNPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_UQI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512f_andnotv16sf3_mask, "__builtin_ia32_andnps512_mask", IX86_BUILTIN_ANDNPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vinsertf32x8_mask, "__builtin_ia32_insertf32x8_mask", IX86_BUILTIN_INSERTF32X8, UNKNOWN, (int) V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vinserti32x8_mask, "__builtin_ia32_inserti32x8_mask", IX86_BUILTIN_INSERTI32X8, UNKNOWN, (int) V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI) -BDESC (OPTION_MASK_ISA_AVX512DQ, 0,
[PATCH 00/18] Support -mevex512 for AVX512
Hi all, After the previous discussion, instead of supporting option -mavx10.1, we will first introduce option -m[no-]evex512, which will enable/disable 512 bit registers and 64 bit mask registers. It will not change the current option behavior, since if AVX512F is enabled with no evex512 option specified, it will automatically enable 512 bit registers and 64 bit mask registers. The patches are organized as follows: Patch 1 added initial support for option -mevex512. Patches 2-6 refined the current intrin files to push the evex512 target for all 512 bit intrins. The scalar intrins remained untouched. Patches 7-11 added OPTION_MASK_ISA2_EVEX512 for all related builtins. Patch 12 disabled zmm registers and 512 bit libmvec calls for no-evex512, and also required evex512 for vectorization when using 512 bit registers. Patches 13-17 supported evex512 in related patterns. Patch 18 added testcases for -mno-evex512 and allowed its usage. The patches currently cause scan-asm failures for pr89229-{5,6,7}b.c since we will emit scalar vmovss here. When trying to use x/ymm 16+ w/o avx512vl but with avx512f+evex512, I suppose we could emit either scalar or zmm instructions. It is quite a rare case on HW since there is no HW w/o avx512vl but with avx512f, so I prefer not to add maintenance effort here to get a slight perf improvement. But it could be changed back to the former behavior. Discussion is welcome on all the patches.
Thx, Haochen Haochen Jiang (18): Initial support for -mevex512 Push evex512 target for 512 bit intrins Push evex512 target for 512 bit intrins Push evex512 target for 512 bit intrins Push evex512 target for 512 bit intrins Push evex512 target for 512 bit intrins Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins Disable zmm register and 512 bit libmvec call when !TARGET_EVEX512 Support -mevex512 for AVX512F intrins Support -mevex512 for AVX512DQ intrins Support -mevex512 for AVX512BW intrins Support -mevex512 for AVX512{IFMA,VBMI,VNNI,BF16,VPOPCNTDQ,VBMI2,BITALG,VP2INTERSECT},VAES,GFNI,VPCLMULQDQ intrins Support -mevex512 for AVX512FP16 intrins Allow -mno-evex512 usage gcc/common/config/i386/i386-common.cc |15 + gcc/config.gcc |19 +- gcc/config/i386/avx5124fmapsintrin.h| 2 +- gcc/config/i386/avx5124vnniwintrin.h| 2 +- gcc/config/i386/avx512bf16intrin.h |31 +- gcc/config/i386/avx512bitalgintrin.h| 155 +- gcc/config/i386/avx512bitalgvlintrin.h | 180 + gcc/config/i386/avx512bwintrin.h| 291 +- gcc/config/i386/avx512dqintrin.h| 1840 +- gcc/config/i386/avx512erintrin.h| 2 +- gcc/config/i386/avx512fintrin.h | 19663 +- gcc/config/i386/avx512fp16intrin.h | 8925 gcc/config/i386/avx512ifmaintrin.h | 4 +- gcc/config/i386/avx512pfintrin.h| 2 +- gcc/config/i386/avx512vbmi2intrin.h | 4 +- gcc/config/i386/avx512vbmiintrin.h | 4 +- gcc/config/i386/avx512vnniintrin.h | 4 +- gcc/config/i386/avx512vp2intersectintrin.h | 4 +- gcc/config/i386/avx512vpopcntdqintrin.h | 4 +- gcc/config/i386/gfniintrin.h|76 +- gcc/config/i386/i386-builtin.def| 1312 +- gcc/config/i386/i386-builtins.cc|96 +- gcc/config/i386/i386-c.cc | 2 + gcc/config/i386/i386-expand.cc |18 +- gcc/config/i386/i386-options.cc |33 +- gcc/config/i386/i386.cc | 168 +- gcc/config/i386/i386.h | 7 +- gcc/config/i386/i386.md | 127 +- 
gcc/config/i386/i386.opt| 4 + gcc/config/i386/immintrin.h | 2 + gcc/config/i386/predicates.md | 3 +- gcc/config/i386/sse.md | 854 +- gcc/config/i386/vaesintrin.h| 4 +- gcc/config/i386/vpclmulqdqintrin.h | 4 +- gcc/testsuite/gcc.target/i386/noevex512-1.c |13 + gcc/testsuite/gcc.target/i386/noevex512-2.c |13 + gcc/testsuite/gcc.target/i386/noevex512-3.c |13 + gcc/testsuite/gcc.target/i386/pr89229-5b.c | 2 +- gcc/testsuite/gcc.target/i386/pr89229-6b.c | 2 +- gcc/testsuite/gcc.target/i386/pr89229-7b.c | 2 +- gcc/testsuite/gcc.target/i386/pr90096.c | 2 +- 41 files changed, 17170 insertions(+), 16738 deletions(-) create mode 100644 gcc/config/i386/avx512bitalgvlintrin.h create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-1.c create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-2.c create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-3.c -- 2.31.1
[PATCH 01/18] Initial support for -mevex512
From: Haochen Jiang gcc/ChangeLog: * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_EVEX512_SET): New. (OPTION_MASK_ISA2_EVEX512_UNSET): Ditto. (ix86_handle_option): Handle EVEX512. * config/i386/i386-c.cc (ix86_target_macros_internal): Ditto. * config/i386/i386-options.cc (isa2_opts): Ditto. (ix86_valid_target_attribute_inner_p): Ditto. (ix86_option_override_internal): Set EVEX512 target if it is not explicitly set when AVX512 is enabled. Disable AVX512{PF,ER,4VNNIW,4FMAPS} for -mno-evex512. * config/i386/i386.opt: Add mevex512. Temporarily RejectNegative. --- gcc/common/config/i386/i386-common.cc | 15 +++ gcc/config/i386/i386-c.cc | 2 ++ gcc/config/i386/i386-options.cc | 19 ++- gcc/config/i386/i386.opt | 4 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 95468b7c405..8cc59e08d06 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -123,6 +123,7 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA2_SM3_SET OPTION_MASK_ISA2_SM3 #define OPTION_MASK_ISA2_SHA512_SET OPTION_MASK_ISA2_SHA512 #define OPTION_MASK_ISA2_SM4_SET OPTION_MASK_ISA2_SM4 +#define OPTION_MASK_ISA2_EVEX512_SET OPTION_MASK_ISA2_EVEX512 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same as -msse4.2. */ @@ -309,6 +310,7 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA2_SM3_UNSET OPTION_MASK_ISA2_SM3 #define OPTION_MASK_ISA2_SHA512_UNSET OPTION_MASK_ISA2_SHA512 #define OPTION_MASK_ISA2_SM4_UNSET OPTION_MASK_ISA2_SM4 +#define OPTION_MASK_ISA2_EVEX512_UNSET OPTION_MASK_ISA2_EVEX512 /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same as -mno-sse4.1. 
*/ @@ -1341,6 +1343,19 @@ ix86_handle_option (struct gcc_options *opts, } return true; +case OPT_mevex512: + if (value) + { + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512_SET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_EVEX512_SET; + } + else + { + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_EVEX512_UNSET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_EVEX512_UNSET; + } + return true; + case OPT_mfma: if (value) { diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 47768fa0940..93154efa7ff 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -707,6 +707,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__SHA512__"); if (isa_flag2 & OPTION_MASK_ISA2_SM4) def_or_undef (parse_in, "__SM4__"); + if (isa_flag2 & OPTION_MASK_ISA2_EVEX512) +def_or_undef (parse_in, "__EVEX512__"); if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index e47f9ed5d5f..a1a7a92da9f 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -250,7 +250,8 @@ static struct ix86_target_opts isa2_opts[] = { "-mavxvnniint16", OPTION_MASK_ISA2_AVXVNNIINT16 }, { "-msm3", OPTION_MASK_ISA2_SM3 }, { "-msha512",OPTION_MASK_ISA2_SHA512 }, - { "-msm4",OPTION_MASK_ISA2_SM4 } + { "-msm4",OPTION_MASK_ISA2_SM4 }, + { "-mevex512",OPTION_MASK_ISA2_EVEX512 } }; static struct ix86_target_opts isa_opts[] = { @@ -1109,6 +1110,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("sm3", OPT_msm3), IX86_ATTR_ISA ("sha512", OPT_msha512), IX86_ATTR_ISA ("sm4", OPT_msm4), +IX86_ATTR_ISA ("evex512", OPT_mevex512), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), @@ -2559,6 +2561,21 @@ ix86_option_override_internal (bool main_args_p, &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) & ~opts->x_ix86_isa_flags_explicit); + /* Set EVEX512 
target if it is not explicitly set + when AVX512 is enabled. */ + if (TARGET_AVX512F_P(opts->x_ix86_isa_flags) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_EVEX512)) +opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512; + + /* Disable AVX512{PF,ER,4VNNIW,4FAMPS} for -mno-evex512. */ + if (!TARGET_EVEX512_P(opts->x_ix86_isa_flags2)) +{ + opts->x_ix86_isa_flags + &= ~(OPTION_MASK_ISA_AVX512PF | OPTION_MASK_ISA_AVX512ER); + opts->x_ix86_isa_flags2 + &= ~(OPTION_MASK_ISA2_AVX5124FMAPS | OPTION_MASK_ISA2_AVX5124VNNIW); +} + /* Validate -mpreferred-stack-boundary= value or default it to PREFERRED_STACK_BOUNDARY_DEFAULT. */ ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; diff
[r14-4046 Regression] FAIL: 23_containers/vector/bool/110807.cc -std=gnu++17 (test for excess errors) on Linux/x86_64
On Linux/x86_64, 3a0e01f6bb1d6ec444001f2caea6ef43a4a83e3a is the first bad commit commit 3a0e01f6bb1d6ec444001f2caea6ef43a4a83e3a Author: Jonathan Wakely Date: Fri Sep 1 21:27:57 2023 +0100 libstdc++: Add support for running tests with multiple -std options caused FAIL: 23_containers/vector/bool/110807.cc -std=gnu++17 (test for excess errors) with GCC configured with ../../gcc/configure --prefix=/export/users/haochenj/src/gcc-bisect/master/master/r14-4046/usr --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld --with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl --enable-libmpx x86_64-linux --disable-bootstrap To reproduce: $ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check RUNTESTFLAGS="conformance.exp=23_containers/vector/bool/110807.cc --target_board='unix{-m32}'" $ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check RUNTESTFLAGS="conformance.exp=23_containers/vector/bool/110807.cc --target_board='unix{-m32\ -march=cascadelake}'" (Please do not reply to this email; for questions about this report, contact me at lin1 dot hu at intel.com.) (If you meet cascadelake-related problems, disabling AVX512F on the command line might help.) (However, please make sure that there are no potential problems with AVX512.)
[PATCH] Add myself for write after approval
ChangeLog: * MAINTAINERS (Write After Approval): Add myself. --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 49aa6bae73b..90e2c81f0c2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -460,6 +460,7 @@ Matthew Hiller Kazu Hirata Manfred Hollstein Cong Hou +Lin Hu Falk Hueffner Andrew John Hughes Dominique d'Humieres -- 2.31.1
[PATCH] i386: refactor macros.
Hi, all This patch aims to refactor macros in case some other thing is added to AMX_TILE_SET in future. OK for trunk? BRs, Lin gcc/ChangeLog: * common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AMX_INT8_SET): Change OPTION_MASK_ISA2_AMX_TILE to OPTION_MASK_ISA2_AMX_TILE_SET. (OPTION_MASK_ISA2_AMX_FP16_SET): Ditto (OPTION_MASK_ISA2_AMX_COMPLEX_SET): Ditto (OPTION_MASK_ISA_ABM_SET): Change OPTION_MASK_ISA_POPCNT to OPTION_MASK_ISA_POPCNT_SET. --- gcc/common/config/i386/i386-common.cc | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index bf126f14073..4f79afba917 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -107,18 +107,18 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA2_AVX512VP2INTERSECT_SET OPTION_MASK_ISA2_AVX512VP2INTERSECT #define OPTION_MASK_ISA2_AMX_TILE_SET OPTION_MASK_ISA2_AMX_TILE #define OPTION_MASK_ISA2_AMX_INT8_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_INT8) #define OPTION_MASK_ISA2_AMX_BF16_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_BF16) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_BF16) #define OPTION_MASK_ISA2_AVXVNNIINT8_SET OPTION_MASK_ISA2_AVXVNNIINT8 #define OPTION_MASK_ISA2_AVXNECONVERT_SET OPTION_MASK_ISA2_AVXNECONVERT #define OPTION_MASK_ISA2_CMPCCXADD_SET OPTION_MASK_ISA2_CMPCCXADD #define OPTION_MASK_ISA2_AMX_FP16_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_FP16) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_FP16) #define OPTION_MASK_ISA2_PREFETCHI_SET OPTION_MASK_ISA2_PREFETCHI #define OPTION_MASK_ISA2_RAOINT_SET OPTION_MASK_ISA2_RAOINT #define OPTION_MASK_ISA2_AMX_COMPLEX_SET \ - (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_COMPLEX) + (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_COMPLEX) /* SSE4 includes both SSE4.1 and 
SSE4.2. -msse4 should be the same as -msse4.2. */ @@ -143,7 +143,7 @@ along with GCC; see the file COPYING3. If not see (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2_SET) #define OPTION_MASK_ISA_ABM_SET \ - (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT) + (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT_SET) #define OPTION_MASK_ISA2_PCONFIG_SET OPTION_MASK_ISA2_PCONFIG #define OPTION_MASK_ISA2_WBNOINVD_SET OPTION_MASK_ISA2_WBNOINVD -- 2.31.1
RE: [PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli}
OK, I update the change log and modify a part of format. The attached file is the new version. -Original Message- From: Hongtao Liu Sent: Thursday, May 25, 2023 11:40 AM To: Hu, Lin1 Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; ubiz...@gmail.com Subject: Re: [PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli} On Thu, May 25, 2023 at 10:55 AM Hu, Lin1 via Gcc-patches wrote: > > Hi all, > > This patch aims to fix incorrect intrinsic signature for > _mm{512|256|}_s{lli|rai|rli}_epi*. And it has been tested on > x86_64-pc-linux-gnu. OK for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/109173 > PR target/109174 > * config/i386/avx512bwintrin.h (_mm512_srli_epi16): Change type from > int to const int. int to unsigned int or const int to const unsigned int. Others LGTM. > (_mm512_mask_srli_epi16): Ditto. > (_mm512_slli_epi16): Ditto. > (_mm512_mask_slli_epi16): Ditto. > (_mm512_maskz_slli_epi16): Ditto. > (_mm512_srai_epi16): Ditto. > (_mm512_mask_srai_epi16): Ditto. > (_mm512_maskz_srai_epi16): Ditto. > * config/i386/avx512vlintrin.h (_mm256_mask_srli_epi32): Ditto. > (_mm256_maskz_srli_epi32): Ditto. > (_mm_mask_srli_epi32): Ditto. > (_mm_maskz_srli_epi32): Ditto. > (_mm256_mask_srli_epi64): Ditto. > (_mm256_maskz_srli_epi64): Ditto. > (_mm_mask_srli_epi64): Ditto. > (_mm_maskz_srli_epi64): Ditto. > (_mm256_mask_srai_epi32): Ditto. > (_mm256_maskz_srai_epi32): Ditto. > (_mm_mask_srai_epi32): Ditto. > (_mm_maskz_srai_epi32): Ditto. > (_mm256_srai_epi64): Ditto. > (_mm256_mask_srai_epi64): Ditto. > (_mm256_maskz_srai_epi64): Ditto. > (_mm_srai_epi64): Ditto. > (_mm_mask_srai_epi64): Ditto. > (_mm_maskz_srai_epi64): Ditto. > (_mm_mask_slli_epi32): Ditto. > (_mm_maskz_slli_epi32): Ditto. > (_mm_mask_slli_epi64): Ditto. > (_mm_maskz_slli_epi64): Ditto. > (_mm256_mask_slli_epi32): Ditto. > (_mm256_maskz_slli_epi32): Ditto. > (_mm256_mask_slli_epi64): Ditto. > (_mm256_maskz_slli_epi64): Ditto. > (_mm_mask_srai_epi16): Ditto. 
> (_mm_maskz_srai_epi16): Ditto. > (_mm256_srai_epi16): Ditto. > (_mm256_mask_srai_epi16): Ditto. > (_mm_mask_slli_epi16): Ditto. > (_mm_maskz_slli_epi16): Ditto. > (_mm256_mask_slli_epi16): Ditto. > (_mm256_maskz_slli_epi16): Ditto. > > gcc/testsuite/ChangeLog: > > PR target/109173 > PR target/109174 > * gcc.target/i386/pr109173-1.c: New test. > * gcc.target/i386/pr109174-1.c: Ditto. > --- > gcc/config/i386/avx512bwintrin.h | 32 +++--- > gcc/config/i386/avx512fintrin.h| 58 +++ > gcc/config/i386/avx512vlbwintrin.h | 36 --- > gcc/config/i386/avx512vlintrin.h | 112 +++-- > gcc/testsuite/gcc.target/i386/pr109173-1.c | 57 +++ > gcc/testsuite/gcc.target/i386/pr109174-1.c | 45 + > 6 files changed, 236 insertions(+), 104 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr109173-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr109174-1.c > > diff --git a/gcc/config/i386/avx512bwintrin.h > b/gcc/config/i386/avx512bwintrin.h > index 89790f7917b..791d4e35f32 100644 > --- a/gcc/config/i386/avx512bwintrin.h > +++ b/gcc/config/i386/avx512bwintrin.h > @@ -2880,7 +2880,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, > __m512i __B, > > extern __inline __m512i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > -_mm512_srli_epi16 (__m512i __A, const int __imm) > +_mm512_srli_epi16 (__m512i __A, const unsigned int __imm) > { >return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, > (__v32hi) > @@ -2891,7 +2891,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm) > extern __inline __m512i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > _mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, > - const int __imm) > + const unsigned int __imm) > { >return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, > (__v32hi) __W, > @@ -2910,7 +2910,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, > const int __imm) > > extern __inline __m512i > __attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)
[PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli}
Hi all, This patch aims to fix incorrect intrinsic signature for _mm{512|256|}_s{lli|rai|rli}_epi*. And it has been tested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: PR target/109173 PR target/109174 * config/i386/avx512bwintrin.h (_mm512_srli_epi16): Change type from int to const int. (_mm512_mask_srli_epi16): Ditto. (_mm512_slli_epi16): Ditto. (_mm512_mask_slli_epi16): Ditto. (_mm512_maskz_slli_epi16): Ditto. (_mm512_srai_epi16): Ditto. (_mm512_mask_srai_epi16): Ditto. (_mm512_maskz_srai_epi16): Ditto. * config/i386/avx512vlintrin.h (_mm256_mask_srli_epi32): Ditto. (_mm256_maskz_srli_epi32): Ditto. (_mm_mask_srli_epi32): Ditto. (_mm_maskz_srli_epi32): Ditto. (_mm256_mask_srli_epi64): Ditto. (_mm256_maskz_srli_epi64): Ditto. (_mm_mask_srli_epi64): Ditto. (_mm_maskz_srli_epi64): Ditto. (_mm256_mask_srai_epi32): Ditto. (_mm256_maskz_srai_epi32): Ditto. (_mm_mask_srai_epi32): Ditto. (_mm_maskz_srai_epi32): Ditto. (_mm256_srai_epi64): Ditto. (_mm256_mask_srai_epi64): Ditto. (_mm256_maskz_srai_epi64): Ditto. (_mm_srai_epi64): Ditto. (_mm_mask_srai_epi64): Ditto. (_mm_maskz_srai_epi64): Ditto. (_mm_mask_slli_epi32): Ditto. (_mm_maskz_slli_epi32): Ditto. (_mm_mask_slli_epi64): Ditto. (_mm_maskz_slli_epi64): Ditto. (_mm256_mask_slli_epi32): Ditto. (_mm256_maskz_slli_epi32): Ditto. (_mm256_mask_slli_epi64): Ditto. (_mm256_maskz_slli_epi64): Ditto. (_mm_mask_srai_epi16): Ditto. (_mm_maskz_srai_epi16): Ditto. (_mm256_srai_epi16): Ditto. (_mm256_mask_srai_epi16): Ditto. (_mm_mask_slli_epi16): Ditto. (_mm_maskz_slli_epi16): Ditto. (_mm256_mask_slli_epi16): Ditto. (_mm256_maskz_slli_epi16): Ditto. gcc/testsuite/ChangeLog: PR target/109173 PR target/109174 * gcc.target/i386/pr109173-1.c: New test. * gcc.target/i386/pr109174-1.c: Ditto. 
--- gcc/config/i386/avx512bwintrin.h | 32 +++--- gcc/config/i386/avx512fintrin.h| 58 +++ gcc/config/i386/avx512vlbwintrin.h | 36 --- gcc/config/i386/avx512vlintrin.h | 112 +++-- gcc/testsuite/gcc.target/i386/pr109173-1.c | 57 +++ gcc/testsuite/gcc.target/i386/pr109174-1.c | 45 + 6 files changed, 236 insertions(+), 104 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr109173-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr109174-1.c diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h index 89790f7917b..791d4e35f32 100644 --- a/gcc/config/i386/avx512bwintrin.h +++ b/gcc/config/i386/avx512bwintrin.h @@ -2880,7 +2880,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B, extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_srli_epi16 (__m512i __A, const int __imm) +_mm512_srli_epi16 (__m512i __A, const unsigned int __imm) { return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, (__v32hi) @@ -2891,7 +2891,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - const int __imm) + const unsigned int __imm) { return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, (__v32hi) __W, @@ -2910,7 +2910,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_slli_epi16 (__m512i __A, const int __B) +_mm512_slli_epi16 (__m512i __A, const unsigned int __B) { return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, (__v32hi) @@ -2921,7 +2921,7 @@ _mm512_slli_epi16 (__m512i __A, const int __B) extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - const int __B) 
+ const unsigned int __B) { return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, (__v32hi) __W, @@ -2930,7 +2930,7 @@ _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B) +_mm512_maskz_slli_epi16 (__mmask32 __U,
RE: [PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics
More details: The Intrinsics Guide adds these 128/256-bit intrinsics as follows: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=reduce__expand=5814. So we intend to enable these intrinsics for GCC-14. -Original Message- From: Gcc-patches On Behalf Of Hu, Lin1 via Gcc-patches Sent: Tuesday, April 18, 2023 3:03 PM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao ; ubiz...@gmail.com Subject: [PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics Hi all, The patch aims to support reduce_*_ep[i|u][8|16] series intrinsics, and has been tested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: * config/i386/avx2intrin.h (_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro. (_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_mm_reduce_add_epi16): New intrinsics. (_mm_reduce_mul_epi16): Ditto. (_mm_reduce_and_epi16): Ditto. (_mm_reduce_or_epi16): Ditto. (_mm_reduce_max_epi16): Ditto. (_mm_reduce_max_epu16): Ditto. (_mm_reduce_min_epi16): Ditto. (_mm_reduce_min_epu16): Ditto. (_mm256_reduce_add_epi16): Ditto. (_mm256_reduce_mul_epi16): Ditto. (_mm256_reduce_and_epi16): Ditto. (_mm256_reduce_or_epi16): Ditto. (_mm256_reduce_max_epi16): Ditto. (_mm256_reduce_max_epu16): Ditto. (_mm256_reduce_min_epi16): Ditto. (_mm256_reduce_min_epu16): Ditto. (_mm_reduce_add_epi8): Ditto. (_mm_reduce_mul_epi8): Ditto. (_mm_reduce_and_epi8): Ditto. (_mm_reduce_or_epi8): Ditto. (_mm_reduce_max_epi8): Ditto. (_mm_reduce_max_epu8): Ditto. (_mm_reduce_min_epi8): Ditto. (_mm_reduce_min_epu8): Ditto. (_mm256_reduce_add_epi8): Ditto. (_mm256_reduce_mul_epi8): Ditto. (_mm256_reduce_and_epi8): Ditto. (_mm256_reduce_or_epi8): Ditto. (_mm256_reduce_max_epi8): Ditto. (_mm256_reduce_max_epu8): Ditto.
(_mm256_reduce_min_epi8): Ditto. (_mm256_reduce_min_epu8): Ditto. * config/i386/avx512vlbwintrin.h: (_mm_mask_reduce_add_epi16): Ditto. (_mm_mask_reduce_mul_epi16): Ditto. (_mm_mask_reduce_and_epi16): Ditto. (_mm_mask_reduce_or_epi16): Ditto. (_mm_mask_reduce_max_epi16): Ditto. (_mm_mask_reduce_max_epu16): Ditto. (_mm_mask_reduce_min_epi16): Ditto. (_mm_mask_reduce_min_epu16): Ditto. (_mm256_mask_reduce_add_epi16): Ditto. (_mm256_mask_reduce_mul_epi16): Ditto. (_mm256_mask_reduce_and_epi16): Ditto. (_mm256_mask_reduce_or_epi16): Ditto. (_mm256_mask_reduce_max_epi16): Ditto. (_mm256_mask_reduce_max_epu16): Ditto. (_mm256_mask_reduce_min_epi16): Ditto. (_mm256_mask_reduce_min_epu16): Ditto. (_mm_mask_reduce_add_epi8): Ditto. (_mm_mask_reduce_mul_epi8): Ditto. (_mm_mask_reduce_and_epi8): Ditto. (_mm_mask_reduce_or_epi8): Ditto. (_mm_mask_reduce_max_epi8): Ditto. (_mm_mask_reduce_max_epu8): Ditto. (_mm_mask_reduce_min_epi8): Ditto. (_mm_mask_reduce_min_epu8): Ditto. (_mm256_mask_reduce_add_epi8): Ditto. (_mm256_mask_reduce_mul_epi8): Ditto. (_mm256_mask_reduce_and_epi8): Ditto. (_mm256_mask_reduce_or_epi8): Ditto. (_mm256_mask_reduce_max_epi8): Ditto. (_mm256_mask_reduce_max_epu8): Ditto. (_mm256_mask_reduce_min_epi8): Ditto. (_mm256_mask_reduce_min_epu8): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vlbw-reduce-op-1.c: New test. 
--- gcc/config/i386/avx2intrin.h | 347 ++ gcc/config/i386/avx512vlbwintrin.h| 256 + .../gcc.target/i386/avx512vlbw-reduce-op-1.c | 206 +++ 3 files changed, 809 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 1b9c8169a96..9b8c13b7233 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, (int) (SCALE)) #endif /* __OPTIMIZE__ */ +#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \ + __v8hi __T1 = (__v8hi)__W; \ + __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, +6, 7); \ + __v8hi __T3 = __T1 op __T2; \ + __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, +6, 7); \ + __v8hi __T5 = __T3 op __T4; \ + __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1
[PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics
Hi all, The patch aims to support reduce_*_ep[i|u][8|16] series intrinsics, and has been tested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: * config/i386/avx2intrin.h (_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro. (_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto. (_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto. (_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto. (_mm_reduce_add_epi16): New instrinsics. (_mm_reduce_mul_epi16): Ditto. (_mm_reduce_and_epi16): Ditto. (_mm_reduce_or_epi16): Ditto. (_mm_reduce_max_epi16): Ditto. (_mm_reduce_max_epu16): Ditto. (_mm_reduce_min_epi16): Ditto. (_mm_reduce_min_epu16): Ditto. (_mm256_reduce_add_epi16): Ditto. (_mm256_reduce_mul_epi16): Ditto. (_mm256_reduce_and_epi16): Ditto. (_mm256_reduce_or_epi16): Ditto. (_mm256_reduce_max_epi16): Ditto. (_mm256_reduce_max_epu16): Ditto. (_mm256_reduce_min_epi16): Ditto. (_mm256_reduce_min_epu16): Ditto. (_mm_reduce_add_epi8): Ditto. (_mm_reduce_mul_epi8): Ditto. (_mm_reduce_and_epi8): Ditto. (_mm_reduce_or_epi8): Ditto. (_mm_reduce_max_epi8): Ditto. (_mm_reduce_max_epu8): Ditto. (_mm_reduce_min_epi8): Ditto. (_mm_reduce_min_epu8): Ditto. (_mm256_reduce_add_epi8): Ditto. (_mm256_reduce_mul_epi8): Ditto. (_mm256_reduce_and_epi8): Ditto. (_mm256_reduce_or_epi8): Ditto. (_mm256_reduce_max_epi8): Ditto. (_mm256_reduce_max_epu8): Ditto. (_mm256_reduce_min_epi8): Ditto. (_mm256_reduce_min_epu8): Ditto. * config/i386/avx512vlbwintrin.h: (_mm_mask_reduce_add_epi16): Ditto. (_mm_mask_reduce_mul_epi16): Ditto. (_mm_mask_reduce_and_epi16): Ditto. (_mm_mask_reduce_or_epi16): Ditto. (_mm_mask_reduce_max_epi16): Ditto. (_mm_mask_reduce_max_epu16): Ditto. (_mm_mask_reduce_min_epi16): Ditto. (_mm_mask_reduce_min_epu16): Ditto. (_mm256_mask_reduce_add_epi16): Ditto. (_mm256_mask_reduce_mul_epi16): Ditto. 
(_mm256_mask_reduce_and_epi16): Ditto. (_mm256_mask_reduce_or_epi16): Ditto. (_mm256_mask_reduce_max_epi16): Ditto. (_mm256_mask_reduce_max_epu16): Ditto. (_mm256_mask_reduce_min_epi16): Ditto. (_mm256_mask_reduce_min_epu16): Ditto. (_mm_mask_reduce_add_epi8): Ditto. (_mm_mask_reduce_mul_epi8): Ditto. (_mm_mask_reduce_and_epi8): Ditto. (_mm_mask_reduce_or_epi8): Ditto. (_mm_mask_reduce_max_epi8): Ditto. (_mm_mask_reduce_max_epu8): Ditto. (_mm_mask_reduce_min_epi8): Ditto. (_mm_mask_reduce_min_epu8): Ditto. (_mm256_mask_reduce_add_epi8): Ditto. (_mm256_mask_reduce_mul_epi8): Ditto. (_mm256_mask_reduce_and_epi8): Ditto. (_mm256_mask_reduce_or_epi8): Ditto. (_mm256_mask_reduce_max_epi8): Ditto. (_mm256_mask_reduce_max_epu8): Ditto. (_mm256_mask_reduce_min_epi8): Ditto. (_mm256_mask_reduce_min_epu8): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vlbw-reduce-op-1.c: New test. --- gcc/config/i386/avx2intrin.h | 347 ++ gcc/config/i386/avx512vlbwintrin.h| 256 + .../gcc.target/i386/avx512vlbw-reduce-op-1.c | 206 +++ 3 files changed, 809 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 1b9c8169a96..9b8c13b7233 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, (int) (SCALE)) #endif /* __OPTIMIZE__ */ +#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \ + __v8hi __T1 = (__v8hi)__W; \ + __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); \ + __v8hi __T3 = __T1 op __T2; \ + __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); \ + __v8hi __T5 = __T3 op __T4; \ + __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); \ + __v8hi __T7 = __T5 op __T6; \ + return __T7[0] + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_add_epi16 
(__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (+); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_mul_epi16 (__m128i __W) +{ + _MM_REDUCE_OPERATOR_BASIC_EPI16 (*); +} + +extern __inline short +__attribute__ ((__gnu_inline__,
[PATCH] i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm
Hi, all The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128. And it has regtested on x86_64-pc-linux-gnu. OK for trunk? Thanks. Lin vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk. We can optimze them to vblend, vmovaps when there's no cross-lane. gcc/ChangeLog: * config/i386/sse.md: Modify insn vperm{i,f} and vshuf{i,f}. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test. * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto. * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-1.c: New test. * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto. * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto. --- gcc/config/i386/sse.md| 36 -- .../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +- .../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +- .../gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++ .../gcc.target/i386/opt-vperm-vshuf-2.c | 68 +++ .../gcc.target/i386/opt-vperm-vshuf-3.c | 63 + 8 files changed, 218 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 513960e8f33..5b6b2427460 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -18437,6 +18437,8 @@ mask = INTVAL (operands[3]) / 2; mask |= (INTVAL (operands[5]) - 4) / 2 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; return "vshuf64x2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -18595,6 +18597,9 @@ mask |= (INTVAL (operands[7]) - 8) / 4 << 1; operands[3] = GEN_INT (mask); + if (INTVAL (operands[3]) == 2 && !) 
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + return "vshuf32x4\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } [(set_attr "type" "sselog") @@ -25663,7 +25668,28 @@ (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_VPERMTI))] "TARGET_AVX2" - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + { +int mask = INTVAL (operands[3]); +if ((mask & 0xbb) == 16) + { + if (rtx_equal_p (operands[0], operands[1])) + return ""; + else + return "vmovaps\t{%1, %0|%0, %1}"; + } +if ((mask & 0xbb) == 50) + { + if (rtx_equal_p (operands[0], operands[2])) + return ""; + else + return "vmovaps\t{%2, %0|%0, %2}"; + } +if ((mask & 0xbb) == 18) + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; +if ((mask & 0xbb) == 48) + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; +return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + } [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -26226,9 +26252,11 @@ && avx_vperm2f128_parallel (operands[3], mode)" { int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; - if (mask == 0x12) -return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; - if (mask == 0x20) + if ((mask & 0xbb) == 0x12) +return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}"; + if ((mask & 0xbb) == 0x30) +return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}"; + if ((mask & 0xbb) == 0x20) return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; operands[3] = GEN_INT (mask); return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c index 6c2fb2f184a..02aecf4edce 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f32x4 (x, x, 2); + x = _mm256_shuffle_f32x4 (x, x, 3); x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2); } diff --git 
a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c index 1191b400134..563ded5d9df 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c @@ -12,7 +12,7 @@ volatile __mmask8 m; void extern avx512vl_test (void) { - x = _mm256_shuffle_f64x2 (x, x, 2); + x = _mm256_shuffle_f64x2 (x, x, 3); x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2); x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2); } diff --git
RE: [PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins
It has regtested on x86_64-pc-linux-gnu. OK for trunk? Thanks. Lin -Original Message- From: Uros Bizjak Sent: Tuesday, March 14, 2023 3:05 PM To: Hu, Lin1 Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao Subject: Re: [PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins On Tue, Mar 14, 2023 at 7:2 AM Hu, Lin1 wrote: > > The implementation of these builtins requires support for both > AVX512VL and VAES. However, the builtins didn't request AVX512VL. As a > result, compiling pr109117-1.c with the options -mvaes -mno-avx512vl caused > an ICE. > > This patch aims to fix the bug. > > gcc/ChangeLog: > > PR target/109117 > * config/i386/i386-builtin.def (__builtin_ia32_vaesdec_v16qi, > __builtin_ia32_vaesdeclast_v16qi,__builtin_ia32_vaesenc_v16qi, > __builtin_ia32_vaesenclast_v16qi): Require OPTION_MASK_ISA_AVX512VL. > > gcc/testsuite/ChangeLog: > > PR target/109117 > * gcc.target/i386/pr109117-1.c: New test. OK. Thanks, Uros. > --- > gcc/config/i386/i386-builtin.def | 8 > gcc/testsuite/gcc.target/i386/pr109117-1.c | 14 ++ > 2 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 > gcc/testsuite/gcc.target/i386/pr109117-1.c > > diff --git a/gcc/config/i386/i386-builtin.def > b/gcc/config/i386/i386-builtin.def > index f1c295c34f6..17dfe40fac7 100644 > --- a/gcc/config/i386/i386-builtin.def > +++ b/gcc/config/i386/i386-builtin.def > @@ -2797,16 +2797,16 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, > CODE_FOR_avx5124vnniw_vp4dpwssds_mask, > BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, > "__builtin_ia32_rdpid", IX86_BUILTIN_RDPID, UNKNOWN, (int) > UNSIGNED_FTYPE_VOID) > > /* VAES. 
*/ > -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, > "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) > V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", > +IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, > "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) > V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, > CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", > IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC > (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, > "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, > UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", > +IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, > "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, > UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, > OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, > "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, > UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, > OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, > "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) > V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", > +IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, > "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) > V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, > CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", > IX86_BUILTIN_VAESENC64, 
UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC > (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, > "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, > UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, > +CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", > +IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) > BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, > "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, > UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, > OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, > "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, > UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) > > diff --git a/gcc/
[PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins
The implementation of these builtins requires support for both AVX512VL and VAES. However, the builtins didn't request AVX512VL. As a result, compiling pr109117-1.c with the options -mvaes -mno-avx512vl caused an ICE. This patch aims to fix the bug. gcc/ChangeLog: PR target/109117 * config/i386/i386-builtin.def (__builtin_ia32_vaesdec_v16qi, __builtin_ia32_vaesdeclast_v16qi,__builtin_ia32_vaesenc_v16qi, __builtin_ia32_vaesenclast_v16qi): Require OPTION_MASK_ISA_AVX512VL. gcc/testsuite/ChangeLog: PR target/109117 * gcc.target/i386/pr109117-1.c: New test. --- gcc/config/i386/i386-builtin.def | 8 gcc/testsuite/gcc.target/i386/pr109117-1.c | 14 ++ 2 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr109117-1.c diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index f1c295c34f6..17dfe40fac7 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2797,16 +2797,16 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssds_mask, BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, "__builtin_ia32_rdpid", IX86_BUILTIN_RDPID, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) /* VAES. 
*/ -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) diff --git a/gcc/testsuite/gcc.target/i386/pr109117-1.c b/gcc/testsuite/gcc.target/i386/pr109117-1.c new file mode 100644 index 000..87a5c0e7fc9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109117-1.c @@ -0,0 +1,14 @@ +/* PR target/109117 */ +/* { dg-do compile } */ +/* { dg-options "-mvaes -mno-avx512vl" } */ + +typedef char __v16qi __attribute__ ((__vector_size__(16))); +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); +volatile __v16qi x, y; +volatile __m128i res; + +void +foo (void) +{ + res = __builtin_ia32_vaesdec_v16qi (x, y); /* { dg-warning "implicit declaration of function" } */ +} /* { dg-error "incompatible types when assigning to type" "" { target *-*-* } .-1 } */ -- 2.31.1
RE: [PATCH] loading float member of parameter stored via int registers
Sorry for sending this mail; I entered the wrong command line. -Original Message- From: Gcc-patches On Behalf Of Segher Boessenkool Sent: Tuesday, January 3, 2023 5:00 PM To: Andrew Pinski Cc: Jiufu Guo ; Jiufu Guo via Gcc-patches ; Richard Biener ; Richard Biener ; dje@gmail.com; li...@gcc.gnu.org; jeffreya...@gmail.com Subject: Re: [PATCH] loading float member of parameter stored via int registers Hi! On Fri, Dec 30, 2022 at 12:30:04AM -0800, Andrew Pinski wrote: > On Thu, Dec 29, 2022 at 11:45 PM Segher Boessenkool > wrote: > > Ah! This simply shows rs6000_modes_tieable_p is decidedly non-optimal: > > it does not allow tying a scalar float to anything else. No such > > thing is required, or good apparently. I wonder why we have such > > restrictions at all in rs6000; is it just unfortunate history, was > > it good at one point in time? > > The documentation for TARGET_MODES_TIEABLE_P says the following: > If TARGET_HARD_REGNO_MODE_OK (r, mode1) and TARGET_HARD_REGNO_MODE_OK > (r, mode2) are always the same for any r, then TARGET_MODES_TIEABLE_P > (mode1, mode2) should be true. If they differ for any r, you should > define this hook to return false unless some other mechanism ensures > the accessibility of the value in a narrower mode. > > even though rs6000_hard_regno_mode_ok_uncached's comment has the following: > /* The float registers (except for VSX vector modes) can only hold floating > modes and DImode. */ That comment is incorrect. See fctiw for example, which defines only the SImode part of the result (the other bits are undefined). > TARGET_P8_VECTOR and TARGET_P9_VECTOR has special cased different modes now: > if (TARGET_P8_VECTOR && (mode == SImode)) > return 1; > > if (TARGET_P9_VECTOR && (mode == QImode || mode == HImode)) > return 1; > Which I suspect that means rs6000_modes_tieable_p should return true > for SImode and SFmode if TARGET_P8_VECTOR is true. Likewise for > TARGET_P9_VECTOR and SFmode and QImode/HImode too. 
It means that older CPUs do not have as many instructions to do scalar integer operations in vector registers, making it (almost) always a losing proposition to put scalar integers there. On newer CPUs it is not quite as bad, there is a full(er) complement of instructions to do such things in vector regs, just a bit slower than on GPRs. But yeah we might need to fix hard_regno_mode_ok if we change tieable. Segher
RE: [PATCH 2/4] Initial Emeraldrapids Support
"PATCH 2 Initial Emeraldrapids Support" aims to support Emeraldrapids for GCC. It's my mistake, resulting in the omission of its information. -Original Message- From: Liu, Hongtao Sent: Tuesday, January 3, 2023 4:48 PM To: Hu, Lin1 ; gcc-patches@gcc.gnu.org Cc: ubiz...@gmail.com Subject: RE: [PATCH 2/4] Initial Emeraldrapids Support There are actually only two patches, not four, and the subject *Patch 2/4* should be a typo. > -Original Message----- > From: Hu, Lin1 > Sent: Tuesday, January 3, 2023 4:37 PM > To: gcc-patches@gcc.gnu.org > Cc: Liu, Hongtao ; ubiz...@gmail.com > Subject: [PATCH 2/4] Initial Emeraldrapids Support > > gcc/ChangeLog: > > * common/config/i386/cpuinfo.h (get_intel_cpu): Handle Emeraldrapids. > * common/config/i386/i386-common.cc: Add Emeraldrapids. > --- > gcc/common/config/i386/cpuinfo.h | 2 ++ > gcc/common/config/i386/i386-common.cc | 2 ++ > 2 files changed, 4 insertions(+) > > diff --git a/gcc/common/config/i386/cpuinfo.h > b/gcc/common/config/i386/cpuinfo.h > index bde231c07ee..3729b0f14a5 100644 > --- a/gcc/common/config/i386/cpuinfo.h > +++ b/gcc/common/config/i386/cpuinfo.h > @@ -551,6 +551,8 @@ get_intel_cpu (struct __processor_model *cpu_model, >break; > case 0x8f: >/* Sapphire Rapids. */ > +case 0xcf: > + /* Emerald Rapids. 
*/ >cpu = "sapphirerapids"; >CHECK___builtin_cpu_is ("corei7"); >CHECK___builtin_cpu_is ("sapphirerapids"); diff --git > a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386- > common.cc index 7751265aff4..026926d8b41 100644 > --- a/gcc/common/config/i386/i386-common.cc > +++ b/gcc/common/config/i386/i386-common.cc > @@ -2465,6 +2465,8 @@ const pta processor_alias_table[] = > M_CPU_SUBTYPE (INTEL_COREI7_COOPERLAKE), P_PROC_AVX512F}, >{"sapphirerapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, > PTA_SAPPHIRERAPIDS, > M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, > + {"emeraldrapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, > PTA_SAPPHIRERAPIDS, > +M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, >{"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, > M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, >{"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, > -- > 2.18.2
[PATCH 2/4] Initial Emeraldrapids Support
gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_intel_cpu): Handle Emeraldrapids. * common/config/i386/i386-common.cc: Add Emeraldrapids. --- gcc/common/config/i386/cpuinfo.h | 2 ++ gcc/common/config/i386/i386-common.cc | 2 ++ 2 files changed, 4 insertions(+) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index bde231c07ee..3729b0f14a5 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -551,6 +551,8 @@ get_intel_cpu (struct __processor_model *cpu_model, break; case 0x8f: /* Sapphire Rapids. */ +case 0xcf: + /* Emerald Rapids. */ cpu = "sapphirerapids"; CHECK___builtin_cpu_is ("corei7"); CHECK___builtin_cpu_is ("sapphirerapids"); diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc index 7751265aff4..026926d8b41 100644 --- a/gcc/common/config/i386/i386-common.cc +++ b/gcc/common/config/i386/i386-common.cc @@ -2465,6 +2465,8 @@ const pta processor_alias_table[] = M_CPU_SUBTYPE (INTEL_COREI7_COOPERLAKE), P_PROC_AVX512F}, {"sapphirerapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, PTA_SAPPHIRERAPIDS, M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, + {"emeraldrapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, PTA_SAPPHIRERAPIDS, +M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F}, {"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2}, {"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE, -- 2.18.2
[PATCH 1/4] i386: Remove Meteorlake's family_model
Hi all, This patch updates the family/model handling for Meteor Lake by removing case 0xb5 from get_intel_cpu. Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Lin gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_intel_cpu): Remove case 0xb5 for meteorlake. --- gcc/common/config/i386/cpuinfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index 099a02467e6..bde231c07ee 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -540,7 +540,6 @@ get_intel_cpu (struct __processor_model *cpu_model, /* Alder Lake. */ case 0xb7: /* Raptor Lake. */ -case 0xb5: case 0xaa: case 0xac: /* Meteor Lake. */ -- 2.18.2
[PATCH] testsuite: Fix up avx256-unaligned-store-3.c test.
Hi all, This patch fixes two unexpected failures that the avx256-unaligned-store-3.c test reports when run with "-march=cascadelake". Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Lin gcc/testsuite/ChangeLog: PR target/94962 * gcc.target/i386/avx256-unaligned-store-3.c: Add -mno-avx512f. --- gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c index f909099bcb1..67635fb9e66 100644 --- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c +++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store -mtune=generic -fno-common" } */ +/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store -mtune=generic -fno-common -mno-avx512f" } */ #define N 1024 -- 2.18.2
RE: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
Hi Hongtao, I have modified this patch and regtested it on x86_64-pc-linux-gnu. BRs. Lin -Original Message- From: Hongtao Liu Sent: Friday, September 23, 2022 9:48 AM To: Hu, Lin1 Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao Subject: Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches wrote: > > Hi all, > > This patch aims to optimize code generation of > __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of > instructions required to achieve the final result. > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/94962 > * config/i386/constraints.md (BH): New define_constraint. > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when > operand matches new predicate. > (standard_sse_constant_opcode): Add new alternative branch to return > "vpcmpeqd". > * config/i386/predicates.md > (vector_all_ones_zero_extend_half_operand): New define_predicate. > (vector_all_ones_zero_extend_quarter_operand): Ditto. > * config/i386/sse.md: Add constraint to insn "mov_internal". (mov_internal): Add new constraint BH. Put the insn name first. > > gcc/testsuite/ChangeLog: > > PR target/94962 > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. > * gcc.target/i386/pr94962-1.c: New test. > * gcc.target/i386/pr94962-2.c: Ditto. > * gcc.target/i386/pr94962-3.c: Ditto. > * gcc.target/i386/pr94962-4.c: Ditto. 
> --- > gcc/config/i386/constraints.md| 8 +++ > gcc/config/i386/i386.cc | 26 +++- > gcc/config/i386/predicates.md | 49 ++ > gcc/config/i386/sse.md| 8 +-- > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > .../i386/avx256-unaligned-store-1.c | 4 +- > .../i386/avx256-unaligned-store-2.c | 4 +- > .../i386/avx256-unaligned-store-3.c | 4 +- > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 + > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++ > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++ > 12 files changed, 235 insertions(+), 13 deletions(-) create mode > 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > diff --git a/gcc/config/i386/constraints.md > b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 > --- a/gcc/config/i386/constraints.md > +++ b/gcc/config/i386/constraints.md > @@ -168,6 +168,9 @@ > ;; z Constant call address operand. > ;; C Integer SSE constant with all bits set operand. > ;; F Floating-point SSE constant with all bits set operand. > +;; H Integer SSE constant that is 128/256bit all ones > +;; and zero-extand to 256/512bit, or 128bit all ones > +;; and zero-extend to 512bit. > ;; M x86-64 memory operand. > > (define_constraint "Bf" > @@ -233,6 +236,11 @@ >(and (match_test "TARGET_SSE") > (match_operand 0 "float_vector_all_ones_operand"))) > > +(define_constraint "BH" > + "@internal integer constant with last half/quarter bits set operand." > + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > + (match_operand 0 > +"vector_all_ones_zero_extend_quarter_operand"))) > + > ;; NB: Similar to 'm', but don't use define_memory_constraint on > x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg > X))' > ;; where X is a base register. 
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index > dadf453d6c0..ca799da5d7e 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) >XFmode); } > > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 > +/* Return 1 if X is all bits 0, 2 if X is all bits 1 > + and 3 if X is all bits 1 with zero extend > in supported SSE/AVX vector mode. */ > > int > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) >
[PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
Hi all, This patch aims to optimize code generation of _mm256_zextsi128_si256(_mm_set1_epi8(-1)). It reduces the number of instructions required to achieve the final result. Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Lin gcc/ChangeLog: PR target/94962 * config/i386/constraints.md (BH): New define_constraint. * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when operand matches new predicate. (standard_sse_constant_opcode): Add new alternative branch to return "vpcmpeqd". * config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): New define_predicate. (vector_all_ones_zero_extend_quarter_operand): Ditto. * config/i386/sse.md: Add constraint to insn "mov_internal". gcc/testsuite/ChangeLog: PR target/94962 * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. * gcc.target/i386/pr94962-1.c: New test. * gcc.target/i386/pr94962-2.c: Ditto. * gcc.target/i386/pr94962-3.c: Ditto. * gcc.target/i386/pr94962-4.c: Ditto. 
--- gcc/config/i386/constraints.md| 8 +++ gcc/config/i386/i386.cc | 26 +++- gcc/config/i386/predicates.md | 49 ++ gcc/config/i386/sse.md| 8 +-- .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- .../i386/avx256-unaligned-store-1.c | 4 +- .../i386/avx256-unaligned-store-2.c | 4 +- .../i386/avx256-unaligned-store-3.c | 4 +- gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 + gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++ gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++ 12 files changed, 235 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -168,6 +168,9 @@ ;; z Constant call address operand. ;; C Integer SSE constant with all bits set operand. ;; F Floating-point SSE constant with all bits set operand. +;; H Integer SSE constant that is 128/256bit all ones +;; and zero-extand to 256/512bit, or 128bit all ones +;; and zero-extend to 512bit. ;; M x86-64 memory operand. (define_constraint "Bf" @@ -233,6 +236,11 @@ (and (match_test "TARGET_SSE") (match_operand 0 "float_vector_all_ones_operand"))) +(define_constraint "BH" + "@internal integer constant with last half/quarter bits set operand." + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") + (match_operand 0 "vector_all_ones_zero_extend_quarter_operand"))) + ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg X))' ;; where X is a base register. 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index dadf453d6c0..ca799da5d7e 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) XFmode); } -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 +/* Return 1 if X is all bits 0, 2 if X is all bits 1 + and 3 if X is all bits 1 with zero extend in supported SSE/AVX vector mode. */ int @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) } } + if (vector_all_ones_zero_extend_half_operand (x, mode) + || vector_all_ones_zero_extend_quarter_operand (x, mode)) +return 3; + return 0; } @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) gcc_unreachable (); } } + else if (vector_all_ones_zero_extend_half_operand (x, mode)) +{ + if (GET_MODE_SIZE (mode) == 64) + { + gcc_assert (TARGET_AVX512F); + return "vpcmpeqd \t %t0, %t0, %t0"; + } + else if (GET_MODE_SIZE (mode) == 32) + { + gcc_assert (TARGET_AVX); + return "vpcmpeqd \t %x0, %x0, %x0"; + } + gcc_unreachable (); +} + else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) +{ + gcc_assert (TARGET_AVX512F); + return "vpcmpeqd \t %x0, %x0, %x0"; +} gcc_unreachable (); } diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 4f16bb748b5..655eabf793b 100644 --- a/gcc/config/i386/predicates.md +++