[PATCH 1/3 v4] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-06-24 Thread Hu, Lin1
Hi,

This is the current version. 

I haven't made any major changes to the original code, so I think it will have
less impact on your code. I also think the current API is sufficient to support
the mode selection you mentioned; if you have any concerns, please mention
them and I can tweak it further.

BRs,
Lin

gcc/ChangeLog:

PR target/107432
* tree-vect-generic.cc
(expand_vector_conversion): Support convert for int -> int,
float -> float and int <-> float.
* tree-vect-stmts.cc (vectorizable_conversion): Wrap the
indirect convert part.
(supportable_indirect_convert_operation): New function.
* tree-vectorizer.h (supportable_indirect_convert_operation):
Define the new function.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-1.c: New test.
* gcc.target/i386/pr107432-2.c: Ditto.
* gcc.target/i386/pr107432-3.c: Ditto.
* gcc.target/i386/pr107432-4.c: Ditto.
* gcc.target/i386/pr107432-5.c: Ditto.
* gcc.target/i386/pr107432-6.c: Ditto.
* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +++
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 ++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 +++
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 150 
 gcc/tree-vect-generic.cc   |  34 ++-
 gcc/tree-vect-stmts.cc | 259 ++---
 gcc/tree-vectorizer.h  |   4 +
 10 files changed, 1013 insertions(+), 95 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
+}
+
+__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
+}
+
+__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
+}
+
+__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi);
+}
+
+__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8di)a, __v8qi);
+}
+
+__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{

RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-06-24 Thread Hu, Lin1
> -Original Message-
> From: Tamar Christina 
> Sent: Monday, June 24, 2024 10:12 PM
> To: Richard Biener ; Hu, Lin1 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> ubiz...@gmail.com
> Subject: RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> 
> int,
> float -> float and int <-> float.
> 
> > -Original Message-
> > From: Richard Biener 
> > Sent: Monday, June 24, 2024 1:34 PM
> > To: Hu, Lin1 
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > ubiz...@gmail.com
> > Subject: RE: [PATCH 1/3 v3] vect: generate suitable convert insn for
> > int -> int, float
> > -> float and int <-> float.
> >
> > On Thu, 20 Jun 2024, Hu, Lin1 wrote:
> >
> > > > >else if (ret_elt_bits > arg_elt_bits)
> > > > >  modifier = WIDEN;
> > > > >
> > > > > > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > > +{
> > > > > +  g = gimple_build_assign (lhs, code1, arg);
> > > > > +  gsi_replace (gsi, g, false);
> > > > > +  return;
> > > > > +}
> > > >
> > > > Given the API change I suggest below it might make sense to have
> > > > supportable_indirect_convert_operation do the above and represent
> > > > it as
> > single-
> > > > step conversion?
> > > >
> > >
> > > OK, if you want to supportable_indirect_convert_operation can do
> > > something like supportable_convert_operation, I'll give it a try.
> > > This functionality is really the part that this function can cover.
> > > But this would require some changes not only the API change, because
> > > supportable_indirect_convert_operation originally only supported
> > > Float
> > > -> Int or Int ->Float.
> >
> > I think I'd like to see a single API to handle direct and
> > (multi-)indirect-level converts that operate on vectors with all the
> > same number of lanes.
> >
> > > >
> > > > > +  code_helper code2 = ERROR_MARK, code3 = ERROR_MARK;
> > > > > +  int multi_step_cvt = 0;
> > > > > > +  vec<tree> interm_types = vNULL;
> > > > > > +  if (supportable_indirect_convert_operation (NULL,
> > > > > > +   code,
> > > > > > +   ret_type, arg_type,
> > > > > > +   &code2, &code3,
> > > > > > +   &multi_step_cvt,
> > > > > > +   &interm_types, arg))
> > > > > +{
> > > > > +  new_rhs = make_ssa_name (interm_types[0]);
> > > > > +  g = gimple_build_assign (new_rhs, (tree_code) code3, arg);
> > > > > +  gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > > > > +  g = gimple_build_assign (lhs, (tree_code) code2, new_rhs);
> > > > > +  gsi_replace (gsi, g, false);
> > > > > +  return;
> > > > > +}
> > > > > +
> > > > >if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> > > > FLOAT_EXPR))
> > > > >  {
> > > > > > -  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > > > > - {
> > > > > -   g = gimple_build_assign (lhs, code1, arg);
> > > > > -   gsi_replace (gsi, g, false);
> > > > > -   return;
> > > > > - }
> > > > >/* Can't use get_compute_type here, as
> supportable_convert_operation
> > > > >doesn't necessarily use an optab and needs two arguments.  */
> > > > >tree vec_compute_type
> > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > index 05a169ecb2d..0aa608202ca 100644
> > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo,
> > > > >tree scalar_dest;
> > > > >tree op0, op1 = NULL_TREE;
> > > > > >loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> > > > > -  tree_code tc1, tc2;
> > > > > +  tree_code tc1;
> > > > >code_helper code, code1, code2;
> > > > >code_helper codecvt1 = ERROR_MARK, c

RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-06-20 Thread Hu, Lin1
> >else if (ret_elt_bits > arg_elt_bits)
> >  modifier = WIDEN;
> >
> > +  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > +{
> > +  g = gimple_build_assign (lhs, code1, arg);
> > +  gsi_replace (gsi, g, false);
> > +  return;
> > +}
> 
> Given the API change I suggest below it might make sense to have
> supportable_indirect_convert_operation do the above and represent it as 
> single-
> step conversion?
>

OK, if you want supportable_indirect_convert_operation to also do what
supportable_convert_operation does, I'll give it a try. That functionality is
indeed something this function can cover. However, it would require more than
just the API change, because supportable_indirect_convert_operation originally
only supported Float -> Int or Int -> Float.
 
>
> > +  code_helper code2 = ERROR_MARK, code3 = ERROR_MARK;
> > +  int multi_step_cvt = 0;
> > +  vec<tree> interm_types = vNULL;
> > +  if (supportable_indirect_convert_operation (NULL,
> > + code,
> > + ret_type, arg_type,
> > + &code2, &code3,
> > + &multi_step_cvt,
> > + &interm_types, arg))
> > +{
> > +  new_rhs = make_ssa_name (interm_types[0]);
> > +  g = gimple_build_assign (new_rhs, (tree_code) code3, arg);
> > +  gsi_insert_before (gsi, g, GSI_SAME_STMT);
> > +  g = gimple_build_assign (lhs, (tree_code) code2, new_rhs);
> > +  gsi_replace (gsi, g, false);
> > +  return;
> > +}
> > +
> >if (modifier == NONE && (code == FIX_TRUNC_EXPR || code ==
> FLOAT_EXPR))
> >  {
> > -  if (supportable_convert_operation (code, ret_type, arg_type, &code1))
> > -   {
> > - g = gimple_build_assign (lhs, code1, arg);
> > - gsi_replace (gsi, g, false);
> > - return;
> > -   }
> >/* Can't use get_compute_type here, as supportable_convert_operation
> >  doesn't necessarily use an optab and needs two arguments.  */
> >tree vec_compute_type
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> > 05a169ecb2d..0aa608202ca 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -5175,7 +5175,7 @@ vectorizable_conversion (vec_info *vinfo,
> >tree scalar_dest;
> >tree op0, op1 = NULL_TREE;
> >loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> > -  tree_code tc1, tc2;
> > +  tree_code tc1;
> >code_helper code, code1, code2;
> >code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
> >tree new_temp;
> > @@ -5384,92 +5384,17 @@ vectorizable_conversion (vec_info *vinfo,
> > break;
> >}
> >
> > -  /* For conversions between float and integer types try whether
> > -we can use intermediate signed integer types to support the
> > -conversion.  */
> > -  if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
> > - && (code == FLOAT_EXPR ||
> > - (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
> > -   {
> > - bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE
> (lhs_mode);
> > - bool float_expr_p = code == FLOAT_EXPR;
> > - unsigned short target_size;
> > - scalar_mode intermediate_mode;
> > - if (demotion)
> > -   {
> > - intermediate_mode = lhs_mode;
> > - target_size = GET_MODE_SIZE (rhs_mode);
> > -   }
> > - else
> > -   {
> > - target_size = GET_MODE_SIZE (lhs_mode);
> > - if (!int_mode_for_size
> > - (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
> > -   goto unsupported;
> > -   }
> > - code1 = float_expr_p ? code : NOP_EXPR;
> > - codecvt1 = float_expr_p ? NOP_EXPR : code;
> > - opt_scalar_mode mode_iter;
> > - FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
> > -   {
> > - intermediate_mode = mode_iter.require ();
> > -
> > - if (GET_MODE_SIZE (intermediate_mode) > target_size)
> > -   break;
> > -
> > - scalar_mode cvt_mode;
> > - if (!int_mode_for_size
> > - (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
> > -   break;
> > -
> > - cvt_type = build_nonstandard_integer_type
> > -   (GET_MODE_BITSIZE (cvt_mode), 0);
> > -
> > - /* Check if the intermediate type can hold OP0's range.
> > -When converting from float to integer this is not necessary
> > -because values that do not fit the (smaller) target type are
> > -unspecified anyway.  */
> > - if (demotion && float_expr_p)
> > -   {
> > - wide_int op_min_value, op_max_value;
> > - if (!vect_get_range_info (op0, &op_min_value, &op_max_value))
> > -   break;
> > -
> > - if (cvt_type == NULL_TREE
> > - || (wi::min_precision (op_max_value, SIGNED)
> > - > TYPE_PRECISION 

RE: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-06-16 Thread Hu, Lin1
Ping this thread.

BRs,
Lin

-Original Message-
From: Hu, Lin1  
Sent: Tuesday, June 11, 2024 2:49 PM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao ; ubiz...@gmail.com; rguent...@suse.de
Subject: [PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, 
float -> float and int <-> float.

I wrapped part of the indirect-conversion code into a helper function. Its API
is modeled on supportable_narrowing_operation and
supportable_widening_operation.

BRs,
Lin

gcc/ChangeLog:

PR target/107432
* tree-vect-generic.cc
(expand_vector_conversion): Support convert for int -> int,
float -> float and int <-> float.
* tree-vect-stmts.cc (vectorizable_conversion): Wrap the
indirect convert part.
(supportable_indirect_convert_operation): New function.
* tree-vectorizer.h (supportable_indirect_convert_operation):
Define the new function.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-1.c: New test.
* gcc.target/i386/pr107432-2.c: Ditto.
* gcc.target/i386/pr107432-3.c: Ditto.
* gcc.target/i386/pr107432-4.c: Ditto.
* gcc.target/i386/pr107432-5.c: Ditto.
* gcc.target/i386/pr107432-6.c: Ditto.
* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 ++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +
 gcc/tree-vect-generic.cc   |  33 ++-
 gcc/tree-vect-stmts.cc | 244 +
 gcc/tree-vectorizer.h  |   9 +
 10 files changed, 1011 insertions(+), 92 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include 
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
+}
+
+__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
+}
+
+__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
+}
+
+__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i

[PATCH] i386: Refine all cvtt* instructions with UNSPEC instead of FIX/UNSIGNED_FIX.

2024-06-13 Thread Hu, Lin1
Hi, all

This patch aims to refine all cvtt* instructions with UNSPEC instead of
FIX/UNSIGNED_FIX, because the intrinsics should behave as documented.

Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

BRs,
Lin

gcc/ChangeLog:

PR target/115161
* config/i386/i386-builtin.def: Change CODE_FOR_* for cvtt*'s builtins.
* config/i386/sse.md
* 
(unspec_avx512fp16_fix_trunc2):
Use UNSPEC instead of FIX/UNSIGNED_FIX.
(unspec_avx512fp16_fix_trunc2):
Ditto.
(unspec_avx512fp16_fix_truncv2di2): Ditto.

(unspec_avx512fp16_fix_trunc2):
Ditto.
(unspec_sse_cvttps2pi): Ditto.
(unspec_sse_cvttss2si): Ditto.

(unspec_fix_truncv16sfv16si2):
Ditto.
(unspec_fix_truncv8sfv8si2): Ditto.
(unspec_fix_truncv4sfv4si2): Ditto.
(unspec_sse2_cvttpd2pi): Ditto.
(unspec_fixuns_truncv2dfv2si2): Ditto.
(unspec_avx512f_vcvttss2usi):
Ditto.
(unspec_avx512f_vcvttsd2usi):
Ditto.
(unspec_sse2_cvttsd2si): Ditto.

(unspec_fix_truncv8dfv8si2):
Ditto.
(*unspec_fixuns_truncv2dfv2si2): Ditto.
(unspec_fixuns_truncv2dfv2si2_mask): Ditto.
(unspec_fix_truncv4dfv4si2): Ditto.
(unspec_fixuns_truncv4dfv4si2): Ditto.

(unspec_fix_trunc2):
Ditto.

(unspec_fix_trunc2):
Ditto.
(unspec_avx512dq_fix_truncv2sfv2di2):
Ditto.

(unspec_fixuns_trunc2):
Ditto.
(unspec_sse2_cvttpd2dq): Ditto.

gcc/testsuite/ChangeLog:

PR target/115161
* gcc.target/i386/pr115161-1.c: New test.
---
 gcc/config/i386/i386-builtin.def   | 128 
 gcc/config/i386/sse.md | 335 +
 gcc/testsuite/gcc.target/i386/pr115161-1.c |  65 
 3 files changed, 464 insertions(+), 64 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115161-1.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 729355230b8..893e2baa006 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -631,9 +631,9 @@ BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_rcpv4sf2, 
"__builtin_ia32_rcpps", IX
 BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse_cvtps2pi, 
"__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) 
V2SI_FTYPE_V4SF)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_cvtss2si, 
"__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF)
 BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse_cvtss2siq, 
"__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) 
INT64_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse_cvttps2pi, 
"__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) 
V2SI_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_cvttss2si, 
"__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) 
INT_FTYPE_V4SF)
-BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, 0, 
CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", 
IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_unspec_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", 
IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_unspec_sse_cvttss2si, 
"__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) 
INT_FTYPE_V4SF)
+BDESC (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, 0, 
CODE_FOR_unspec_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", 
IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF)
 
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", 
IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
 
@@ -725,19 +725,19 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_floatv4siv4sf2, 
"__builtin_ia32_cvtdq2p
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_cvtpd2dq, 
"__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) 
V4SI_FTYPE_V2DF)
 BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse2_cvtpd2pi, 
"__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) 
V2SI_FTYPE_V2DF)
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_cvtpd2ps, 
"__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) 
V4SF_FTYPE_V2DF)
-BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_cvttpd2dq, 
"__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) 
V4SI_FTYPE_V2DF)
-BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_sse2_cvttpd2pi, 
"__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) 
V2SI_FTYPE_V2DF)
+BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_unspec_sse2_cvttpd2dq, 
"__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) 
V4SI_FTYPE_V2DF)
+BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_unspec_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", 

[PATCH 1/3 v3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-06-11 Thread Hu, Lin1
I wrapped part of the indirect-conversion code into a helper function. Its API
is modeled on supportable_narrowing_operation and
supportable_widening_operation.

BRs,
Lin

gcc/ChangeLog:

PR target/107432
* tree-vect-generic.cc
(expand_vector_conversion): Support convert for int -> int,
float -> float and int <-> float.
* tree-vect-stmts.cc (vectorizable_conversion): Wrap the
indirect convert part.
(supportable_indirect_convert_operation): New function.
* tree-vectorizer.h (supportable_indirect_convert_operation):
Define the new function.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-1.c: New test.
* gcc.target/i386/pr107432-2.c: Ditto.
* gcc.target/i386/pr107432-3.c: Ditto.
* gcc.target/i386/pr107432-4.c: Ditto.
* gcc.target/i386/pr107432-5.c: Ditto.
* gcc.target/i386/pr107432-6.c: Ditto.
* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 ++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +
 gcc/tree-vect-generic.cc   |  33 ++-
 gcc/tree-vect-stmts.cc | 244 +
 gcc/tree-vectorizer.h  |   9 +
 10 files changed, 1011 insertions(+), 92 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include 
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
+}
+
+__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
+}
+
+__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
+}
+
+__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi);
+}
+
+__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8di)a, __v8qi);
+}
+
+__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2hi);
+}
+
+__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4hi);
+}
+

RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-06-03 Thread Hu, Lin1
> -Original Message-
> From: Richard Biener 
> Sent: Monday, June 3, 2024 5:03 PM
> To: Hu, Lin1 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> ubiz...@gmail.com
> Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, 
> float
> -> float and int <-> float.
> 
> On Mon, 3 Jun 2024, Hu, Lin1 wrote:
> 
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Friday, May 31, 2024 8:41 PM
> > > To: Hu, Lin1 
> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > > ubiz...@gmail.com
> > > Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for
> > > int -> int, float
> > > -> float and int <-> float.
> > >
> > > On Fri, 31 May 2024, Hu, Lin1 wrote:
> > >
> > > > > -Original Message-
> > > > > From: Richard Biener 
> > > > > Sent: Wednesday, May 29, 2024 5:41 PM
> > > > > To: Hu, Lin1 
> > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao
> > > > > ; ubiz...@gmail.com
> > > > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn
> > > > > for int -> int, float
> > > > > -> float and int <-> float.
> > > > >
> > > > > On Thu, 23 May 2024, Hu, Lin1 wrote:
> > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR target/107432
> > > > > > * tree-vect-generic.cc
> > > > > > (supportable_indirect_narrowing_operation): New function for
> > > > > > support indirect narrowing convert.
> > > > > > (supportable_indirect_widening_operation): New function for
> > > > > > support indirect widening convert.
> > > > > > (expand_vector_conversion): Support convert for int -> int,
> > > > > > float -> float and int <-> float.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > PR target/107432
> > > > > > * gcc.target/i386/pr107432-1.c: New test.
> > > > > > * gcc.target/i386/pr107432-2.c: Ditto.
> > > > > > * gcc.target/i386/pr107432-3.c: Ditto.
> > > > > > * gcc.target/i386/pr107432-4.c: Ditto.
> > > > > > * gcc.target/i386/pr107432-5.c: Ditto.
> > > > > > * gcc.target/i386/pr107432-6.c: Ditto.
> > > > > > * gcc.target/i386/pr107432-7.c: Ditto.
> > > > > > ---
> > > > > > diff --git a/gcc/tree-vect-generic.cc
> > > > > > b/gcc/tree-vect-generic.cc index
> > > > > > ab640096ca2..0bedb53d9f9 100644
> > > > > > --- a/gcc/tree-vect-generic.cc
> > > > > > +++ b/gcc/tree-vect-generic.cc
> > > > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If
> > > > > > not see #include "gimple-match.h"
> > > > > >  #include "recog.h" /* FIXME: for insn_data */
> > > > > >  #include "optabs-libfuncs.h"
> > > > > > +#include "cfgloop.h"
> > > > > > +#include "tree-vectorizer.h"
> > > > > >
> > > > > >
> > > > > >  /* Build a ternary operation and gimplify it.  Emit code before 
> > > > > > GSI.
> > > > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> > > > > (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> > > > > >return gimplify_build2 (gsi, code, outer_type, b, c);  }
> > > > > >
> > > > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > > > +conversion
> > > > > for
> > > > > > +   float <-> int, like double -> char.  */ bool
> > > > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator 
> > > > > > *gsi,
> > > > > > +enum tree_code code,
> > > > > > +tree lhs,
> > > > > > +tree arg)
> > > > > > +{
> > > > > > +  gimple *g;
> > > > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > > > +  tree arg_type = TREE_TYPE (arg);
> > 

RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-06-03 Thread Hu, Lin1
> -Original Message-
> From: Richard Biener 
> Sent: Friday, May 31, 2024 8:41 PM
> To: Hu, Lin1 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> ubiz...@gmail.com
> Subject: RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, 
> float
> -> float and int <-> float.
> 
> On Fri, 31 May 2024, Hu, Lin1 wrote:
> 
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Wednesday, May 29, 2024 5:41 PM
> > > To: Hu, Lin1 
> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > > ubiz...@gmail.com
> > > Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for
> > > int -> int, float
> > > -> float and int <-> float.
> > >
> > > On Thu, 23 May 2024, Hu, Lin1 wrote:
> > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR target/107432
> > > > * tree-vect-generic.cc
> > > > (supportable_indirect_narrowing_operation): New function for
> > > > support indirect narrowing convert.
> > > > (supportable_indirect_widening_operation): New function for
> > > > support indirect widening convert.
> > > > (expand_vector_conversion): Support convert for int -> int,
> > > > float -> float and int <-> float.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/107432
> > > > * gcc.target/i386/pr107432-1.c: New test.
> > > > * gcc.target/i386/pr107432-2.c: Ditto.
> > > > * gcc.target/i386/pr107432-3.c: Ditto.
> > > > * gcc.target/i386/pr107432-4.c: Ditto.
> > > > * gcc.target/i386/pr107432-5.c: Ditto.
> > > > * gcc.target/i386/pr107432-6.c: Ditto.
> > > > * gcc.target/i386/pr107432-7.c: Ditto.
> > > > ---
> > > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> > > > index
> > > > ab640096ca2..0bedb53d9f9 100644
> > > > --- a/gcc/tree-vect-generic.cc
> > > > +++ b/gcc/tree-vect-generic.cc
> > > > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not
> > > > see #include "gimple-match.h"
> > > >  #include "recog.h" /* FIXME: for insn_data */
> > > >  #include "optabs-libfuncs.h"
> > > > +#include "cfgloop.h"
> > > > +#include "tree-vectorizer.h"
> > > >
> > > >
> > > >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > > > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> > > (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> > > >return gimplify_build2 (gsi, code, outer_type, b, c);  }
> > > >
> > > > +/* A subroutine of expand_vector_conversion, support indirect
> > > > +conversion
> > > for
> > > > +   float <-> int, like double -> char.  */ bool
> > > > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > > > +enum tree_code code,
> > > > +tree lhs,
> > > > +tree arg)
> > > > +{
> > > > +  gimple *g;
> > > > +  tree ret_type = TREE_TYPE (lhs);
> > > > +  tree arg_type = TREE_TYPE (arg);
> > > > +  tree new_rhs;
> > > > +
> > > > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > > > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > > > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >=
> > > arg_elt_bits)
> > > > +return false;
> > > > +
> > > > +  unsigned short target_size;
> > > > +  scalar_mode tmp_cvt_mode;
> > > > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > > > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));
> > > > + tree cvt_type = NULL_TREE;  tmp_cvt_mode = lhs_mode;
> > > > + target_size = GET_MODE_SIZE (rhs_mode);
> > > > +
> > > > +  opt_scalar_mode mode_iter;
> > > > +  enum tree_code tc1, tc2;
> > > > +  unsigned HOST_WIDE_INT nelts
> > > > += constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > > > +
> > > > +  FOR_EACH_2XWI

RE: [PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-05-31 Thread Hu, Lin1
> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, May 29, 2024 5:41 PM
> To: Hu, Lin1 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> ubiz...@gmail.com
> Subject: Re: [PATCH 1/3] vect: generate suitable convert insn for int -> int, 
> float
> -> float and int <-> float.
> 
> On Thu, 23 May 2024, Hu, Lin1 wrote:
> 
> > gcc/ChangeLog:
> >
> > PR target/107432
> > * tree-vect-generic.cc
> > (supportable_indirect_narrowing_operation): New function for
> > support indirect narrowing convert.
> > (supportable_indirect_widening_operation): New function for
> > support indirect widening convert.
> > (expand_vector_conversion): Support convert for int -> int,
> > float -> float and int <-> float.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/107432
> > * gcc.target/i386/pr107432-1.c: New test.
> > * gcc.target/i386/pr107432-2.c: Ditto.
> > * gcc.target/i386/pr107432-3.c: Ditto.
> > * gcc.target/i386/pr107432-4.c: Ditto.
> > * gcc.target/i386/pr107432-5.c: Ditto.
> > * gcc.target/i386/pr107432-6.c: Ditto.
> > * gcc.target/i386/pr107432-7.c: Ditto.
> > ---
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> > ab640096ca2..0bedb53d9f9 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -45,6 +45,8 @@ along with GCC; see the file COPYING3.  If not see
> > #include "gimple-match.h"
> >  #include "recog.h" /* FIXME: for insn_data */
> >  #include "optabs-libfuncs.h"
> > +#include "cfgloop.h"
> > +#include "tree-vectorizer.h"
> >
> >
> >  /* Build a ternary operation and gimplify it.  Emit code before GSI.
> > @@ -1834,6 +1836,142 @@ do_vec_narrow_conversion
> (gimple_stmt_iterator *gsi, tree inner_type, tree a,
> >return gimplify_build2 (gsi, code, outer_type, b, c);  }
> >
> > +/* A subroutine of expand_vector_conversion, support indirect conversion
> for
> > +   float <-> int, like double -> char.  */ bool
> > +supportable_indirect_narrowing_operation (gimple_stmt_iterator *gsi,
> > +enum tree_code code,
> > +tree lhs,
> > +tree arg)
> > +{
> > +  gimple *g;
> > +  tree ret_type = TREE_TYPE (lhs);
> > +  tree arg_type = TREE_TYPE (arg);
> > +  tree new_rhs;
> > +
> > +  unsigned int ret_elt_bits = vector_element_bits (ret_type);
> > + unsigned int arg_elt_bits = vector_element_bits (arg_type);  if
> > + (code != FIX_TRUNC_EXPR || flag_trapping_math || ret_elt_bits >=
> arg_elt_bits)
> > +return false;
> > +
> > +  unsigned short target_size;
> > +  scalar_mode tmp_cvt_mode;
> > +  scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (ret_type));
> > + scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (arg_type));  tree
> > + cvt_type = NULL_TREE;  tmp_cvt_mode = lhs_mode;  target_size =
> > + GET_MODE_SIZE (rhs_mode);
> > +
> > +  opt_scalar_mode mode_iter;
> > +  enum tree_code tc1, tc2;
> > +  unsigned HOST_WIDE_INT nelts
> > += constant_lower_bound (TYPE_VECTOR_SUBPARTS (arg_type));
> > +
> > +  FOR_EACH_2XWIDER_MODE (mode_iter, tmp_cvt_mode)
> > +{
> > +  tmp_cvt_mode = mode_iter.require ();
> > +
> > +  if (GET_MODE_SIZE (tmp_cvt_mode) > target_size)
> > +   break;
> > +
> > +  scalar_mode cvt_mode;
> > +  int tmp_cvt_size = GET_MODE_BITSIZE (tmp_cvt_mode);
> > > +  if (!int_mode_for_size (tmp_cvt_size, 0).exists (&cvt_mode))
> > +   break;
> > +
> > +  int cvt_size = GET_MODE_BITSIZE (cvt_mode);
> > +  bool isUnsigned = TYPE_UNSIGNED (ret_type) || TYPE_UNSIGNED
> (arg_type);
> > +  cvt_type = build_nonstandard_integer_type (cvt_size,
> > + isUnsigned);
> > +
> > +  cvt_type = build_vector_type (cvt_type, nelts);
> > +  if (cvt_type == NULL_TREE
> > > + || !supportable_convert_operation ((tree_code) NOP_EXPR,
> > > +ret_type,
> > > +cvt_type, &tc1)
> > > + || !supportable_convert_operation ((tree_code) code,
> > > +cvt_type,
> > > +arg_type, &tc2))
> > +   continue;
> > +
> > +  new_rhs = make_ssa_name (cvt_type);
> > +  g = vect_gimple_build (ne

[PATCH] i386: Handle target of __builtin_ia32_cmp[p|s][s|d] from avx into sse/sse2/avx

2024-05-29 Thread Hu, Lin1
Hi, all

This patch aims to extend __builtin_ia32_cmp[p|s][s|d] from avx to
sse/sse2/avx, where its immediate is in range of [0, 7].

Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

BRs,
Lin

gcc/ChangeLog:

* config/i386/avxintrin.h: Move cmp[p|s][s|d] to [e|x]mmintrin.h,
and move macros to xmmintrin.h
* config/i386/emmintrin.h: Add cmp[p|s]s intrins.
* config/i386/i386-builtin.def: Modify __builtin_ia32_cmp[p|s][s|d].
* config/i386/i386-expand.cc
(ix86_expand_args_builtin): Raise error when imm is in range of
[8, 32] without avx.
* config/i386/sse.md (avx_cmp3): Modify define_insn.
(avx_vmcmp3): Ditto.
* config/i386/xmmintrin.h (_CMP_EQ_OQ): New macro for sse/sse2.
(_CMP_LT_OS): Ditto
(_CMP_LE_OS): Ditto
(_CMP_UNORD_Q): Ditto
(_CMP_NEQ_UQ): Ditto
(_CMP_NLT_US): Ditto
(_CMP_NLE_US): Ditto
(_CMP_ORD_Q): Ditto
(_mm_cmp_ps): Move intrin from avxintrin.h to xmmintrin.h
(_mm_cmp_ss): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/sse-cmp-1.c: New test.
* gcc.target/i386/sse-cmp-2.c: Ditto.
* gcc.target/i386/sse-cmp-error-1.c: Ditto.
---
 gcc/config/i386/avxintrin.h   | 56 ---
 gcc/config/i386/emmintrin.h   | 22 +
 gcc/config/i386/i386-builtin.def  | 10 +-
 gcc/config/i386/i386-expand.cc|  6 ++
 gcc/config/i386/predicates.md |  5 +
 gcc/config/i386/sse.md| 42 
 gcc/config/i386/xmmintrin.h   | 41 
 gcc/testsuite/gcc.target/i386/sse-cmp-1.c | 20 
 gcc/testsuite/gcc.target/i386/sse-cmp-2.c | 96 +++
 gcc/testsuite/gcc.target/i386/sse-cmp-error.c | 16 
 10 files changed, 236 insertions(+), 78 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-cmp-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-cmp-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-cmp-error.c

diff --git a/gcc/config/i386/avxintrin.h b/gcc/config/i386/avxintrin.h
index 80214540888..ec9b9905b5f 100644
--- a/gcc/config/i386/avxintrin.h
+++ b/gcc/config/i386/avxintrin.h
@@ -72,22 +72,6 @@ typedef double __m256d_u __attribute__ ((__vector_size__ 
(32),
 
 /* Compare predicates for scalar and packed compare intrinsics.  */
 
-/* Equal (ordered, non-signaling)  */
-#define _CMP_EQ_OQ 0x00
-/* Less-than (ordered, signaling)  */
-#define _CMP_LT_OS 0x01
-/* Less-than-or-equal (ordered, signaling)  */
-#define _CMP_LE_OS 0x02
-/* Unordered (non-signaling)  */
-#define _CMP_UNORD_Q   0x03
-/* Not-equal (unordered, non-signaling)  */
-#define _CMP_NEQ_UQ0x04
-/* Not-less-than (unordered, signaling)  */
-#define _CMP_NLT_US0x05
-/* Not-less-than-or-equal (unordered, signaling)  */
-#define _CMP_NLE_US0x06
-/* Ordered (nonsignaling)   */
-#define _CMP_ORD_Q 0x07
 /* Equal (unordered, non-signaling)  */
 #define _CMP_EQ_UQ 0x08
 /* Not-greater-than-or-equal (unordered, signaling)  */
@@ -381,18 +365,6 @@ _mm256_xor_ps (__m256 __A, __m256 __B)
 }
 
 #ifdef __OPTIMIZE__
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
-_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
-{
-  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
-}
-
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
-_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
-{
-  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
-}
-
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
 {
@@ -406,27 +378,7 @@ _mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
   return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
   __P);
 }
-
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
-_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
-{
-  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
-}
-
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
-_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
-{
-  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
-}
 #else
-#define _mm_cmp_pd(X, Y, P)\
-  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),   \
-  (__v2df)(__m128d)(Y), (int)(P)))
-
-#define _mm_cmp_ps(X, Y, P)\
-  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (int)(P)))
-
 #define _mm256_cmp_pd(X, Y, P) \
   ((__m256d) 

[PATCH 3/3 v2] vect: support direct conversion under x86-64-v3.

2024-05-29 Thread Hu, Lin1
According to hongtao's suggestion, I support some trunc in mmx.md under
x86-64-v3, and optimize ix86_expand_trunc_with_avx2_noavx512f.

BRs,
Lin

gcc/ChangeLog:

PR 107432
* config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
New function to generate a series of suitable insns.
* config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
Define new function.
* config/i386/sse.md: Extend trunc2 for x86-64-v3.
(ssebytemode) Add V8HI.
(PMOV_DST_MODE_2_AVX2): New mode iterator.
(PMOV_SRC_MODE_3_AVX2): Ditto.
* config/i386/mmx.md
(trunc2): Ditto.
(avx512vl_trunc2): Ditto.
(truncv2si2): Ditto.
(avx512vl_truncv2si2): Ditto.
(mmxbytemode): New mode attr.

gcc/testsuite/ChangeLog:

PR 107432
* gcc.target/i386/pr107432-8.c: New test.
* gcc.target/i386/pr107432-9.c: Ditto.
* gcc.target/i386/pr92645-4.c: Modify test.
---
 gcc/config/i386/i386-expand.cc |  44 ++-
 gcc/config/i386/i386-protos.h  |   3 +
 gcc/config/i386/mmx.md |  35 +-
 gcc/config/i386/sse.md |  88 ++
 gcc/testsuite/gcc.target/i386/pr107432-8.c |  94 +++
 gcc/testsuite/gcc.target/i386/pr107432-9.c | 129 +
 gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
 7 files changed, 363 insertions(+), 32 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..90705803d29 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
   emit_insn (gen_xorv4si3 (value, value, large));
 }
 
-static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
-machine_mode mode, rtx target,
-rtx var, int one_var);
-
 /* Convert an unsigned DImode value into a DFmode, using only SSE.
Expects the 64-bit DImode to be supplied in a pair of integral
registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
@@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
whose ONE_VAR element is VAR, and other elements are zero.  Return true
if successful.  */
 
-static bool
+bool
 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
 rtx target, rtx var, int one_var)
 {
@@ -25551,4 +25547,42 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
   return ret;
 }
 
+/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
+
+void
+ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode 
cvt_mode)
+{
+  machine_mode out_mode = GET_MODE (output);
+  machine_mode in_mode = GET_MODE (input);
+  int len = GET_MODE_SIZE (in_mode);
+  gcc_assert (len == GET_MODE_SIZE (cvt_mode)
+ && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode)
+ && (REG_P (input) || SUBREG_P (input)));
+  scalar_mode inner_out_mode = GET_MODE_INNER (out_mode);
+  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
+  int out_innersize = GET_MODE_SIZE (inner_out_mode);
+
+  struct expand_vec_perm_d d;
+  d.target = gen_reg_rtx (cvt_mode);
+  d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode);
+  d.op1 = d.op0;
+  d.vmode = cvt_mode;
+  d.nelt = GET_MODE_NUNITS (cvt_mode);
+  d.testing_p = false;
+  d.one_operand_p = true;
+
+  /* Init perm. Put the needed bits of input in order and
+ fill the rest of bits by default.  */
+  for (int i = 0; i < d.nelt; ++i)
+{
+  d.perm[i] = i;
+  if (i < GET_MODE_NUNITS (out_mode))
+   d.perm[i] = i * (in_innersize / out_innersize);
+}
+
+  bool ok = ix86_expand_vec_perm_const_1(&d);
+  gcc_assert (ok);
+  emit_move_insn (output, gen_lowpart (out_mode, d.target));
+}
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..aa826f4864f 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, 
rtx, enum rtx_code,
 extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
  bool, rtx_code_label *);
 extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
+extern void ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx, machine_mode);
 extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
 extern bool ix86_memtag_can_tag_addresses (void);
 
@@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
 extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
  

[PATCH 2/3 v2] vect: Support v4hi -> v4qi.

2024-05-29 Thread Hu, Lin1
Besides adding TARGET_MMX_WITH_SSE, I merged the two patterns.

BRs,
Lin

gcc/ChangeLog:

PR target/107432
* config/i386/mmx.md
(VI2_32_64): New mode iterator.
(mmxhalfmode): New mode attr.
(mmxhalfmodelower): Ditto.
(truncv2hiv2qi2): Extend mode v4hi and change name from
truncv2hiv2qi to trunc2.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-1.c: Modify test.
* gcc.target/i386/pr107432-6.c: Add test.
---
 gcc/config/i386/mmx.md | 17 +
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 13 -
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 ---
 3 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 5f342497885..27b080bfeb6 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -67,6 +67,9 @@ (define_mode_iterator V2F_32 [V2HF V2BF])
 ;; 4-byte integer vector modes
 (define_mode_iterator VI_32 [V4QI V2HI])
 
+;; 8-byte and 4-byte HImode vector modes
+(define_mode_iterator VI2_32_64 [(V4HI "TARGET_MMX_WITH_SSE") V2HI])
+
 ;; 4-byte and 2-byte integer vector modes
 (define_mode_iterator VI_16_32 [V4QI V2QI V2HI])
 
@@ -106,6 +109,12 @@ (define_mode_attr mmxinsnmode
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
+(define_mode_attr mmxhalfmode
+  [(V4HI "V4QI") (V2HI "V2QI")])
+
+(define_mode_attr mmxhalfmodelower
+  [(V4HI "v4qi") (V2HI "v2qi")])
+
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr mmxintvecmode
   [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")
@@ -4873,10 +4882,10 @@ (define_expand "v2qiv2hi2"
   DONE;
 })
 
-(define_insn "truncv2hiv2qi2"
-  [(set (match_operand:V2QI 0 "register_operand" "=v")
-   (truncate:V2QI
- (match_operand:V2HI 1 "register_operand" "v")))]
+(define_insn "trunc2"
+  [(set (match_operand: 0 "register_operand" "=v")
+   (truncate:
+ (match_operand:VI2_32_64 1 "register_operand" "v")))]
   "TARGET_AVX512VL && TARGET_AVX512BW"
   "vpmovwb\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
index a4f37447eb4..afdf367afe2 100644
--- a/gcc/testsuite/gcc.target/i386/pr107432-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -7,7 +7,8 @@
 /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */
 
 #include 
 
@@ -113,6 +114,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi 
a)
   return __builtin_convertvector((__v2hi)a, __v2qi);
 }
 
+__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a)
+{
+  return __builtin_convertvector((__v4hi)a, __v4qi);
+}
+
 __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
 {
   return __builtin_convertvector((__v8hi)a, __v8qi);
@@ -218,6 +224,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu 
a)
   return __builtin_convertvector((__v2hu)a, __v2qu);
 }
 
+__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a)
+{
+  return __builtin_convertvector((__v4hu)a, __v4qu);
+}
+
 __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
 {
   return __builtin_convertvector((__v8hu)a, __v8qu);
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c 
b/gcc/testsuite/gcc.target/i386/pr107432-6.c
index 4a68a10b089..7d3717d45bc 100644
--- a/gcc/testsuite/gcc.target/i386/pr107432-6.c
+++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
@@ -8,11 +8,14 @@
 /* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } 
*/
 /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */
 /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } 
*/
-/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
-/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! ia32 } } } } 
*/
 /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */
 
 #include 
 
@@ -103,6 +106,11 @@ __v2qi 

[PATCH] i386: Optimize EQ/NE comparison between avx512 kmask and -1.

2024-05-28 Thread Hu, Lin1
Hi all,

This patch aims to achieve EQ/NE comparison between avx512 kmask and -1
by using kortest and checking CF.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk?

BRs,
Lin

gcc/ChangeLog:

PR target/113609
* config/i386/sse.md
(*kortest_cmp_setcc): New define_insn_and_split.
(*kortest_cmp_jcc): Ditto.

gcc/testsuite/ChangeLog:

PR target/113609
* gcc.target/i386/pr113609-1.c: New test.
* gcc.target/i386/pr113609-2.c: Ditto.
---
 gcc/config/i386/sse.md |  67 +++
 gcc/testsuite/gcc.target/i386/pr113609-1.c | 194 +
 gcc/testsuite/gcc.target/i386/pr113609-2.c | 161 +
 3 files changed, 422 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113609-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b59c988fc31..34fd2e4afac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2201,6 +2201,73 @@ (define_expand "kortest"
  UNSPEC_KORTEST))]
   "TARGET_AVX512F")
 
+;; Optimize cmp + setcc with mask register by kortest + setcc.
+(define_insn_and_split "*kortest_cmp_setcc"
+   [(set (match_operand:QI 0 "nonimmediate_operand" "=qm, qm")
+(match_operator:QI 1 "bt_comparison_operator"
+   [(match_operand:SWI1248_AVX512BWDQ_64 2 "register_operand" "?k, 
")
+(const_int -1)]))
+  (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  if (MASK_REGNO_P (REGNO (operands[2])))
+{
+  emit_insn (gen_kortest_ccc (operands[2], operands[2]));
+  operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG);
+}
+  else
+{
+  operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (operands[4],
+ gen_rtx_COMPARE (CCZmode,
+  operands[2],
+  constm1_rtx)));
+}
+  ix86_expand_setcc (operands[0],
+GET_CODE (operands[1]),
+operands[4],
+const0_rtx);
+  DONE;
+})
+
+;; Optimize cmp + jcc with mask register by kortest + jcc.
+(define_insn_and_split "*kortest_cmp_jcc"
+   [(set (pc)
+  (if_then_else
+   (match_operator 0 "bt_comparison_operator"
+ [(match_operand:SWI1248_AVX512BWDQ_64 1 "register_operand" "?k, ")
+  (const_int -1)])
+ (label_ref (match_operand 2))
+  (pc)))
+  (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  if (MASK_REGNO_P (REGNO (operands[1])))
+{
+  emit_insn (gen_kortest_ccc (operands[1], operands[1]));
+  operands[4] = gen_rtx_REG (CCCmode, FLAGS_REG);
+}
+  else
+{
+  operands[4] = gen_rtx_REG (CCZmode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (operands[4],
+ gen_rtx_COMPARE (CCZmode,
+  operands[1],
+  constm1_rtx)));
+}
+  ix86_expand_branch (GET_CODE (operands[0]),
+ operands[4],
+ const0_rtx,
+ operands[2]);
+  DONE;
+})
+
 (define_insn "kunpckhi"
   [(set (match_operand:HI 0 "register_operand" "=k")
(ior:HI
diff --git a/gcc/testsuite/gcc.target/i386/pr113609-1.c 
b/gcc/testsuite/gcc.target/i386/pr113609-1.c
new file mode 100644
index 000..f0639b8500a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113609-1.c
@@ -0,0 +1,194 @@
+/* PR target/113609 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4" } */
+/* { dg-final { scan-assembler-not "^cmp" } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+sete" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+setne" { target { ! ia32 } } } } 
*/
+/* { dg-final { scan-assembler-not "\[ \\t\]+je" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "\[ \\t\]+jne" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+sete" 1 { target { ia32 } } } } 
*/
+/* { dg-final { scan-assembler-times "\[ \\t\]+setne" 1 { target { ia32 } } } 
} */
+/* { dg-final { scan-assembler-times "\[ \\t\]+je" 1 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[ \\t\]+jne" 2 { target { ia32 } } } } 
*/
+/* { dg-final { scan-assembler-times "kortest" 12 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "kortest" 17 { target { ! ia32 } } } } */
+
+#include 
+
+unsigned int
+cmp_vector_sete_mask8(__m128i a, __m128i b)
+{
+__mmask8 k = _mm_cmpeq_epi16_mask (a, b);
+if (k == (__mmask8) -1)
+  return 1;
+else
+  return 0;
+}
+
+unsigned int
+cmp_vector_sete_mask16(__m128i a, __m128i b)
+{
+__mmask16 k = _mm_cmpeq_epi8_mask (a, b);
+if (k == (__mmask16) -1)
+  return 1;
+else
+  return 0;
+}

RE: [PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-23 Thread Hu, Lin1
> -Original Message-
> From: Hongtao Liu 
> Sent: Thursday, May 23, 2024 2:42 PM
> To: Hu, Lin1 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> ubiz...@gmail.com; rguent...@suse.de
> Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> 
> On Thu, May 23, 2024 at 2:38 PM Hu, Lin1  wrote:
> >
> > gcc/ChangeLog:
> >
> > PR 107432
> > * config/i386/i386-expand.cc 
> > (ix86_expand_trunc_with_avx2_noavx512f):
> > New function for generate a series of suitable insn.
> > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
> > Define new function.
> > * config/i386/sse.md: Extend trunc2 for x86-64-v3.
> I have some concern for this patch since
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this
> patch.

OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some 
better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5).
Or we can disable some of the optimization via vpermq. In pr107432-8.c, there 
are only 5 tests that use vpermq.

BRs,
Lin
 
> > gcc/testsuite/ChangeLog:
> >
> > PR 107432
> > * gcc.target/i386/pr107432-8.c: New test.
> > * gcc.target/i386/pr107432-9.c: Ditto.
> > * gcc.target/i386/pr92645-4.c: Modify test.
> > ---
> >  gcc/config/i386/i386-expand.cc |  47 +++-
> >  gcc/config/i386/i386-protos.h  |   3 +
> >  gcc/config/i386/sse.md |  87 +++
> >  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
> > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
> >  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
> >  6 files changed, 304 insertions(+), 29 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
> >emit_insn (gen_xorv4si3 (value, value, large));  }
> >
> > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> > -machine_mode mode, rtx 
> > target,
> > -rtx var, int one_var);
> > -
> >  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> > Expects the 64-bit DImode to be supplied in a pair of integral
> > registers.  Requires SSE2; will use SSE3 if available.  For
> > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool
> mmx_ok, machine_mode mode,
> > whose ONE_VAR element is VAR, and other elements are zero.  Return true
> > if successful.  */
> >
> > -static bool
> > +bool
> >  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
> >  rtx target, rtx var, int one_var)
> > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
> >return ret;
> >  }
> >
> > +/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
> > +
> > +bool
> > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) {
> > +  machine_mode out_mode = GET_MODE (output);
> > +  machine_mode in_mode = GET_MODE (input);
> > +  int len = GET_MODE_SIZE (in_mode);
> > +  gcc_assert (len == 16 || len == 32);
> > +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> > +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> > +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> > +
> > +  struct expand_vec_perm_d d;
> > +  d.target = gen_reg_rtx (cvt_mode);
> > +  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input),
> > + in_mode);
> > +  d.op1 = d.op0;
> > +  d.vmode = cvt_mode;
> > +  d.nelt = len;
> > +  d.testing_p = false;
> > +  d.one_operand_p = true;
> > +
> > +  /* Init perm. Put the needed bits of input in order and
> > + fill the rest of bits by default.  */  int tot = 0;  for (int i
> > + = 0; i < len; ++i)
> > +{
> > +  d.perm[i] = i;
> > +  if ((i % in_innersize) < out_innersize)
> > +   d.perm[tot++] = i;
> > +}
> > +
> > > +  if (ix86_expand_vec_perm_const_1(&d))
> > +{
> > +  emit_move_insn (outp

[PATCH 1/3] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-05-23 Thread Hu, Lin1
gcc/ChangeLog:

PR target/107432
* tree-vect-generic.cc
(supportable_indirect_narrowing_operation): New function for
support indirect narrowing convert.
(supportable_indirect_widening_operation): New function for
support indirect widening convert.
(expand_vector_conversion): Support convert for int -> int,
float -> float and int <-> float.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-1.c: New test.
* gcc.target/i386/pr107432-2.c: Ditto.
* gcc.target/i386/pr107432-3.c: Ditto.
* gcc.target/i386/pr107432-4.c: Ditto.
* gcc.target/i386/pr107432-5.c: Ditto.
* gcc.target/i386/pr107432-6.c: Ditto.
* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++
 gcc/tree-vect-generic.cc   | 157 +-
 8 files changed, 968 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
+}
+
+__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
+}
+
+__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
+}
+
+__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi);
+}
+
+__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8di)a, __v8qi);
+}
+
+__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2hi);
+}
+
+__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4hi);
+}
+
+__m128i mm256_cvtepi32_epi16_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v8si)a, __v8hi);
+}
+
+__m256i mm512_cvtepi32_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v16si)a, __v16hi);

[PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-23 Thread Hu, Lin1
gcc/ChangeLog:

PR 107432
* config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
New function to generate a series of suitable insns.
* config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
Define new function.
* config/i386/sse.md: Extend trunc2 for x86-64-v3.

gcc/testsuite/ChangeLog:

PR 107432
* gcc.target/i386/pr107432-8.c: New test.
* gcc.target/i386/pr107432-9.c: Ditto.
* gcc.target/i386/pr92645-4.c: Modify test.
---
 gcc/config/i386/i386-expand.cc |  47 +++-
 gcc/config/i386/i386-protos.h  |   3 +
 gcc/config/i386/sse.md |  87 +++
 gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
 gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
 gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
 6 files changed, 304 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..bca8b85c9d1 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
   emit_insn (gen_xorv4si3 (value, value, large));
 }
 
-static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
-machine_mode mode, rtx target,
-rtx var, int one_var);
-
 /* Convert an unsigned DImode value into a DFmode, using only SSE.
Expects the 64-bit DImode to be supplied in a pair of integral
registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
@@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
whose ONE_VAR element is VAR, and other elements are zero.  Return true
if successful.  */
 
-static bool
+bool
 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
 rtx target, rtx var, int one_var)
 {
@@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
   return ret;
 }
 
+/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
+
+bool
+ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input)
+{
+  machine_mode out_mode = GET_MODE (output);
+  machine_mode in_mode = GET_MODE (input);
+  int len = GET_MODE_SIZE (in_mode);
+  gcc_assert (len == 16 || len == 32);
+  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
+  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
+  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
+
+  struct expand_vec_perm_d d;
+  d.target = gen_reg_rtx (cvt_mode);
+  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode);
+  d.op1 = d.op0;
+  d.vmode = cvt_mode;
+  d.nelt = len;
+  d.testing_p = false;
+  d.one_operand_p = true;
+
+  /* Init perm. Put the needed bits of input in order and
+ fill the rest of bits by default.  */
+  int tot = 0;
+  for (int i = 0; i < len; ++i)
+{
+  d.perm[i] = i;
+  if ((i % in_innersize) < out_innersize)
+   d.perm[tot++] = i;
+}
+
+  if (ix86_expand_vec_perm_const_1 (&d))
+{
+  emit_move_insn (output, gen_lowpart (out_mode, d.target));
+  return true;
+}
+
+  return false;
+}
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..ac29fb34028 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, 
rtx, enum rtx_code,
 extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
  bool, rtx_code_label *);
 extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
+extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
 extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
 extern bool ix86_memtag_can_tag_addresses (void);
 
@@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
 extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
   rtx);
+extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx,
+rtx, int);
 extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
 
 /* In i386-c.cc  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f57f36ae380..0b14b3dc1ac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -14373,14 +14373,25 @@ (define_expand "avx512bw_v32hiv32qi2_mask_store"
 
 (define_mode_iterator PMOV_DST_MODE_2
   [V4SI V8HI (V16QI "TARGET_AVX512BW")])
+(define_mode_iterator PMOV_DST_MODE_2_AVX2
+  [V4SI V8HI V16QI])
 (define_mode_attr pmov_suff_2
   

[PATCH 2/3] vect: Support v4hi -> v4qi.

2024-05-23 Thread Hu, Lin1
gcc/ChangeLog:

PR target/107432
* config/i386/mmx.md (truncv4hiv4qi2): New define_insn.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-6.c: Add test.
---
 gcc/config/i386/mmx.md | 10 ++
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 12 +++-
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 19 ---
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 5f342497885..30f0d88af9f 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -4883,6 +4883,16 @@ (define_insn "truncv2hiv2qi2"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
 
+(define_insn "truncv4hiv4qi2"
+  [(set (match_operand:V4QI 0 "register_operand" "=v")
+   (truncate:V4QI
+ (match_operand:V4HI 1 "register_operand" "v")))]
+  "TARGET_AVX512VL && TARGET_AVX512BW"
+  "vpmovwb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
 (define_mode_iterator V2QI_V2HI [V2QI V2HI])
 (define_insn "truncv2si2"
   [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v")
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
index a4f37447eb4..e0c7ffc8e5b 100644
--- a/gcc/testsuite/gcc.target/i386/pr107432-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -7,7 +7,7 @@
 /* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 10 } } */
 
 #include <x86intrin.h>
 
@@ -113,6 +113,11 @@ __v2qi mm32_cvtepi16_epi8_builtin_convertvector(__v2hi 
a)
   return __builtin_convertvector((__v2hi)a, __v2qi);
 }
 
+__v4qi mm64_cvtepi16_epi8_builtin_convertvector(__v4hi a)
+{
+  return __builtin_convertvector((__v4hi)a, __v4qi);
+}
+
 __v8qi mm_cvtepi16_epi8_builtin_convertvector(__m128i a)
 {
   return __builtin_convertvector((__v8hi)a, __v8qi);
@@ -218,6 +223,11 @@ __v2qu mm32_cvtepu16_epu8_builtin_convertvector(__v2hu 
a)
   return __builtin_convertvector((__v2hu)a, __v2qu);
 }
 
+__v4qu mm64_cvtepu16_epu8_builtin_convertvector(__v4hu a)
+{
+  return __builtin_convertvector((__v4hu)a, __v4qu);
+}
+
 __v8qu mm_cvtepu16_epu8_builtin_convertvector(__m128i a)
 {
   return __builtin_convertvector((__v8hu)a, __v8qu);
diff --git a/gcc/testsuite/gcc.target/i386/pr107432-6.c 
b/gcc/testsuite/gcc.target/i386/pr107432-6.c
index 4a68a10b089..7d3717d45bc 100644
--- a/gcc/testsuite/gcc.target/i386/pr107432-6.c
+++ b/gcc/testsuite/gcc.target/i386/pr107432-6.c
@@ -8,11 +8,14 @@
 /* { dg-final { scan-assembler-times "vcvttps2dq" 4 { target { ! ia32 } } } } 
*/
 /* { dg-final { scan-assembler-times "vcvttps2udq" 3 { target { ia32 } } } } */
 /* { dg-final { scan-assembler-times "vcvttps2udq" 4 { target { ! ia32 } } } } 
*/
-/* { dg-final { scan-assembler-times "vcvttph2w" 4 } } */
-/* { dg-final { scan-assembler-times "vcvttph2uw" 4 } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2w" 5 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 4 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vcvttph2uw" 5 { target { ! ia32 } } } } 
*/
 /* { dg-final { scan-assembler-times "vpmovdb" 10 { target { ia32 } } } } */
 /* { dg-final { scan-assembler-times "vpmovdb" 14 { target { ! ia32 } } } } */
-/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 10 { target { ! ia32 } } } } */
 
 #include 
 
@@ -103,6 +106,11 @@ __v2qi mm32_cvtph_epi8_builtin_convertvector(__v2hf a)
   return __builtin_convertvector((__v2hf)a, __v2qi);
 }
 
+__v4qi mm64_cvtph_epi8_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector((__v4hf)a, __v4qi);
+}
+
 __v8qi mm128_cvtph_epi8_builtin_convertvector(__v8hf a)
 {
   return __builtin_convertvector((__v8hf)a, __v8qi);
@@ -123,6 +131,11 @@ __v2qu mm32_cvtph_epu8_builtin_convertvector(__v2hf a)
   return __builtin_convertvector((__v2hf)a, __v2qu);
 }
 
+__v4qu mm64_cvtph_epu8_builtin_convertvector(__v4hf a)
+{
+  return __builtin_convertvector((__v4hf)a, __v4qu);
+}
+
 __v8qu mm128_cvtph_epu8_builtin_convertvector(__v8hf a)
 {
   return __builtin_convertvector((__v8hf)a, __v8qu);
-- 
2.31.1



[PATCH 0/3] Optimize __builtin_convertvector for x86-64-v4 and x86-64-v3.

2024-05-23 Thread Hu, Lin1
This patch series improves __builtin_convertvector for x86-64-v4 and
x86-64-v3.

I modified the first patch according to Richard's suggestion and am sending
all of them out together to show my complete set of changes to the function.

They are bootstrapped and regtested on x86_64-pc-linux-gnu.

BRs,
Lin

Hu, Lin1 (3):
  vect: generate suitable convert insn for int -> int, float -> float
and int <-> float.
  vect: Support v4hi -> v4qi.
  vect: support direct conversion under x86-64-v3.

 gcc/config/i386/i386-expand.cc |  47 +++-
 gcc/config/i386/i386-protos.h  |   3 +
 gcc/config/i386/mmx.md |  10 +
 gcc/config/i386/sse.md |  87 ++--
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 244 +
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 ++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 152 +
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 +
 gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 ++
 gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 ++
 gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
 gcc/tree-vect-generic.cc   | 157 -
 15 files changed, 1305 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c

-- 
2.31.1



RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-05-14 Thread Hu, Lin1
> -----Original Message-----
> From: Richard Biener 
> Sent: Tuesday, May 14, 2024 8:23 PM
> To: Hu, Lin1 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> ubiz...@gmail.com
> Subject: RE: [PATCH] vect: generate suitable convert insn for int -> int, 
> float ->
> float and int <-> float.
> 
> On Tue, 14 May 2024, Hu, Lin1 wrote:
> 
> > Do you have any advice?
> >
> > BRs,
> > Lin
> >
> > -----Original Message-----
> > From: Hu, Lin1 
> > Sent: Wednesday, May 8, 2024 9:38 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Liu, Hongtao ; ubiz...@gmail.com
> > Subject: [PATCH] vect: generate suitable convert insn for int -> int, float 
> > ->
> float and int <-> float.
> >
> > Hi, all
> >
> > This patch aims to optimize __builtin_convertvector. We want the function
> can generate more efficient insn for some situations. Like v2si -> v2di.
> >
> > The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK
> for trunk?
> 
> I don't like the new code to be in a separate function, not integrated with 
> the
> existing handling.  Note the existing handling should get, say, V8DF -> V8SI
> correct for SSE by splitting the operation into smaller vectors but your code
> seems to just handle the cases the vectors are already properly sized.
>

Yes, my code only handles some cases; the others are still handled by the core
part of tree-vect-generic.cc. I just take care of some special cases up front.
So V8DF -> V8SI is still split into smaller vectors for SSE.

For SSE, I also have another patch that extends the environments in which a
direct optab is available, using ix86_expand_vec_perm_const_1 (...). That patch
hasn't been sent yet. I will send it out together with this one after I have
revised this patch, to give an overall view of my changes to this function.

> 
> Without checking it seems you are basing the code on what the vectorizer does?
> Maybe we should have some common code that computes intermediate
> conversion steps supported by the HW unifying what for example
> supportable_widening_operation or supportable_narrowing_operation can do
> to also cover int <-> float conversions.
>

Yes, my code is based on vectorizable_conversion (...). I will consider
splitting that function and defining some new functions, as you advise, to make
my code more generic.

BRs,
Lin
 
>
> That said, if you don't want to do that please still think about the core 
> part of
> tree-vect-generic.cc which is breaking down large emulated vectors into small
> supported vectors.
> 
> Richard.
> 
> > BRs,
> > Lin
> >
> > gcc/ChangeLog:
> >
> > PR target/107432
> > * tree-vect-generic.cc (expand_vector_conversion): Support
> > convert for int -> int, float -> float and int <-> float.
> > (expand_vector_conversion_no_vec_pack): Check if can convert
> > int <-> int, float <-> float and int <-> float, directly.
> > Support indirect convert, when direct optab is not supported.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/107432
> > * gcc.target/i386/pr107432-1.c: New test.
> > * gcc.target/i386/pr107432-2.c: Ditto.
> > * gcc.target/i386/pr107432-3.c: Ditto.
> > * gcc.target/i386/pr107432-4.c: Ditto.
> > * gcc.target/i386/pr107432-5.c: Ditto.
> > * gcc.target/i386/pr107432-6.c: Ditto.
> > * gcc.target/i386/pr107432-7.c: Ditto.
> > ---
> >  gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +
> gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +
> gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +
> gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +
> gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++
> gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 
> gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++
> >  gcc/tree-vect-generic.cc   | 107 +-
> >  8 files changed, 918 insertions(+), 6 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c
> > b/gcc/testsuite/gcc.target/i386/pr107432-1.c
> > new file mode 100644
> &g

RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-05-13 Thread Hu, Lin1
Do you have any advice?

BRs,
Lin

-----Original Message-----
From: Hu, Lin1  
Sent: Wednesday, May 8, 2024 9:38 AM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao ; ubiz...@gmail.com
Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> 
float and int <-> float.

Hi, all

This patch aims to optimize __builtin_convertvector. We want it to generate
more efficient insns in some situations, e.g. v2si -> v2di.

The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for 
trunk?

BRs,
Lin

gcc/ChangeLog:

PR target/107432
* tree-vect-generic.cc (expand_vector_conversion): Support
convert for int -> int, float -> float and int <-> float.
(expand_vector_conversion_no_vec_pack): Check if can convert
int <-> int, float <-> float and int <-> float, directly.
Support indirect convert, when direct optab is not supported.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-1.c: New test.
* gcc.target/i386/pr107432-2.c: Ditto.
* gcc.target/i386/pr107432-3.c: Ditto.
* gcc.target/i386/pr107432-4.c: Ditto.
* gcc.target/i386/pr107432-5.c: Ditto.
* gcc.target/i386/pr107432-6.c: Ditto.
* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +  
gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +  
gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +  
gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +  
gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++  
gcc/testsuite/gcc.target/i386/pr107432-6.c | 139   
gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++
 gcc/tree-vect-generic.cc   | 107 +-
 8 files changed, 918 insertions(+), 6 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 00000000000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } 
+} */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } 
+} */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } 
+} } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include <x86intrin.h>
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef 
+char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi 
+__attribute__ ((__vector_size__ (4))); typedef char __v8qi 
+__attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); 
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); 
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); 
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); 
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); 
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) {
+  return __builtin_convertvector((__v2di)a, __v2si); }
+
+__m128i mm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si); }
+
+__m256i mm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si); }
+
+__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi); }
+
+__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi); }
+
+__m128i mm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); }
+
+__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi); }
+
+__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v

RE: [committed] testsuite: Fix up pr84508* tests [PR84508]

2024-05-09 Thread Hu, Lin1
> -----Original Message-----
> From: Jakub Jelinek 
> Sent: Friday, May 10, 2024 3:04 AM
> To: Hongtao Liu 
> Cc: Hu, Lin1 ; gcc-patches@gcc.gnu.org; Liu, Hongtao
> ; ubiz...@gmail.com
> Subject: [committed] testsuite: Fix up pr84508* tests [PR84508]
> 
> On Thu, May 09, 2024 at 12:45:42PM +0800, Hongtao Liu wrote:
> > > PR target/84508
> > > * gcc.target/i386/pr84508-1.c: New test.
> > > * gcc.target/i386/pr84508-2.c: Ditto.
> 
> The tests FAIL on x86_64-linux with
> /usr/bin/ld: cannot find -lubsan
> collect2: error: ld returned 1 exit status compiler exited with status 1
> FAIL: gcc.target/i386/pr84508-1.c (test for excess errors) Excess errors:
> /usr/bin/ld: cannot find -lubsan
> 
> The problem is that only *.dg/ubsan/ubsan.exp calls ubsan_init which adds the
> needed search paths to libubsan library.
> So, link/run tests for -fsanitize=undefined need to go into gcc.dg/ubsan/ or
> g++.dg/ubsan/, even when they are target specific.
>

Oh, I get it, thanks.
 
>
> Tested on x86_64-linux with
> make check-gcc RUNTESTFLAGS='--target_board=unix\{-m32,-m64\}
> i386.exp=pr84508* ubsan.exp=pr84508*'
> and committed to trunk as obvious.
> 
> 2024-05-09  Jakub Jelinek  
> 
>   PR target/84508
>   * gcc.target/i386/pr84508-1.c: Move to ...
>   * gcc.dg/ubsan/pr84508-1.c: ... here.  Restrict to i?86/x86_64
>   non-ia32 targets.
>   * gcc.target/i386/pr84508-2.c: Move to ...
>   * gcc.dg/ubsan/pr84508-2.c: ... here.  Restrict to i?86/x86_64
>   non-ia32 targets.
> 
> diff --git a/gcc/testsuite/gcc.target/i386/pr84508-1.c
> b/gcc/testsuite/gcc.dg/ubsan/pr84508-1.c
> similarity index 74%
> rename from gcc/testsuite/gcc.target/i386/pr84508-1.c
> rename to gcc/testsuite/gcc.dg/ubsan/pr84508-1.c
> index bb3e28d017e..d781e01 100644
> --- a/gcc/testsuite/gcc.target/i386/pr84508-1.c
> +++ b/gcc/testsuite/gcc.dg/ubsan/pr84508-1.c
> @@ -1,5 +1,6 @@
> -/* { dg-do run { target { ! ia32 } } } */
> +/* { dg-do run { target { { i?86-*-* x86_64-*-* } && { ! ia32 } } } }
> +*/
>  /* { dg-options "-fsanitize=undefined" } */
> +
>  #include 
> 
>  int main()
> diff --git a/gcc/testsuite/gcc.target/i386/pr84508-2.c
> b/gcc/testsuite/gcc.dg/ubsan/pr84508-2.c
> similarity index 73%
> rename from gcc/testsuite/gcc.target/i386/pr84508-2.c
> rename to gcc/testsuite/gcc.dg/ubsan/pr84508-2.c
> index 32a8f20a536..cf9c7db1d15 100644
> --- a/gcc/testsuite/gcc.target/i386/pr84508-2.c
> +++ b/gcc/testsuite/gcc.dg/ubsan/pr84508-2.c
> @@ -1,5 +1,6 @@
> -/* { dg-do run { target { ! ia32 } } } */
> +/* { dg-do run { target { { i?86-*-* x86_64-*-* } && { ! ia32 } } } }
> +*/
>  /* { dg-options "-fsanitize=undefined" } */
> +
>  #include 
> 
>  int main()
> 
>   Jakub



[PATCH] i386: Fix some intrinsics without alignment requirements.

2024-05-07 Thread Hu, Lin1
Hi all,

This patch aims to fix some intrinsics that have no alignment requirement but
nevertheless raise a runtime error when given an unaligned pointer.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

BRs,
Lin

gcc/ChangeLog:

PR target/84508
* config/i386/emmintrin.h
(_mm_load_sd): Remove alignment requirement.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
(_mm_storel_pd): Add alignment requirement.
* config/i386/xmmintrin.h
(_mm_loadh_pi): Remove alignment requirement.
(_mm_loadl_pi): Ditto.
(_mm_load_ss): Ditto.
(_mm_store_ss): Ditto.

gcc/testsuite/ChangeLog:

PR target/84508
* gcc.target/i386/pr84508-1.c: New test.
* gcc.target/i386/pr84508-2.c: Ditto.
---
 gcc/config/i386/emmintrin.h   | 11 ++-
 gcc/config/i386/xmmintrin.h   |  9 +
 gcc/testsuite/gcc.target/i386/pr84508-1.c | 11 +++
 gcc/testsuite/gcc.target/i386/pr84508-2.c | 11 +++
 4 files changed, 33 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr84508-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr84508-2.c

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index 915a5234c38..d7fc1af9687 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,6 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), 
__may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
__may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
+typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -145,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_sd (double const *__P)
 {
-  return _mm_set_sd (*__P);
+  return __extension__ (__m128d){ *(double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -180,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *__P = ((__v2df)__A)[0];
+  *(double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -192,7 +193,7 @@ _mm_cvtsd_f64 (__m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_storel_pd (double *__P, __m128d __A)
 {
-  _mm_store_sd (__P, __A);
+  *__P = ((__v2df)__A)[0];
 }
 
 /* Stores the upper DPFP value.  */
@@ -973,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+  return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 71b9955b843..9e20f262839 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -73,6 +73,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
__may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
+typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Internal data types for implementing the intrinsics.  */
 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
@@ -774,7 +775,7 @@ _mm_unpacklo_ps (__m128 __A, __m128 __B)
 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
the lower two values are passed through from A.  */
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
-_mm_loadh_pi (__m128 __A, __m64 const *__P)
+_mm_loadh_pi (__m128 __A, __m64_u const *__P)
 {
   return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
 }
@@ -803,7 +804,7 @@ _mm_movelh_ps (__m128 __A, __m128 __B)
 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
the upper two values are passed through from A.  */
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
-_mm_loadl_pi (__m128 

[PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.

2024-05-07 Thread Hu, Lin1
Hi, all

This patch aims to optimize __builtin_convertvector. We want it to generate
more efficient insns in some situations, e.g. v2si -> v2di.

The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for
trunk?

BRs,
Lin

gcc/ChangeLog:

PR target/107432
* tree-vect-generic.cc (expand_vector_conversion): Support
convert for int -> int, float -> float and int <-> float.
(expand_vector_conversion_no_vec_pack): Check if can convert
int <-> int, float <-> float and int <-> float, directly.
Support indirect convert, when direct optab is not supported.

gcc/testsuite/ChangeLog:

PR target/107432
* gcc.target/i386/pr107432-1.c: New test.
* gcc.target/i386/pr107432-2.c: Ditto.
* gcc.target/i386/pr107432-3.c: Ditto.
* gcc.target/i386/pr107432-4.c: Ditto.
* gcc.target/i386/pr107432-5.c: Ditto.
* gcc.target/i386/pr107432-6.c: Ditto.
* gcc.target/i386/pr107432-7.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 +
 gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 +
 gcc/testsuite/gcc.target/i386/pr107432-3.c |  55 +
 gcc/testsuite/gcc.target/i386/pr107432-4.c |  56 +
 gcc/testsuite/gcc.target/i386/pr107432-5.c |  72 +++
 gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 
 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++
 gcc/tree-vect-generic.cc   | 107 +-
 8 files changed, 918 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c

diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c 
b/gcc/testsuite/gcc.target/i386/pr107432-1.c
new file mode 100644
index 000..a4f37447eb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c
@@ -0,0 +1,234 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */
+/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */
+
+#include 
+
+typedef short __v2hi __attribute__ ((__vector_size__ (4)));
+typedef char __v2qi __attribute__ ((__vector_size__ (2)));
+typedef char __v4qi __attribute__ ((__vector_size__ (4)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+
+typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4)));
+typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8)));
+typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2)));
+typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4)));
+typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8)));
+typedef unsigned int __v2su __attribute__ ((__vector_size__ (8)));
+
+__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2si);
+}
+
+__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a)
+{
+  return (__m128i)__builtin_convertvector((__v4di)a, __v4si);
+}
+
+__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a)
+{
+  return (__m256i)__builtin_convertvector((__v8di)a, __v8si);
+}
+
+__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2hi);
+}
+
+__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4hi);
+}
+
+__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a)
+{
+  return (__m128i)__builtin_convertvector((__v8di)a, __v8hi);
+}
+
+__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v2di)a, __v2qi);
+}
+
+__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a)
+{
+  return __builtin_convertvector((__v4di)a, __v4qi);
+}
+
+__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a)
+{
+  return __builtin_convertvector((__v8di)a, __v8qi);
+}
+
+__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a)
+{
+  return __builtin_convertvector((__v2si)a, __v2hi);
+}
+
+__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a)
+{
+  return __builtin_convertvector((__v4si)a, __v4hi);
+}
+
+__m128i

[PATCH] i386: Fix CPUID of USER_MSR.

2023-11-28 Thread Hu, Lin1
Hi, all

This patch fixes the wrong CPUID of USER_MSR: its correct CPUID is
(0x7, 0x1).EDX[15], but I had set it as (0x7, 0x0).EDX[15]. The patch also
modifies the testcase to give the user a better example.

It has been bootstrapped and regtested on x86-64-pc-linux-gnu, OK for trunk?

BR,
Lin

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Move USER_MSR
to the correct location.

gcc/testsuite/ChangeLog:

* gcc.target/i386/user_msr-1.c: Correct the MSR index to give the user
a proper example.
---
 gcc/common/config/i386/cpuinfo.h   | 4 ++--
 gcc/testsuite/gcc.target/i386/user_msr-1.c | 9 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index f90fb4d56a2..a1eb285daed 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -861,8 +861,6 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_IBT);
   if (edx & bit_UINTR)
set_feature (FEATURE_UINTR);
-  if (edx & bit_USER_MSR)
-   set_feature (FEATURE_USER_MSR);
   if (amx_usable)
{
  if (edx & bit_AMX_TILE)
@@ -921,6 +919,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_PREFETCHI);
  if (eax & bit_RAOINT)
set_feature (FEATURE_RAOINT);
+ if (edx & bit_USER_MSR)
+   set_feature (FEATURE_USER_MSR);
  if (avx_usable)
{
  if (eax & bit_AVXVNNI)
diff --git a/gcc/testsuite/gcc.target/i386/user_msr-1.c 
b/gcc/testsuite/gcc.target/i386/user_msr-1.c
index 447852306df..f315016d088 100644
--- a/gcc/testsuite/gcc.target/i386/user_msr-1.c
+++ b/gcc/testsuite/gcc.target/i386/user_msr-1.c
@@ -1,9 +1,9 @@
 /* { dg-do compile { target { ! ia32  }  }  } */
 /* { dg-options "-musermsr -O2"  } */
 /* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\%r\[a-z\]x, 
\\%r\[a-z\]x" 1  }  } */
-/* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\\$121" 1  }  } */
+/* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\\$6912" 1  }  } */
 /* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, 
\\%r\[a-z\]x" 1  }  } */
-/* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, \\\$121" 1  
}  } */
+/* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, \\\$6912" 1  
}  } */
 
 #include 
 
@@ -13,8 +13,9 @@ volatile unsigned long long y;
 void extern
 user_msr_test (void)
 {
+  y = 6913;
   x = _urdmsr(y);
-  x = _urdmsr(121);
+  x = _urdmsr(6912);
   _uwrmsr(y, x);
-  _uwrmsr(121, x);
+  _uwrmsr(6912, x);
 }
-- 
2.31.1



RE: [PATCH] Avoid generate vblendps with ymm16+

2023-11-12 Thread Hu, Lin1
On Saturday, November 11, 2023 4:11 AM,  Jakub Jelinek  wrote:
> On Thu, Nov 09, 2023 at 03:27:11PM +0800, Hongtao Liu wrote:
> > On Thu, Nov 9, 2023 at 3:15 PM Hu, Lin1  wrote:
> > >
> > > This patch aims to avoid generate vblendps with ymm16+, And have
> > > bootstrapped and tested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > > PR target/112435
> > > * config/i386/sse.md: Adding constraints to restrict the 
> > > generation of
> > > vblendps.
> > It should be "Don't output vblendps when evex sse reg or gpr32 is involved."
> > Others LGTM.
> 
> I've missed this patch, so wrote my own today, and am wondering
> 
> 1) if it isn't better to use separate alternative instead of
>x86_evex_reg_mentioned_p, like in the patch below
> 2) why do you need the last two hunks in sse.md, both avx2_permv2ti and
>*avx_vperm2f128_nozero insns only use x in constraints, never v,
>so x86_evex_reg_mentioned_p ought to be always false there
>

Yes, I think your method is better. As for the second question, I did not pay
attention to the constraints when I worked on this problem, so I have learned
something useful here. Feel free to upstream your patch.

BRs,
Lin
 
>
> Here is the untested patch, of course you have more testcases (though, I 
> think it
> is better to test dg-do assemble with avx512vl target rather than dg-do 
> compile
> and scan the assembler, after all, the problem was that it didn't assemble).
> 
> 2023-11-10  Jakub Jelinek  
> 
>   PR target/112435
>   * config/i386/sse.md
> (avx512vl_shuf_32x4_1,
>   avx512dq_shuf_64x2_1):
> Add
>   alternative with just x instead of v constraints and use vblendps
>   as optimization only with that alternative.
> 
>   * gcc.target/i386/avx512vl-pr112435.c: New test.
> 
> --- gcc/config/i386/sse.md.jj 2023-11-09 09:04:18.616543403 +0100
> +++ gcc/config/i386/sse.md2023-11-10 15:56:44.138499931 +0100
> @@ -19235,11 +19235,11 @@ (define_expand "avx512dq_shuf_  })
> 
>  (define_insn
> "avx512dq_shuf_64x2_1"
> -  [(set (match_operand:VI8F_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI8F_256 0 "register_operand" "=x,v")
>   (vec_select:VI8F_256
> (vec_concat:
> - (match_operand:VI8F_256 1 "register_operand" "v")
> - (match_operand:VI8F_256 2 "nonimmediate_operand" "vm"))
> + (match_operand:VI8F_256 1 "register_operand" "x,v")
> + (match_operand:VI8F_256 2 "nonimmediate_operand" "xm,vm"))
> (parallel [(match_operand 3 "const_0_to_3_operand")
>(match_operand 4 "const_0_to_3_operand")
>(match_operand 5 "const_4_to_7_operand") @@ -19254,7
> +19254,7 @@ (define_insn "avx512dq_shu
>mask = INTVAL (operands[3]) / 2;
>mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
>operands[3] = GEN_INT (mask);
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && ! && which_alternative
> + == 0)
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>return
> "vshuf64x2\t{%3, %2, %1, %0|%0 d7>, %1, %2, %3}";  } @@ -19386,11 +19386,11 @@ (define_expand
> "avx512vl_shuf_  })
> 
>  (define_insn "avx512vl_shuf_32x4_1"
> -  [(set (match_operand:VI4F_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI4F_256 0 "register_operand" "=x,v")
>   (vec_select:VI4F_256
> (vec_concat:
> - (match_operand:VI4F_256 1 "register_operand" "v")
> - (match_operand:VI4F_256 2 "nonimmediate_operand" "vm"))
> + (match_operand:VI4F_256 1 "register_operand" "x,v")
> + (match_operand:VI4F_256 2 "nonimmediate_operand" "xm,vm"))
> (parallel [(match_operand 3 "const_0_to_7_operand")
>(match_operand 4 "const_0_to_7_operand")
>(match_operand 5 "const_0_to_7_operand") @@ -19414,7
> +19414,7 @@ (define_insn "avx512vl_shuf_mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
>operands[3] = GEN_INT (mask);
> 
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && ! && which_alternative
> + == 0)
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> 
>return
> "vshuf32x4\t{%3, %2, %1, %0|%0 nd11>, %1, %2, %3}";
> --- gcc/testsuite/gcc.target/i386/avx512vl-pr112435.c.jj  2023-11-10
> 16:04:21.708046771 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-pr112435.c 2023-11-10
> 16:03:51.053479094 +0100
> @@ -0,0 +1,13 @@
> +/* PR target/112435 */
> +/* { dg-do assemble { target { avx512vl && { ! ia32 } } } } */
> +/* { dg-options "-mavx512vl -O2" } */
> +
> +#include 
> +
> +__m256i
> +foo (__m256i a, __m256i b)
> +{
> +  register __m256i c __asm__("ymm16") = a;
> +  asm ("" : "+v" (c));
> +  return _mm256_shuffle_i32x4 (c, b, 2); }
> 
>   Jakub



[PATCH] Avoid generate vblendps with ymm16+

2023-11-08 Thread Hu, Lin1
This patch aims to avoid generating vblendps with ymm16+. It has been
bootstrapped and tested on x86_64-pc-linux-gnu{-m32,-m64}. OK for trunk?

gcc/ChangeLog:

PR target/112435
* config/i386/sse.md: Adding constraints to restrict the generation of
vblendps.

gcc/testsuite/ChangeLog:

PR target/112435
* gcc.target/i386/pr112435-1.c: New test.
* gcc.target/i386/pr112435-2.c: Ditto.
* gcc.target/i386/pr112435-3.c: Ditto.
---
 gcc/config/i386/sse.md | 28 +---
 gcc/testsuite/gcc.target/i386/pr112435-1.c | 14 
 gcc/testsuite/gcc.target/i386/pr112435-2.c | 64 ++
 gcc/testsuite/gcc.target/i386/pr112435-3.c | 79 ++
 4 files changed, 175 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 33198756bb0..666f931c88d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -19254,7 +19254,8 @@
   mask = INTVAL (operands[3]) / 2;
   mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
   operands[3] = GEN_INT (mask);
-  if (INTVAL (operands[3]) == 2 && !)
+  if (INTVAL (operands[3]) == 2 && !
+  && !x86_evex_reg_mentioned_p (operands, 3))
 return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
   return "vshuf64x2\t{%3, %2, %1, 
%0|%0, %1, %2, %3}";
 }
@@ -19414,7 +19415,8 @@
   mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
   operands[3] = GEN_INT (mask);
 
-  if (INTVAL (operands[3]) == 2 && !)
+  if (INTVAL (operands[3]) == 2 && !
+  && !x86_evex_reg_mentioned_p (operands, 3))
 return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
 
   return "vshuf32x4\t{%3, %2, %1, 
%0|%0, %1, %2, %3}";
@@ -26776,10 +26778,13 @@
else
  return "vmovaps\t{%2, %0|%0, %2}";
   }
-if ((mask & 0xbb) == 18)
-  return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
-if ((mask & 0xbb) == 48)
-  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+if (!x86_evex_reg_mentioned_p (operands, 3))
+  {
+   if ((mask & 0xbb) == 18)
+ return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+   if ((mask & 0xbb) == 48)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+  }
 return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
   }
   [(set_attr "type" "sselog")
@@ -27433,10 +27438,13 @@
&& avx_vperm2f128_parallel (operands[3], mode)"
 {
   int mask = avx_vperm2f128_parallel (operands[3], mode) - 1;
-  if ((mask & 0xbb) == 0x12)
-return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
-  if ((mask & 0xbb) == 0x30)
-return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+  if (!x86_evex_reg_mentioned_p (operands, 3))
+{
+  if ((mask & 0xbb) == 0x12)
+   return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+  if ((mask & 0xbb) == 0x30)
+   return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+}
   if ((mask & 0xbb) == 0x20)
 return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
   operands[3] = GEN_INT (mask);
diff --git a/gcc/testsuite/gcc.target/i386/pr112435-1.c 
b/gcc/testsuite/gcc.target/i386/pr112435-1.c
new file mode 100644
index 000..ff56523b4e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112435-1.c
@@ -0,0 +1,14 @@
+/* PR target/112435 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vblendps" } } */
+
+#include
+
+__m256i
+f(__m256i a, __m256i  b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_shuffle_i32x4 (t, b, 2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr112435-2.c 
b/gcc/testsuite/gcc.target/i386/pr112435-2.c
new file mode 100644
index 000..27ba80b1e68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112435-2.c
@@ -0,0 +1,64 @@
+/* PR target/112435 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vblendps.*ymm17\$" } } */
+
+#include
+
+/* Vpermi128/Vpermf128 */
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2x128_si256 (t, b, 50);
+}
+
+__m256i
+perm1 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2x128_si256 (t, b, 18);
+}
+
+__m256i
+perm2 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_permute2x128_si256 (t, b, 48);
+}
+
+/* vshuf{i,f}{32x4,64x2} ymm .*/
+__m256i
+shuff0 (__m256i a, __m256i b)
+{
+  register __m256i t __asm__("ymm17") = a;
+  asm("":"+v"(t));
+  return _mm256_shuffle_i32x4(t, b, 2);
+}
+
+__m256
+shuff1 (__m256 a, __m256 b)
+{
+  register __m256 t __asm__("ymm17") = a;
+  

[PATCH] Fix testcases that are raised by support -mevex512

2023-10-11 Thread Hu, Lin1
Hi, all

This patch aims to fix some scan-assembler failures in pr89229-{5,6,7}b.c.
They fail because we emit scalar vmovs{s,d} here when trying to use
xmm/ymm16+ without avx512vl but with avx512f+evex512.

If no one objects to this change in behavior, we intend to resolve these
failures by modifying the testcases.

BRs,
Lin

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr89229-5b.c: Modify test.
* gcc.target/i386/pr89229-6b.c: Ditto.
* gcc.target/i386/pr89229-7b.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr89229-5b.c | 2 +-
 gcc/testsuite/gcc.target/i386/pr89229-6b.c | 2 +-
 gcc/testsuite/gcc.target/i386/pr89229-7b.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr89229-5b.c 
b/gcc/testsuite/gcc.target/i386/pr89229-5b.c
index 261f2e12e8d..8a81585e790 100644
--- a/gcc/testsuite/gcc.target/i386/pr89229-5b.c
+++ b/gcc/testsuite/gcc.target/i386/pr89229-5b.c
@@ -3,4 +3,4 @@
 
 #include "pr89229-5a.c"
 
-/* { dg-final { scan-assembler-times 
"vmovdqa32\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */
+/* { dg-final { scan-assembler-times 
"vmovsd\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-6b.c 
b/gcc/testsuite/gcc.target/i386/pr89229-6b.c
index a74f7169e6e..0c27daa4f74 100644
--- a/gcc/testsuite/gcc.target/i386/pr89229-6b.c
+++ b/gcc/testsuite/gcc.target/i386/pr89229-6b.c
@@ -3,4 +3,4 @@
 
 #include "pr89229-6a.c"
 
-/* { dg-final { scan-assembler-times 
"vmovaps\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */
+/* { dg-final { scan-assembler-times 
"vmovss\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-7b.c 
b/gcc/testsuite/gcc.target/i386/pr89229-7b.c
index d3a56e6e2b7..baba99ec775 100644
--- a/gcc/testsuite/gcc.target/i386/pr89229-7b.c
+++ b/gcc/testsuite/gcc.target/i386/pr89229-7b.c
@@ -3,4 +3,4 @@
 
 #include "pr89229-7a.c"
 
-/* { dg-final { scan-assembler-times 
"vmovdqa32\[^\n\r]*zmm1\[67]\[^\n\r]*zmm1\[67]" 1 } } */
+/* { dg-final { scan-assembler-times 
"vmovss\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]\[^\n\r]*xmm1\[67]" 1 } } */
-- 
2.31.1



RE: [PATCH] Support Intel USER_MSR

2023-10-10 Thread Hu, Lin1
There are some typos in /gcc/doc/extend.texi and /gcc/doc/invoke.texi: they
should say USER_MSR, not UMSR. I have fixed them in my branch.

-Original Message-
From: Hu, Lin1  
Sent: Tuesday, October 10, 2023 3:47 PM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao ; ubiz...@gmail.com
Subject: [PATCH] Support Intel USER_MSR

This patch aims to support Intel USER_MSR.

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Detect USER_MSR.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_USER_MSR_SET): 
New.
(OPTION_MASK_ISA2_USER_MSR_UNSET): Ditto.
(ix86_handle_option): Handle -musermsr.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_USER_MSR.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for usermsr.
* config.gcc: Add usermsrintrin.h
* config/i386/cpuid.h (bit_USER_MSR): New.
* config/i386/i386-builtin-types.def:
Add DEF_FUNCTION_TYPE (VOID, UINT64, UINT64).
* config/i386/i386-builtins.cc (ix86_init_mmx_sse_builtins):
Add __builtin_urdmsr and __builtin_uwrmsr.
* config/i386/i386-builtins.h (ix86_builtins):
Add IX86_BUILTIN_URDMSR and IX86_BUILTIN_UWRMSR.
* config/i386/i386-c.cc (ix86_target_macros_internal):
Define __USER_MSR__.
* config/i386/i386-expand.cc (ix86_expand_builtin):
Handle new builtins.
* config/i386/i386-isa.def (USER_MSR): Add DEF_PTA(USER_MSR).
* config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p):
Handle usermsr.
* config/i386/i386.md (urdmsr): New define_insn.
(uwrmsr): Ditto.
* config/i386/i386.opt: Add option -musermsr.
* config/i386/x86gprintrin.h: Include usermsrintrin.h
* doc/extend.texi: Document usermsr.
* doc/invoke.texi: Document -musermsr.
* doc/sourcebuild.texi: Document target usermsr.
* config/i386/usermsrintrin.h: New file.

gcc/testsuite/ChangeLog:

* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/x86gprintrin-1.c: Add -musermsr for 64bit target.
* gcc.target/i386/x86gprintrin-2.c: Ditto.
* gcc.target/i386/x86gprintrin-3.c: Ditto.
* gcc.target/i386/x86gprintrin-4.c: Add musermsr for 64bit target.
* gcc.target/i386/x86gprintrin-5.c: Ditto
* gcc.target/i386/user_msr-1.c: New test.
* gcc.target/i386/user_msr-2.c: Ditto.
---
 gcc/common/config/i386/cpuinfo.h  |  2 +
 gcc/common/config/i386/i386-common.cc | 15 +
 gcc/common/config/i386/i386-cpuinfo.h |  1 +
 gcc/common/config/i386/i386-isas.h|  1 +
 gcc/config.gcc|  3 +-
 gcc/config/i386/cpuid.h   |  1 +
 gcc/config/i386/i386-builtin-types.def|  3 +
 gcc/config/i386/i386-builtins.cc  |  8 +++
 gcc/config/i386/i386-builtins.h   |  2 +
 gcc/config/i386/i386-c.cc |  2 +
 gcc/config/i386/i386-expand.cc| 35 +++
 gcc/config/i386/i386-isa.def  |  1 +
 gcc/config/i386/i386-options.cc   |  4 +-
 gcc/config/i386/i386.md   | 24 
 gcc/config/i386/i386.opt  |  4 ++
 gcc/config/i386/usermsrintrin.h   | 60 +++
 gcc/config/i386/x86gprintrin.h|  2 +
 gcc/doc/extend.texi   |  5 ++
 gcc/doc/invoke.texi   |  6 +-
 gcc/doc/sourcebuild.texi  |  3 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |  2 +
 gcc/testsuite/gcc.target/i386/user_msr-1.c| 20 +++
 gcc/testsuite/gcc.target/i386/user_msr-2.c| 16 +
 .../gcc.target/i386/x86gprintrin-1.c  |  2 +-
 .../gcc.target/i386/x86gprintrin-2.c  |  6 +-
 .../gcc.target/i386/x86gprintrin-3.c  | 28 -
 .../gcc.target/i386/x86gprintrin-4.c  | 32 +-
 .../gcc.target/i386/x86gprintrin-5.c  |  6 +-
 28 files changed, 286 insertions(+), 8 deletions(-)
 create mode 100644 gcc/config/i386/usermsrintrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-2.c

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 141d3743316..0f86b44730b 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -838,6 +838,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_IBT);
   if (edx & bit_UINTR)
set_feature (FEATURE_UINTR);
+  if (edx & bit_USER_MSR)
+   set_feature (FEATURE_USER_MSR);
   if (amx_usable)
{
  if (edx & bit_AMX_TILE)
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 684b0451bb3..13e423deceb 1006

[PATCH] Support Intel USER_MSR

2023-10-10 Thread Hu, Lin1
This patch aims to support Intel USER_MSR.

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Detect USER_MSR.
* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_USER_MSR_SET): 
New.
(OPTION_MASK_ISA2_USER_MSR_UNSET): Ditto.
(ix86_handle_option): Handle -musermsr.
* common/config/i386/i386-cpuinfo.h (enum processor_features):
Add FEATURE_USER_MSR.
* common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for usermsr.
* config.gcc: Add usermsrintrin.h
* config/i386/cpuid.h (bit_USER_MSR): New.
* config/i386/i386-builtin-types.def:
Add DEF_FUNCTION_TYPE (VOID, UINT64, UINT64).
* config/i386/i386-builtins.cc (ix86_init_mmx_sse_builtins):
Add __builtin_urdmsr and __builtin_uwrmsr.
* config/i386/i386-builtins.h (ix86_builtins):
Add IX86_BUILTIN_URDMSR and IX86_BUILTIN_UWRMSR.
* config/i386/i386-c.cc (ix86_target_macros_internal):
Define __USER_MSR__.
* config/i386/i386-expand.cc (ix86_expand_builtin):
Handle new builtins.
* config/i386/i386-isa.def (USER_MSR): Add DEF_PTA(USER_MSR).
* config/i386/i386-options.cc (ix86_valid_target_attribute_inner_p):
Handle usermsr.
* config/i386/i386.md (urdmsr): New define_insn.
(uwrmsr): Ditto.
* config/i386/i386.opt: Add option -musermsr.
* config/i386/x86gprintrin.h: Include usermsrintrin.h
* doc/extend.texi: Document usermsr.
* doc/invoke.texi: Document -musermsr.
* doc/sourcebuild.texi: Document target usermsr.
* config/i386/usermsrintrin.h: New file.

gcc/testsuite/ChangeLog:

* gcc.target/i386/funcspec-56.inc: Add new target attribute.
* gcc.target/i386/x86gprintrin-1.c: Add -musermsr for 64bit target.
* gcc.target/i386/x86gprintrin-2.c: Ditto.
* gcc.target/i386/x86gprintrin-3.c: Ditto.
* gcc.target/i386/x86gprintrin-4.c: Add musermsr for 64bit target.
* gcc.target/i386/x86gprintrin-5.c: Ditto
* gcc.target/i386/user_msr-1.c: New test.
* gcc.target/i386/user_msr-2.c: Ditto.
---
 gcc/common/config/i386/cpuinfo.h  |  2 +
 gcc/common/config/i386/i386-common.cc | 15 +
 gcc/common/config/i386/i386-cpuinfo.h |  1 +
 gcc/common/config/i386/i386-isas.h|  1 +
 gcc/config.gcc|  3 +-
 gcc/config/i386/cpuid.h   |  1 +
 gcc/config/i386/i386-builtin-types.def|  3 +
 gcc/config/i386/i386-builtins.cc  |  8 +++
 gcc/config/i386/i386-builtins.h   |  2 +
 gcc/config/i386/i386-c.cc |  2 +
 gcc/config/i386/i386-expand.cc| 35 +++
 gcc/config/i386/i386-isa.def  |  1 +
 gcc/config/i386/i386-options.cc   |  4 +-
 gcc/config/i386/i386.md   | 24 
 gcc/config/i386/i386.opt  |  4 ++
 gcc/config/i386/usermsrintrin.h   | 60 +++
 gcc/config/i386/x86gprintrin.h|  2 +
 gcc/doc/extend.texi   |  5 ++
 gcc/doc/invoke.texi   |  6 +-
 gcc/doc/sourcebuild.texi  |  3 +
 gcc/testsuite/gcc.target/i386/funcspec-56.inc |  2 +
 gcc/testsuite/gcc.target/i386/user_msr-1.c| 20 +++
 gcc/testsuite/gcc.target/i386/user_msr-2.c| 16 +
 .../gcc.target/i386/x86gprintrin-1.c  |  2 +-
 .../gcc.target/i386/x86gprintrin-2.c  |  6 +-
 .../gcc.target/i386/x86gprintrin-3.c  | 28 -
 .../gcc.target/i386/x86gprintrin-4.c  | 32 +-
 .../gcc.target/i386/x86gprintrin-5.c  |  6 +-
 28 files changed, 286 insertions(+), 8 deletions(-)
 create mode 100644 gcc/config/i386/usermsrintrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/user_msr-2.c

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 141d3743316..0f86b44730b 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -838,6 +838,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_IBT);
   if (edx & bit_UINTR)
set_feature (FEATURE_UINTR);
+  if (edx & bit_USER_MSR)
+   set_feature (FEATURE_USER_MSR);
   if (amx_usable)
{
  if (edx & bit_AMX_TILE)
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 684b0451bb3..13e423deceb 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -125,6 +125,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_SM4_SET OPTION_MASK_ISA2_SM4
 #define OPTION_MASK_ISA2_APX_F_SET OPTION_MASK_ISA2_APX_F
 #define OPTION_MASK_ISA2_EVEX512_SET OPTION_MASK_ISA2_EVEX512
+#define 

RE: [PATCH 00/18] Support -mevex512 for AVX512

2023-09-27 Thread Hu, Lin1
Hi, 

Thanks for your reply.

I'd like to verify that our understanding of your requirements is correct, and 
that __EVEX256__ can be considered a default macro to determine whether the 
compiler supports the __EVEX***__ series of switches. 

For example:

I have a segment of code like:
#if defined(__EVEX512__):
__mm512.*__;
#else
__mm256.*__;
#endif

But __EVEX512__ being undefined doesn't mean I only need 256-bit; maybe I am
using gcc-13, in which case I can still use 512-bit.

So the code should be:
#if defined(__EVEX512__):
__mm512.*__;
#elif defined(__EVEX256__):
__mm256.*__;
#else
__mm512.*__;
#endif

If we understand correctly, we'll consider the request. But since we're about 
to have a vacation, follow-up replies may be a bit slower.

BRs,
Lin

-Original Message-
From: ZiNgA BuRgA  
Sent: Thursday, September 28, 2023 8:32 AM
To: Hu, Lin1 ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH 00/18] Support -mevex512 for AVX512

Thanks for the new patch!

I see that there's a new __EVEX512__ define.  Will there be some __EVEX256__ 
(or maybe some max EVEX width) define, so that code can detect whether the 
compiler supports AVX10.1/256 without resorting to version checks?




[PATCH 11/18] [PATCH 5/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 156 +++
 1 file changed, 78 insertions(+), 78 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 8250e2998cd..b90d5ccc969 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1568,9 +1568,9 @@ BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_copysignv8df3
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, 
UNKNOWN, (int) V8DF_FTYPE_V8DF)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, CODE_FOR_sqrtv16sf2, 
"__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF)
 BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf, 
"__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, 
"__builtin_ia32_floorph512", IX86_BUILTIN_FLOORPH512, (enum rtx_code) 
ROUND_FLOOR, (int) V32HF_FTYPE_V32HF_ROUND)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, 
"__builtin_ia32_ceilph512", IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, 
(int) V32HF_FTYPE_V32HF_ROUND)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512bw_rndscalev32hf, 
"__builtin_ia32_truncph512", IX86_BUILTIN_TRUNCPH512, (enum rtx_code) 
ROUND_TRUNC, (int) V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_floorph512", 
IX86_BUILTIN_FLOORPH512, (enum rtx_code) ROUND_FLOOR, (int) 
V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_ceilph512", 
IX86_BUILTIN_CEILPH512, (enum rtx_code) ROUND_CEIL, (int) 
V32HF_FTYPE_V32HF_ROUND)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_rndscalev32hf, "__builtin_ia32_truncph512", 
IX86_BUILTIN_TRUNCPH512, (enum rtx_code) ROUND_TRUNC, (int) 
V32HF_FTYPE_V32HF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_roundps512, "__builtin_ia32_floorps512", 
IX86_BUILTIN_FLOORPS512, (enum rtx_code) ROUND_FLOOR, (int) 
V16SF_FTYPE_V16SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_roundps512, "__builtin_ia32_ceilps512", 
IX86_BUILTIN_CEILPS512, (enum rtx_code) ROUND_CEIL, (int) 
V16SF_FTYPE_V16SF_ROUND)
 BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_roundps512, "__builtin_ia32_truncps512", 
IX86_BUILTIN_TRUNCPS512, (enum rtx_code) ROUND_TRUNC, (int) 
V16SF_FTYPE_V16SF_ROUND)
@@ -2874,40 +2874,40 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, 
"__builtin_ia32_cvtbf2sf
 /* AVX512FP16.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", 
IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv16hf3_mask, "__builtin_ia32_addph256_mask", 
IX86_BUILTIN_ADDPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv32hf3_mask, 
"__builtin_ia32_addph512_mask", IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) 
V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_addv32hf3_mask, "__builtin_ia32_addph512_mask", 
IX86_BUILTIN_ADDPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_subv8hf3_mask, "__builtin_ia32_subph128_mask", 
IX86_BUILTIN_SUBPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_subv16hf3_mask, "__builtin_ia32_subph256_mask", 
IX86_BUILTIN_SUBPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_subv32hf3_mask, 
"__builtin_ia32_subph512_mask", IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) 
V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
+BDESC (0, OPTION_MASK_ISA2_AVX512FP16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_subv32hf3_mask, "__builtin_ia32_subph512_mask", 
IX86_BUILTIN_SUBPH512_MASK, UNKNOWN, (int) V32HF_FTYPE_V32HF_V32HF_V32HF_USI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_mulv8hf3_mask, "__builtin_ia32_mulph128_mask", 
IX86_BUILTIN_MULPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_mulv16hf3_mask, "__builtin_ia32_mulph256_mask", 
IX86_BUILTIN_MULPH256_MASK, UNKNOWN, (int) V16HF_FTYPE_V16HF_V16HF_V16HF_UHI)
-BDESC (0, OPTION_MASK_ISA2_AVX512FP16, 

[PATCH 12/18] Disable zmm register and 512 bit libmvec call when !TARGET_EVEX512

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_broadcast_from_constant):
Disable zmm broadcast for !TARGET_EVEX512.
* config/i386/i386-options.cc (ix86_option_override_internal):
Do not use PVW_512 when no-evex512.
(ix86_simd_clone_adjust): Add evex512 target into string.
* config/i386/i386.cc (type_natural_mode): Report ABI warning
when using zmm register w/o evex512.
(ix86_return_in_memory): Do not allow zmm when !TARGET_EVEX512.
(ix86_hard_regno_mode_ok): Ditto.
(ix86_set_reg_reg_cost): Ditto.
(ix86_rtx_costs): Ditto.
(ix86_vector_mode_supported_p): Ditto.
(ix86_preferred_simd_mode): Ditto.
(ix86_get_mask_mode): Ditto.
(ix86_simd_clone_compute_vecsize_and_simdlen): Disable 512 bit
libmvec call when !TARGET_EVEX512.
(ix86_simd_clone_usable): Ditto.
* config/i386/i386.h (BIGGEST_ALIGNMENT): Disable 512 alignment
when !TARGET_EVEX512.
(MOVE_MAX): Do not use PVW_512 when !TARGET_EVEX512.
(STORE_MAX_PIECES): Ditto.
---
 gcc/config/i386/i386-expand.cc  |  1 +
 gcc/config/i386/i386-options.cc | 14 +
 gcc/config/i386/i386.cc | 53 ++---
 gcc/config/i386/i386.h  |  7 +++--
 4 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index e42ff27c6ef..6eedcb384c0 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -611,6 +611,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op)
  avx512 embed broadcast is available.  */
   if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
   && (!TARGET_AVX512F
+ || (GET_MODE_SIZE (mode) == 64 && !TARGET_EVEX512)
  || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
 return nullptr;
 
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index a1a7a92da9f..e2a90d7d9e2 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2845,7 +2845,8 @@ ix86_option_override_internal (bool main_args_p,
  opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
  if (opts_set->x_ix86_move_max == PVW_NONE)
{
- if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
+ if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
+ && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
  else
opts->x_ix86_move_max = PVW_AVX128;
@@ -2866,7 +2867,8 @@ ix86_option_override_internal (bool main_args_p,
  opts->x_ix86_store_max = opts->x_prefer_vector_width_type;
  if (opts_set->x_ix86_store_max == PVW_NONE)
{
- if (TARGET_AVX512F_P (opts->x_ix86_isa_flags))
+ if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
+ && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_store_max = PVW_AVX512;
  else
opts->x_ix86_store_max = PVW_AVX128;
@@ -3145,13 +3147,13 @@ ix86_simd_clone_adjust (struct cgraph_node *node)
 case 'e':
   if (TARGET_PREFER_AVX256)
{
- if (!TARGET_AVX512F)
-   str = "avx512f,prefer-vector-width=512";
+ if (!TARGET_AVX512F || !TARGET_EVEX512)
+   str = "avx512f,evex512,prefer-vector-width=512";
  else
str = "prefer-vector-width=512";
}
-  else if (!TARGET_AVX512F)
-   str = "avx512f";
+  else if (!TARGET_AVX512F || !TARGET_EVEX512)
+   str = "avx512f,evex512";
   break;
 default:
   gcc_unreachable ();
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 477e6cecc38..0df3bf10547 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -1924,7 +1924,8 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS 
*cum,
if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
&& GET_MODE_INNER (mode) == innermode)
  {
-   if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
+   if (size == 64 && (!TARGET_AVX512F || !TARGET_EVEX512)
+   && !TARGET_IAMCU)
  {
static bool warnedavx512f;
static bool warnedavx512f_ret;
@@ -4347,7 +4348,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype 
ATTRIBUTE_UNUSED)
 
  /* AVX512F values are returned in ZMM0 if available.  */
  if (size == 64)
-   return !TARGET_AVX512F;
+   return !TARGET_AVX512F || !TARGET_EVEX512;
}
 
   if (mode == XFmode)
@@ -20286,7 +20287,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, 
machine_mode mode)
  - any of 512-bit wide vector mode
  - any scalar mode.  */
   if (TARGET_AVX512F
- && (VALID_AVX512F_REG_OR_XI_MODE (mode)
+  

[PATCH 13/18] Support -mevex512 for AVX512F intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtins.cc
(ix86_vectorize_builtin_gather): Disable 512 bit gather
when !TARGET_EVEX512.
* config/i386/i386-expand.cc (ix86_valid_mask_cmp_mode):
Add TARGET_EVEX512.
(ix86_expand_int_sse_cmp): Ditto.
(ix86_expand_vector_init_one_nonzero): Disable subroutine
when !TARGET_EVEX512.
(ix86_emit_swsqrtsf): Add TARGET_EVEX512.
(ix86_vectorize_vec_perm_const): Disable subroutine when
!TARGET_EVEX512.
* config/i386/i386.cc
(standard_sse_constant_p): Add TARGET_EVEX512.
(standard_sse_constant_opcode): Ditto.
(ix86_get_ssemov): Ditto.
(ix86_legitimate_constant_p): Ditto.
(ix86_vectorize_builtin_scatter): Disable 512 bit scatter
when !TARGET_EVEX512.
* config/i386/i386.md (avx512f_512): New.
(movxi): Add TARGET_EVEX512.
(*movxi_internal_avx512f): Ditto.
(*movdi_internal): Change alternative 12 to ?Yv. Adjust mode
for alternative 13.
(*movsi_internal): Change alternative 8 to ?Yv. Adjust mode for
alternative 9.
(*movhi_internal): Change alternative 11 to *Yv.
(*movdf_internal): Change alternative 12 to Yv.
(*movsf_internal): Change alternative 5 to Yv. Adjust mode for
alternative 5 and 6.
(*mov_internal): Change alternative 4 to Yv.
(define_split for convert SF to DF): Add TARGET_EVEX512.
(extendbfsf2_1): Ditto.
* config/i386/predicates.md (bcst_mem_operand): Disable predicate
for 512 bit when !TARGET_EVEX512.
* config/i386/sse.md (VMOVE): Add TARGET_EVEX512.
(V48_AVX512VL): Ditto.
(V48_256_512_AVX512VL): Ditto.
(V48H_AVX512VL): Ditto.
(VI12_AVX512VL): Ditto.
(V): Ditto.
(V_512): Ditto.
(V_256_512): Ditto.
(VF): Ditto.
(VF1_VF2_AVX512DQ): Ditto.
(VFH): Ditto.
(VFB): Ditto.
(VF1): Ditto.
(VF1_AVX2): Ditto.
(VF2): Ditto.
(VF2H): Ditto.
(VF2_512_256): Ditto.
(VF2_512_256VL): Ditto.
(VF_512): Ditto.
(VFB_512): Ditto.
(VI48_AVX512VL): Ditto.
(VI1248_AVX512VLBW): Ditto.
(VF_AVX512VL): Ditto.
(VFH_AVX512VL): Ditto.
(VF1_AVX512VL): Ditto.
(VI): Ditto.
(VIHFBF): Ditto.
(VI_AVX2): Ditto.
(VI8): Ditto.
(VI8_AVX512VL): Ditto.
(VI2_AVX512F): Ditto.
(VI4_AVX512F): Ditto.
(VI4_AVX512VL): Ditto.
(VI48_AVX512F_AVX512VL): Ditto.
(VI8_AVX2_AVX512F): Ditto.
(VI8_AVX_AVX512F): Ditto.
(V8FI): Ditto.
(V16FI): Ditto.
(VI124_AVX2_24_AVX512F_1_AVX512BW): Ditto.
(VI248_AVX512VLBW): Ditto.
(VI248_AVX2_8_AVX512F_24_AVX512BW): Ditto.
(VI248_AVX512BW): Ditto.
(VI248_AVX512BW_AVX512VL): Ditto.
(VI48_AVX512F): Ditto.
(VI48_AVX_AVX512F): Ditto.
(VI12_AVX_AVX512F): Ditto.
(VI148_512): Ditto.
(VI124_256_AVX512F_AVX512BW): Ditto.
(VI48_512): Ditto.
(VI_AVX512BW): Ditto.
(VIHFBF_AVX512BW): Ditto.
(VI4F_256_512): Ditto.
(VI48F_256_512): Ditto.
(VI48F): Ditto.
(VI12_VI48F_AVX512VL): Ditto.
(V32_512): Ditto.
(AVX512MODE2P): Ditto.
(STORENT_MODE): Ditto.
(REDUC_PLUS_MODE): Ditto.
(REDUC_SMINMAX_MODE): Ditto.
(*andnot3): Change isa attribute to avx512f_512.
(*andnot3): Ditto.
(3): Ditto.
(tf3): Ditto.
(FMAMODEM): Add TARGET_EVEX512.
(FMAMODE_AVX512): Ditto.
(VFH_SF_AVX512VL): Ditto.
(avx512f_fix_notruncv16sfv16si): Ditto.
(fix_truncv16sfv16si2):
Ditto.
(avx512f_cvtdq2pd512_2): Ditto.
(avx512f_cvtpd2dq512): Ditto.
(fix_truncv8dfv8si2):
Ditto.
(avx512f_cvtpd2ps512): Ditto.
(vec_unpacks_lo_v16sf): Ditto.
(vec_unpacks_hi_v16sf): Ditto.
(vec_unpacks_float_hi_v16si): Ditto.
(vec_unpacks_float_lo_v16si): Ditto.
(vec_unpacku_float_hi_v16si): Ditto.
(vec_unpacku_float_lo_v16si): Ditto.
(vec_pack_sfix_trunc_v8df): Ditto.
(avx512f_vec_pack_sfix_v8df): Ditto.
(avx512f_unpckhps512): Ditto.
(avx512f_unpcklps512): Ditto.
(avx512f_movshdup512): Ditto.
(avx512f_movsldup512): Ditto.
(AVX512_VEC): Ditto.
(AVX512_VEC_2): Ditto.
(vec_extract_lo_v64qi): Ditto.
(vec_extract_hi_v64qi): Ditto.
(VEC_EXTRACT_MODE): Ditto.
(avx512f_unpckhpd512): Ditto.
(avx512f_movddup512): Ditto.
(avx512f_unpcklpd512): Ditto.
(*_vternlog_all): Ditto.
(*_vpternlog_1): Ditto.
(*_vpternlog_2): Ditto.
(*_vpternlog_3): Ditto.
(avx512f_shufps512_mask): Ditto.
(avx512f_shufps512_1): Ditto.
  

[PATCH 04/18] [PATCH 3/5] Push evex512 target for 512 bit intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/avx512bwintrin.h: Add evex512 target for 512 bit
intrins.
---
 gcc/config/i386/avx512bwintrin.h | 291 ---
 1 file changed, 153 insertions(+), 138 deletions(-)

diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h
index d1cd549ce18..925bae1457c 100644
--- a/gcc/config/i386/avx512bwintrin.h
+++ b/gcc/config/i386/avx512bwintrin.h
@@ -34,16 +34,6 @@
 #define __DISABLE_AVX512BW__
 #endif /* __AVX512BW__ */
 
-/* Internal data types for implementing the intrinsics.  */
-typedef short __v32hi __attribute__ ((__vector_size__ (64)));
-typedef short __v32hi_u __attribute__ ((__vector_size__ (64),  \
-   __may_alias__, __aligned__ (1)));
-typedef char __v64qi __attribute__ ((__vector_size__ (64)));
-typedef char __v64qi_u __attribute__ ((__vector_size__ (64),   \
-  __may_alias__, __aligned__ (1)));
-
-typedef unsigned long long __mmask64;
-
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _ktest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
@@ -54,229 +44,292 @@ _ktest_mask32_u8  (__mmask32 __A,  __mmask32 __B, 
unsigned char *__CF)
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
+_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
-  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+_kortest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
 {
-  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+  *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
 }
 
 extern __inline unsigned char
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
+_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
 {
-  return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
 }
 
-extern __inline unsigned char
+extern __inline __mmask32
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
+_kadd_mask32 (__mmask32 __A, __mmask32 __B)
 {
-  *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
-  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+  return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B);
 }
 
-extern __inline unsigned char
+extern __inline unsigned int
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
+_cvtmask32_u32 (__mmask32 __A)
 {
-  *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
-  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+  return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A);
 }
 
-extern __inline unsigned char
+extern __inline __mmask32
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_cvtu32_mask32 (unsigned int __A)
 {
-  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+  return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A);
 }
 
-extern __inline unsigned char
+extern __inline __mmask32
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+_load_mask32 (__mmask32 *__A)
 {
-  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+  return (__mmask32) __builtin_ia32_kmovd (*__A);
 }
 
-extern __inline unsigned char
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+_store_mask32 (__mmask32 *__A, __mmask32 __B)
 {
-  return (unsigned char) 

[PATCH 15/18] Support -mevex512 for AVX512BW intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
Make sure there is EVEX512 enabled.
(ix86_expand_vecop_qihi2): Refuse V32QI->V32HI when no EVEX512.
* config/i386/i386.cc (ix86_hard_regno_mode_ok): Disable 64 bit mask
when !TARGET_EVEX512.
* config/i386/i386.md (avx512bw_512): New.
(SWI1248_AVX512BWDQ_64): Add TARGET_EVEX512.
(*zero_extendsidi2): Change isa to avx512bw_512.
(kmov_isa): Ditto.
(*anddi_1): Ditto.
(*andn_1): Change isa to kmov_isa.
(*_1): Ditto.
(*notxor_1): Ditto.
(*one_cmpl2_1): Ditto.
(*one_cmplsi2_1_zext): Change isa to avx512bw_512.
(*ashl3_1): Change isa to kmov_isa.
(*lshr3_1): Ditto.
* config/i386/sse.md (VI12HFBF_AVX512VL): Add TARGET_EVEX512.
(VI1248_AVX512VLBW): Ditto.
(VHFBF_AVX512VL): Ditto.
(VI): Ditto.
(VIHFBF): Ditto.
(VI_AVX2): Ditto.
(VI1_AVX512): Ditto.
(VI12_256_512_AVX512VL): Ditto.
(VI2_AVX2_AVX512BW): Ditto.
(VI2_AVX512VNNIBW): Ditto.
(VI2_AVX512VL): Ditto.
(VI2HFBF_AVX512VL): Ditto.
(VI8_AVX2_AVX512BW): Ditto.
(VIMAX_AVX2_AVX512BW): Ditto.
(VIMAX_AVX512VL): Ditto.
(VI12_AVX2_AVX512BW): Ditto.
(VI124_AVX2_24_AVX512F_1_AVX512BW): Ditto.
(VI248_AVX512VL): Ditto.
(VI248_AVX512VLBW): Ditto.
(VI248_AVX2_8_AVX512F_24_AVX512BW): Ditto.
(VI248_AVX512BW): Ditto.
(VI248_AVX512BW_AVX512VL): Ditto.
(VI248_512): Ditto.
(VI124_256_AVX512F_AVX512BW): Ditto.
(VI_AVX512BW): Ditto.
(VIHFBF_AVX512BW): Ditto.
(SWI1248_AVX512BWDQ): Ditto.
(SWI1248_AVX512BW): Ditto.
(SWI1248_AVX512BWDQ2): Ditto.
(*knotsi_1_zext): Ditto.
(define_split for zero_extend + not): Ditto.
(kunpckdi): Ditto.
(REDUC_SMINMAX_MODE): Ditto.
(VEC_EXTRACT_MODE): Ditto.
(*avx512bw_permvar_truncv16siv16hi_1): Ditto.
(*avx512bw_permvar_truncv16siv16hi_1_hf): Ditto.
(truncv32hiv32qi2): Ditto.
(avx512bw_v32hiv32qi2): Ditto.
(avx512bw_v32hiv32qi2_mask): Ditto.
(avx512bw_v32hiv32qi2_mask_store): Ditto.
(usadv64qi): Ditto.
(VEC_PERM_AVX2): Ditto.
(AVX512ZEXTMASK): Ditto.
(SWI24_MASK): New.
(vec_pack_trunc_): Change iterator to SWI24_MASK.
(avx512bw_packsswb): Add TARGET_EVEX512.
(avx512bw_packssdw): Ditto.
(avx512bw_interleave_highv64qi): Ditto.
(avx512bw_interleave_lowv64qi): Ditto.
(avx512bw_pshuflwv32hi): Ditto.
(avx512bw_pshufhwv32hi): Ditto.
(vec_unpacks_lo_di): Ditto.
(SWI48x_MASK): New.
(vec_unpacks_hi_): Change iterator to SWI48x_MASK.
(avx512bw_umulhrswv32hi3): Add TARGET_EVEX512.
(VI1248_AVX512VL_AVX512BW): Ditto.
(avx512bw_v32qiv32hi2): Ditto.
(*avx512bw_zero_extendv32qiv32hi2_1): Ditto.
(*avx512bw_zero_extendv32qiv32hi2_2): Ditto.
(v32qiv32hi2): Ditto.
(pbroadcast_evex_isa): Change isa attribute to avx512bw_512.
(VPERMI2): Add TARGET_EVEX512.
(VPERMI2I): Ditto.
---
 gcc/config/i386/i386-expand.cc |   3 +-
 gcc/config/i386/i386.cc|   4 +-
 gcc/config/i386/i386.md|  54 -
 gcc/config/i386/sse.md | 193 ++---
 4 files changed, 128 insertions(+), 126 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 063561e1265..ff2423f91ed 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15617,6 +15617,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
 case E_V32HFmode:
 case E_V32BFmode:
 case E_V64QImode:
+  gcc_assert (TARGET_EVEX512);
   if (TARGET_AVX512BW)
return ix86_vector_duplicate_value (mode, target, val);
   else
@@ -23512,7 +23513,7 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, 
rtx op1, rtx op2)
   bool uns_p = code != ASHIFTRT;
 
   if ((qimode == V16QImode && !TARGET_AVX2)
-  || (qimode == V32QImode && !TARGET_AVX512BW)
+  || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
   /* There are no V64HImode instructions.  */
   || qimode == V64QImode)
  return false;
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 589b29a324d..03c96ff048d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20308,8 +20308,8 @@ ix86_hard_regno_mode_ok (unsigned int regno, 
machine_mode mode)
return MASK_PAIR_REGNO_P(regno);
 
   return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode))
- || (TARGET_AVX512BW
- && VALID_MASK_AVX512BW_MODE (mode)));
+ || (TARGET_AVX512BW && mode == SImode)
+ || (TARGET_AVX512BW && TARGET_EVEX512 

[PATCH 10/18] [PATCH 4/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 188 +++
 1 file changed, 94 insertions(+), 94 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 167d530a537..8250e2998cd 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -299,8 +299,8 @@ BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_sto
 BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", 
IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI)
 
 /* AVX512VP2INTERSECT */
-BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) 
VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
-BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectq512", IX86_BUILTIN_2INTERSECTQ512, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V8DI_V8DI)
+BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_nothing, "__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, 
UNKNOWN, (int) VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
+BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_nothing, "__builtin_ia32_2intersectq512", IX86_BUILTIN_2INTERSECTQ512, 
UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8DI_V8DI)
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd256", IX86_BUILTIN_2INTERSECTD256, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V8SI_V8SI)
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectq256", IX86_BUILTIN_2INTERSECTQ256, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V4DI_V4DI)
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd128", IX86_BUILTIN_2INTERSECTD128, UNKNOWN, (int) 
VOID_FTYPE_PUQI_PUQI_V4SI_V4SI)
@@ -430,17 +430,17 @@ BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_rdpkru,  
"__builtin_ia32_rdpkru", IX86_B
 BDESC (OPTION_MASK_ISA_PKU, 0, CODE_FOR_wrpkru,  "__builtin_ia32_wrpkru", 
IX86_BUILTIN_WRPKRU, UNKNOWN, (int) VOID_FTYPE_UNSIGNED)
 
 /* VBMI2 */
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev64qi_mask, 
"__builtin_ia32_compressstoreuqi512_mask", IX86_BUILTIN_PCOMPRESSBSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_compressstorev32hi_mask, 
"__builtin_ia32_compressstoreuhi512_mask", IX86_BUILTIN_PCOMPRESSWSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_compressstorev64qi_mask, "__builtin_ia32_compressstoreuqi512_mask", 
IX86_BUILTIN_PCOMPRESSBSTORE512, UNKNOWN, (int) VOID_FTYPE_PV64QI_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_compressstorev32hi_mask, "__builtin_ia32_compressstoreuhi512_mask", 
IX86_BUILTIN_PCOMPRESSWSTORE512, UNKNOWN, (int) VOID_FTYPE_PV32HI_V32HI_USI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev32qi_mask, "__builtin_ia32_compressstoreuqi256_mask", 
IX86_BUILTIN_PCOMPRESSBSTORE256, UNKNOWN, (int) VOID_FTYPE_PV32QI_V32QI_USI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev16qi_mask, "__builtin_ia32_compressstoreuqi128_mask", 
IX86_BUILTIN_PCOMPRESSBSTORE128, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16QI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev16hi_mask, "__builtin_ia32_compressstoreuhi256_mask", 
IX86_BUILTIN_PCOMPRESSWSTORE256, UNKNOWN, (int) VOID_FTYPE_PV16HI_V16HI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512VBMI2 | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_compressstorev8hi_mask, "__builtin_ia32_compressstoreuhi128_mask", 
IX86_BUILTIN_PCOMPRESSWSTORE128, UNKNOWN, (int) VOID_FTYPE_PV8HI_V8HI_UQI)
 
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_mask, 
"__builtin_ia32_expandloadqi512_mask", IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, 
(int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv64qi_maskz, 
"__builtin_ia32_expandloadqi512_maskz", IX86_BUILTIN_PEXPANDBLOAD512Z, UNKNOWN, 
(int) V64QI_FTYPE_PCV64QI_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_mask, 
"__builtin_ia32_expandloadhi512_mask", IX86_BUILTIN_PEXPANDWLOAD512, UNKNOWN, 
(int) V32HI_FTYPE_PCV32HI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512VBMI2, 0, CODE_FOR_expandv32hi_maskz, 
"__builtin_ia32_expandloadhi512_maskz", IX86_BUILTIN_PEXPANDWLOAD512Z, UNKNOWN, 
(int) V32HI_FTYPE_PCV32HI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512VBMI2, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_expandv64qi_mask, "__builtin_ia32_expandloadqi512_mask", 
IX86_BUILTIN_PEXPANDBLOAD512, UNKNOWN, (int) 

[PATCH 05/18] [PATCH 4/5] Push evex512 target for 512 bit intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config.gcc: Add avx512bitalgvlintrin.h.
* config/i386/avx5124fmapsintrin.h: Add evex512 target for 512 bit
intrins.
* config/i386/avx5124vnniwintrin.h: Ditto.
* config/i386/avx512bf16intrin.h: Ditto.
* config/i386/avx512bitalgintrin.h: Add evex512 target for 512 bit
intrins. Split 128/256 bit intrins to avx512bitalgvlintrin.h.
* config/i386/avx512erintrin.h: Add evex512 target for 512 bit
intrins.
* config/i386/avx512ifmaintrin.h: Ditto.
* config/i386/avx512pfintrin.h: Ditto.
* config/i386/avx512vbmi2intrin.h: Ditto.
* config/i386/avx512vbmiintrin.h: Ditto.
* config/i386/avx512vnniintrin.h: Ditto.
* config/i386/avx512vp2intersectintrin.h: Ditto.
* config/i386/avx512vpopcntdqintrin.h: Ditto.
* config/i386/gfniintrin.h: Ditto.
* config/i386/immintrin.h: Add avx512bitalgvlintrin.h.
* config/i386/vaesintrin.h: Add evex512 target for 512 bit intrins.
* config/i386/vpclmulqdqintrin.h: Ditto.
* config/i386/avx512bitalgvlintrin.h: New.
---
 gcc/config.gcc |  19 +--
 gcc/config/i386/avx5124fmapsintrin.h   |   2 +-
 gcc/config/i386/avx5124vnniwintrin.h   |   2 +-
 gcc/config/i386/avx512bf16intrin.h |  31 ++--
 gcc/config/i386/avx512bitalgintrin.h   | 155 +-
 gcc/config/i386/avx512bitalgvlintrin.h | 180 +
 gcc/config/i386/avx512erintrin.h   |   2 +-
 gcc/config/i386/avx512ifmaintrin.h |   4 +-
 gcc/config/i386/avx512pfintrin.h   |   2 +-
 gcc/config/i386/avx512vbmi2intrin.h|   4 +-
 gcc/config/i386/avx512vbmiintrin.h |   4 +-
 gcc/config/i386/avx512vnniintrin.h |   4 +-
 gcc/config/i386/avx512vp2intersectintrin.h |   4 +-
 gcc/config/i386/avx512vpopcntdqintrin.h|   4 +-
 gcc/config/i386/gfniintrin.h   |  76 +
 gcc/config/i386/immintrin.h|   2 +
 gcc/config/i386/vaesintrin.h   |   4 +-
 gcc/config/i386/vpclmulqdqintrin.h |   4 +-
 18 files changed, 282 insertions(+), 221 deletions(-)
 create mode 100644 gcc/config/i386/avx512bitalgvlintrin.h

diff --git a/gcc/config.gcc b/gcc/config.gcc
index ce5def08e2e..e47e6893e1d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -425,15 +425,16 @@ i[34567]86-*-* | x86_64-*-*)
   avx512vbmi2vlintrin.h avx512vnniintrin.h
   avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h
   avx512vpopcntdqvlintrin.h avx512bitalgintrin.h
-  pconfigintrin.h wbnoinvdintrin.h movdirintrin.h
-  waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h
-  avx512bf16intrin.h enqcmdintrin.h serializeintrin.h
-  avx512vp2intersectintrin.h avx512vp2intersectvlintrin.h
-  tsxldtrkintrin.h amxtileintrin.h amxint8intrin.h
-  amxbf16intrin.h x86gprintrin.h uintrintrin.h
-  hresetintrin.h keylockerintrin.h avxvnniintrin.h
-  mwaitintrin.h avx512fp16intrin.h avx512fp16vlintrin.h
-  avxifmaintrin.h avxvnniint8intrin.h avxneconvertintrin.h
+  avx512bitalgvlintrin.h pconfigintrin.h wbnoinvdintrin.h
+  movdirintrin.h waitpkgintrin.h cldemoteintrin.h
+  avx512bf16vlintrin.h avx512bf16intrin.h enqcmdintrin.h
+  serializeintrin.h avx512vp2intersectintrin.h
+  avx512vp2intersectvlintrin.h tsxldtrkintrin.h
+  amxtileintrin.h amxint8intrin.h amxbf16intrin.h
+  x86gprintrin.h uintrintrin.h hresetintrin.h
+  keylockerintrin.h avxvnniintrin.h mwaitintrin.h
+  avx512fp16intrin.h avx512fp16vlintrin.h avxifmaintrin.h
+  avxvnniint8intrin.h avxneconvertintrin.h
   cmpccxaddintrin.h amxfp16intrin.h prfchiintrin.h
   raointintrin.h amxcomplexintrin.h avxvnniint16intrin.h
   sm3intrin.h sha512intrin.h sm4intrin.h"
diff --git a/gcc/config/i386/avx5124fmapsintrin.h 
b/gcc/config/i386/avx5124fmapsintrin.h
index 97dd77c9235..4c884a5c203 100644
--- a/gcc/config/i386/avx5124fmapsintrin.h
+++ b/gcc/config/i386/avx5124fmapsintrin.h
@@ -30,7 +30,7 @@
 
 #ifndef __AVX5124FMAPS__
 #pragma GCC push_options
-#pragma GCC target("avx5124fmaps")
+#pragma GCC target("avx5124fmaps,evex512")
 #define __DISABLE_AVX5124FMAPS__
 #endif /* __AVX5124FMAPS__ */
 
diff --git a/gcc/config/i386/avx5124vnniwintrin.h 
b/gcc/config/i386/avx5124vnniwintrin.h
index fd129589798..795e4814f28 100644
--- a/gcc/config/i386/avx5124vnniwintrin.h
+++ b/gcc/config/i386/avx5124vnniwintrin.h
@@ -30,7 +30,7 @@
 
 #ifndef __AVX5124VNNIW__
 #pragma GCC push_options
-#pragma GCC 

[PATCH 07/18] [PATCH 1/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
* config/i386/i386-builtins.cc
(ix86_init_mmx_sse_builtins): Ditto.
---
 gcc/config/i386/i386-builtin.def | 648 +++
 gcc/config/i386/i386-builtins.cc |  72 ++--
 2 files changed, 372 insertions(+), 348 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 8738b3b6a8a..0cc526383db 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -200,53 +200,53 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, 
CODE_FOR_avx2_maskstored256, "__builtin_ia32_mas
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_maskstoreq256, 
"__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) 
VOID_FTYPE_PV4DI_V4DI_V4DI)
 
 /* AVX512F */
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16sf_mask, 
"__builtin_ia32_compressstoresf512_mask", IX86_BUILTIN_COMPRESSPSSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev16si_mask, 
"__builtin_ia32_compressstoresi512_mask", IX86_BUILTIN_PCOMPRESSDSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8df_mask, 
"__builtin_ia32_compressstoredf512_mask", IX86_BUILTIN_COMPRESSPDSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressstorev8di_mask, 
"__builtin_ia32_compressstoredi512_mask", IX86_BUILTIN_PCOMPRESSQSTORE512, 
UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16sf_mask, 
"__builtin_ia32_expandloadsf512_mask", IX86_BUILTIN_EXPANDPSLOAD512, UNKNOWN, 
(int) V16SF_FTYPE_PCV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16sf_maskz, 
"__builtin_ia32_expandloadsf512_maskz", IX86_BUILTIN_EXPANDPSLOAD512Z, UNKNOWN, 
(int) V16SF_FTYPE_PCV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv16si_mask, 
"__builtin_ia32_expandloadsi512_mask", IX86_BUILTIN_PEXPANDDLOAD512, UNKNOWN, 
(int) V16SI_FTYPE_PCV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv16si_maskz, 
"__builtin_ia32_expandloadsi512_maskz", IX86_BUILTIN_PEXPANDDLOAD512Z, UNKNOWN, 
(int) V16SI_FTYPE_PCV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8df_mask, 
"__builtin_ia32_expandloaddf512_mask", IX86_BUILTIN_EXPANDPDLOAD512, UNKNOWN, 
(int) V8DF_FTYPE_PCV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_maskz, 
"__builtin_ia32_expandloaddf512_maskz", IX86_BUILTIN_EXPANDPDLOAD512Z, UNKNOWN, 
(int) V8DF_FTYPE_PCV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8di_mask, 
"__builtin_ia32_expandloaddi512_mask", IX86_BUILTIN_PEXPANDQLOAD512, UNKNOWN, 
(int) V8DI_FTYPE_PCV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8di_maskz, 
"__builtin_ia32_expandloaddi512_maskz", IX86_BUILTIN_PEXPANDQLOAD512Z, UNKNOWN, 
(int) V8DI_FTYPE_PCV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16si_mask, 
"__builtin_ia32_loaddqusi512_mask", IX86_BUILTIN_LOADDQUSI512, UNKNOWN, (int) 
V16SI_FTYPE_PCINT_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8di_mask, 
"__builtin_ia32_loaddqudi512_mask", IX86_BUILTIN_LOADDQUDI512, UNKNOWN, (int) 
V8DI_FTYPE_PCINT64_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8df_mask, 
"__builtin_ia32_loadupd512_mask", IX86_BUILTIN_LOADUPD512, UNKNOWN, (int) 
V8DF_FTYPE_PCDOUBLE_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16sf_mask, 
"__builtin_ia32_loadups512_mask", IX86_BUILTIN_LOADUPS512, UNKNOWN, (int) 
V16SF_FTYPE_PCFLOAT_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16sf_mask, 
"__builtin_ia32_loadaps512_mask", IX86_BUILTIN_LOADAPS512, UNKNOWN, (int) 
V16SF_FTYPE_PCV16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv16si_mask, 
"__builtin_ia32_movdqa32load512_mask", IX86_BUILTIN_MOVDQA32LOAD512, UNKNOWN, 
(int) V16SI_FTYPE_PCV16SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8df_mask, 
"__builtin_ia32_loadapd512_mask", IX86_BUILTIN_LOADAPD512, UNKNOWN, (int) 
V8DF_FTYPE_PCV8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_loadv8di_mask, 
"__builtin_ia32_movdqa64load512_mask", IX86_BUILTIN_MOVDQA64LOAD512, UNKNOWN, 
(int) V8DI_FTYPE_PCV8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv16sf, 
"__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) 
VOID_FTYPE_PFLOAT_V16SF)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv8df, 
"__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) 
VOID_FTYPE_PDOUBLE_V8DF)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_movntv8di, 
"__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN, 

[PATCH 17/18] Support -mevex512 for AVX512FP16 intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/sse.md (V48H_AVX512VL): Add TARGET_EVEX512.
(VFH): Ditto.
(VF2H): Ditto.
(VFH_AVX512VL): Ditto.
(VHFBF): Ditto.
(VHF_AVX512VL): Ditto.
(VI2H_AVX512VL): Ditto.
(VI2F_256_512): Ditto.
(VF48_I1248): Remove unused iterator.
(VF48H_AVX512VL): Add TARGET_EVEX512.
(VF_AVX512): Remove unused iterator.
(REDUC_PLUS_MODE): Add TARGET_EVEX512.
(REDUC_SMINMAX_MODE): Ditto.
(FMAMODEM): Ditto.
(VFH_SF_AVX512VL): Ditto.
(VEC_PERM_AVX2): Ditto.

Co-authored-by: Hu, Lin1 
---
 gcc/config/i386/sse.md | 44 --
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a5a95b9de66..25d53e15dce 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -280,7 +280,7 @@
 (define_mode_iterator V48H_AVX512VL
   [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")
-   (V32HF "TARGET_AVX512FP16")
+   (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
@@ -355,7 +355,7 @@
(V2DF "TARGET_AVX512DQ && TARGET_AVX512VL")])
 
 (define_mode_iterator VFH
-  [(V32HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
@@ -401,7 +401,7 @@
 
 ;; All DFmode & HFmode vector float modes
 (define_mode_iterator VF2H
-  [(V32HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX") V2DF])
@@ -463,7 +463,7 @@
   [(V16SF "TARGET_AVX512ER") (V8SF "TARGET_AVX") V4SF])
 
 (define_mode_iterator VFH_AVX512VL
-  [(V32HF "TARGET_AVX512FP16")
+  [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")
(V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
@@ -475,12 +475,14 @@
 (define_mode_iterator VF1_AVX512VL
   [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
 
-(define_mode_iterator VHFBF [V32HF V16HF V8HF V32BF V16BF V8BF])
+(define_mode_iterator VHFBF
+  [(V32HF "TARGET_EVEX512") V16HF V8HF
+   (V32BF "TARGET_EVEX512") V16BF V8BF])
 (define_mode_iterator VHFBF_256 [V16HF V16BF])
 (define_mode_iterator VHFBF_128 [V8HF V8BF])
 
 (define_mode_iterator VHF_AVX512VL
-  [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
+  [(V32HF "TARGET_EVEX512") (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
 
 (define_mode_iterator VHFBF_AVX512VL
   [(V32HF "TARGET_EVEX512") (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")
@@ -594,9 +596,9 @@
(V8BF "TARGET_AVX512VL") (V16BF "TARGET_AVX512VL") (V32BF 
"TARGET_EVEX512")])
 
 (define_mode_iterator VI2H_AVX512VL
-  [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI
-   (V8SI "TARGET_AVX512VL") V16SI
-   V8DI ])
+  [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") (V32HI "TARGET_EVEX512")
+   (V8SI "TARGET_AVX512VL") (V16SI "TARGET_EVEX512")
+   (V8DI "TARGET_EVEX512")])
 
 (define_mode_iterator VI1_AVX512VL_F
   [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F && TARGET_EVEX512")])
@@ -883,7 +885,10 @@
(V32BF "TARGET_AVX512BW && TARGET_EVEX512")])
 
 ;; Int-float size matches
-(define_mode_iterator VI2F_256_512 [V16HI V32HI V16HF V32HF V16BF V32BF])
+(define_mode_iterator VI2F_256_512
+  [V16HI (V32HI "TARGET_EVEX512")
+   V16HF (V32HF "TARGET_EVEX512")
+   V16BF (V32BF "TARGET_EVEX512")])
 (define_mode_iterator VI4F_128 [V4SI V4SF])
 (define_mode_iterator VI8F_128 [V2DI V2DF])
 (define_mode_iterator VI4F_256 [V8SI V8SF])
@@ -899,10 +90

[PATCH 03/18] [PATCH 2/5] Push evex512 target for 512 bit intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/avx512dqintrin.h: Add evex512 target for 512 bit
intrins.
---
 gcc/config/i386/avx512dqintrin.h | 1840 +++---
 1 file changed, 926 insertions(+), 914 deletions(-)

diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h
index 93900a0b5c7..b6a1d499e25 100644
--- a/gcc/config/i386/avx512dqintrin.h
+++ b/gcc/config/i386/avx512dqintrin.h
@@ -184,1275 +184,1426 @@ _kandn_mask8 (__mmask8 __A, __mmask8 __B)
   return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B);
 }
 
-extern __inline __m512d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_broadcast_f64x2 (__m128d __A)
-{
-  return (__m512d)
-__builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-_mm512_undefined_pd (),
-(__mmask8) -1);
-}
-
-extern __inline __m512d
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+_kshiftli_mask8 (__mmask8 __A, unsigned int __B)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
-  __A,
-  (__v8df)
-  __O, __M);
+  return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B);
 }
 
-extern __inline __m512d
+extern __inline __mmask8
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+_kshiftri_mask8 (__mmask8 __A, unsigned int __B)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
-  __A,
-  (__v8df)
-  _mm512_setzero_ps (),
-  __M);
+  return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B);
 }
 
-extern __inline __m512i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_broadcast_i64x2 (__m128i __A)
+_mm_reduce_sd (__m128d __A, __m128d __B, int __C)
 {
-  return (__m512i)
-__builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-_mm512_undefined_epi32 (),
+  return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+(__v2df) __B, __C,
+(__v2df) _mm_setzero_pd (),
 (__mmask8) -1);
 }
 
-extern __inline __m512i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
-  __A,
-  (__v8di)
-  __O, __M);
+  return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+  (__v2df) __B, __C,
+  (__v2df)
+  _mm_setzero_pd (),
+  (__mmask8) -1, __R);
 }
 
-extern __inline __m512i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+_mm_mask_reduce_sd (__m128d __W,  __mmask8 __U, __m128d __A,
+   __m128d __B, int __C)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
-  __A,
-  (__v8di)
-  _mm512_setzero_si512 
(),
-  __M);
+  return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+(__v2df) __B, __C,
+(__v2df) __W,
+(__mmask8) __U);
 }
 
-extern __inline __m512
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_broadcast_f32x2 (__m128 __A)
+_mm_mask_reduce_round_sd (__m128d __W,  __mmask8 __U, __m128d __A,
+ __m128d __B, int __C, const int __R)
 {
-  return 

[PATCH 18/18] Allow -mno-evex512 usage

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386.opt: Allow -mno-evex512.

gcc/testsuite/ChangeLog:

* gcc.target/i386/noevex512-1.c: New test.
* gcc.target/i386/noevex512-2.c: Ditto.
* gcc.target/i386/noevex512-3.c: Ditto.
---
 gcc/config/i386/i386.opt|  2 +-
 gcc/testsuite/gcc.target/i386/noevex512-1.c | 13 +
 gcc/testsuite/gcc.target/i386/noevex512-2.c | 13 +
 gcc/testsuite/gcc.target/i386/noevex512-3.c | 13 +
 4 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-3.c

diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 6d8601b1f75..34fc167af82 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1312,5 +1312,5 @@ Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
 Enable vectorization for scatter instruction.
 
 mevex512
-Target RejectNegative Mask(ISA2_EVEX512) Var(ix86_isa_flags2) Save
+Target Mask(ISA2_EVEX512) Var(ix86_isa_flags2) Save
 Support 512 bit vector built-in functions and code generation.
diff --git a/gcc/testsuite/gcc.target/i386/noevex512-1.c b/gcc/testsuite/gcc.target/i386/noevex512-1.c
new file mode 100644
index 00000000000..7fd45f15be6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noevex512-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -march=x86-64 -mavx512f -mno-evex512 -Wno-psabi" } */
+/* { dg-final { scan-assembler-not ".%zmm" } } */
+
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+__m512d
+foo ()
+{
+  __m512d a, b;
+  a = a + b;
+  return a;
+}
diff --git a/gcc/testsuite/gcc.target/i386/noevex512-2.c b/gcc/testsuite/gcc.target/i386/noevex512-2.c
new file mode 100644
index 00000000000..1c206e385d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noevex512-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mavx512bw -mno-evex512" } */
+
+#include <immintrin.h>
+
+long long
+foo (long long c)
+{
+  register long long a __asm ("k7") = c;
+  long long b = foo (a);
+  asm volatile ("" : "+k" (b)); /* { dg-error "inconsistent operand constraints in an 'asm'" } */
+  return b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/noevex512-3.c b/gcc/testsuite/gcc.target/i386/noevex512-3.c
new file mode 100644
index 00000000000..10e00c2d61c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/noevex512-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64 -Wno-psabi -mavx512f" } */
+/* { dg-final { scan-assembler-not ".%zmm" } } */
+
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+__attribute__ ((target ("no-evex512"))) __m512d
+foo ()
+{
+  __m512d a, b;
+  a = a + b;
+  return a;
+}
-- 
2.31.1



[PATCH 14/18] Support -mevex512 for AVX512DQ intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse2_mulvxdi3):
Add TARGET_EVEX512 for 512 bit usage.
* config/i386/i386.cc (standard_sse_constant_opcode): Ditto.
* config/i386/sse.md (VF1_VF2_AVX512DQ): Ditto.
(VF1_128_256VL): Ditto.
(VF2_AVX512VL): Ditto.
(VI8_256_512): Ditto.
(fixuns_trunc<mode><sseintvecmodelower>2<mask_name>):
Ditto.
(AVX512_VEC): Ditto.
(AVX512_VEC_2): Ditto.
(VI4F_BRCST32x2): Ditto.
(VI8F_BRCST64x2): Ditto.
---
 gcc/config/i386/i386-expand.cc |  2 +-
 gcc/config/i386/i386.cc| 22 --
 gcc/config/i386/sse.md | 24 ++--
 3 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 0705e08d38c..063561e1265 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24008,7 +24008,7 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
   machine_mode mode = GET_MODE (op0);
   rtx t1, t2, t3, t4, t5, t6;
 
-  if (TARGET_AVX512DQ && mode == V8DImode)
+  if (TARGET_AVX512DQ && TARGET_EVEX512 && mode == V8DImode)
 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
   else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 635dd85e764..589b29a324d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -5332,9 +5332,14 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx 
*operands)
  if (EXT_REX_SSE_REG_P (operands[0]))
{
  if (TARGET_AVX512DQ)
-   return (TARGET_AVX512VL
-   ? "vxorpd\t%x0, %x0, %x0"
-   : "vxorpd\t%g0, %g0, %g0");
+   {
+ if (TARGET_AVX512VL)
+   return "vxorpd\t%x0, %x0, %x0";
+ else if (TARGET_EVEX512)
+   return "vxorpd\t%g0, %g0, %g0";
+ else
+   gcc_unreachable ();
+   }
  else
{
  if (TARGET_AVX512VL)
@@ -5356,9 +5361,14 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx 
*operands)
  if (EXT_REX_SSE_REG_P (operands[0]))
{
  if (TARGET_AVX512DQ)
-   return (TARGET_AVX512VL
-   ? "vxorps\t%x0, %x0, %x0"
-   : "vxorps\t%g0, %g0, %g0");
+   {
+ if (TARGET_AVX512VL)
+   return "vxorps\t%x0, %x0, %x0";
+ else if (TARGET_EVEX512)
+   return "vxorps\t%g0, %g0, %g0";
+ else
+   gcc_unreachable ();
+   }
  else
{
  if (TARGET_AVX512VL)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8d1b75b43e0..a8f93ceddc5 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -350,7 +350,8 @@
 
 (define_mode_iterator VF1_VF2_AVX512DQ
   [(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
-   (V8DF "TARGET_AVX512DQ") (V4DF "TARGET_AVX512DQ && TARGET_AVX512VL")
+   (V8DF "TARGET_AVX512DQ && TARGET_EVEX512")
+   (V4DF "TARGET_AVX512DQ && TARGET_AVX512VL")
(V2DF "TARGET_AVX512DQ && TARGET_AVX512VL")])
 
 (define_mode_iterator VFH
@@ -392,7 +393,7 @@
   [(V8SF "TARGET_AVX") V4SF])
 
 (define_mode_iterator VF1_128_256VL
-  [V8SF (V4SF "TARGET_AVX512VL")])
+  [(V8SF "TARGET_EVEX512") (V4SF "TARGET_AVX512VL")])
 
 ;; All DFmode vector float modes
 (define_mode_iterator VF2
@@ -467,7 +468,7 @@
(V8DF "TARGET_EVEX512") (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
 
 (define_mode_iterator VF2_AVX512VL
-  [V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
+  [(V8DF "TARGET_EVEX512") (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
 
 (define_mode_iterator VF1_AVX512VL
   [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
@@ -534,7 +535,7 @@
   [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI8_256_512
-  [V8DI (V4DI "TARGET_AVX512VL")])
+  [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI1_AVX2
   [(V32QI "TARGET_AVX2") V16QI])
@@ -9075,7 +9076,7 @@
 (define_insn "fixuns_trunc<mode><sseintvecmodelower>2<mask_name>"
   [(set (match_operand:<sseintvecmode> 0 "register_operand" "=v")
 (unsigned_fix:<sseintvecmode>
- (match_operand:VF1_128_256VL 1 "nonimmediate_operand" "vm")))]
+ (match_operand:VF1_128_256 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512VL"
   "vcvttps2udq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
   [(set_attr "type" "ssecvt")
@@ -11466,7 +11467,8 @@
(V8SF "32x4") (V8SI "32x4") (V4DF "64x2") (V4DI "64x2")])
 
 (define_mode_iterator AVX512_VEC
-  [(V8DF "TARGET_AVX512DQ") (V8DI "TARGET_AVX512DQ")
+  [(V8DF "TARGET_AVX512DQ && TARGET_EVEX512")
+   (V8DI "TARGET_AVX512DQ && 

[PATCH 09/18] [PATCH 3/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 226 +++
 1 file changed, 113 insertions(+), 113 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 7a0dec9bc8b..167d530a537 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -293,10 +293,10 @@ BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_CMPCCXADD, 
CODE_FOR_cmpccxadd_si,
 BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_CMPCCXADD, 
CODE_FOR_cmpccxadd_di, "__builtin_ia32_cmpccxadd64", IX86_BUILTIN_CMPCCXADD64, 
UNKNOWN, (int) LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT)
 
 /* AVX512BW */
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_loadv32hi_mask, 
"__builtin_ia32_loaddquhi512_mask", IX86_BUILTIN_LOADDQUHI512_MASK, UNKNOWN, 
(int) V32HI_FTYPE_PCSHORT_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_loadv64qi_mask, 
"__builtin_ia32_loaddquqi512_mask", IX86_BUILTIN_LOADDQUQI512_MASK, UNKNOWN, 
(int) V64QI_FTYPE_PCCHAR_V64QI_UDI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev32hi_mask, 
"__builtin_ia32_storedquhi512_mask", IX86_BUILTIN_STOREDQUHI512_MASK, UNKNOWN, 
(int) VOID_FTYPE_PSHORT_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, CODE_FOR_avx512bw_storev64qi_mask, 
"__builtin_ia32_storedquqi512_mask", IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, 
(int) VOID_FTYPE_PCHAR_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_loadv32hi_mask, "__builtin_ia32_loaddquhi512_mask", 
IX86_BUILTIN_LOADDQUHI512_MASK, UNKNOWN, (int) V32HI_FTYPE_PCSHORT_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_loadv64qi_mask, "__builtin_ia32_loaddquqi512_mask", 
IX86_BUILTIN_LOADDQUQI512_MASK, UNKNOWN, (int) V64QI_FTYPE_PCCHAR_V64QI_UDI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_storev32hi_mask, "__builtin_ia32_storedquhi512_mask", 
IX86_BUILTIN_STOREDQUHI512_MASK, UNKNOWN, (int) VOID_FTYPE_PSHORT_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_storev64qi_mask, "__builtin_ia32_storedquqi512_mask", 
IX86_BUILTIN_STOREDQUQI512_MASK, UNKNOWN, (int) VOID_FTYPE_PCHAR_V64QI_UDI)
 
 /* AVX512VP2INTERSECT */
 BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
"__builtin_ia32_2intersectd512", IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) 
VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
@@ -407,9 +407,9 @@ BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 
0, CODE_FOR_avx512vl
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_ss_truncatev16hiv16qi2_mask_store, 
"__builtin_ia32_pmovswb256mem_mask", IX86_BUILTIN_PMOVSWB256_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV16QI_V16HI_UHI)
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_us_truncatev8hiv8qi2_mask_store_2, 
"__builtin_ia32_pmovuswb128mem_mask", IX86_BUILTIN_PMOVUSWB128_MEM, UNKNOWN, 
(int) VOID_FTYPE_PUDI_V8HI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512BW | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_us_truncatev16hiv16qi2_mask_store, 
"__builtin_ia32_pmovuswb256mem_mask", IX86_BUILTIN_PMOVUSWB256_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV16QI_V16HI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovuswb512mem_mask", IX86_BUILTIN_PMOVUSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
-BDESC (OPTION_MASK_ISA_AVX512BW, 0, 
CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) 
VOID_FTYPE_PV32QI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_us_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovuswb512mem_mask", IX86_BUILTIN_PMOVUSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_ss_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovswb512mem_mask", IX86_BUILTIN_PMOVSWB512_MEM, UNKNOWN, 
(int) VOID_FTYPE_PV32QI_V32HI_USI)
+BDESC (OPTION_MASK_ISA_AVX512BW, OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512bw_truncatev32hiv32qi2_mask_store, 
"__builtin_ia32_pmovwb512mem_mask", IX86_BUILTIN_PMOVWB512_MEM, UNKNOWN, (int) 
VOID_FTYPE_PV32QI_V32HI_USI)
 
 /* AVX512FP16 */
 BDESC (0, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_avx512f_loadhf_mask, 
"__builtin_ia32_loadsh_mask", IX86_BUILTIN_LOADSH_MASK, UNKNOWN, (int) 
V8HF_FTYPE_PCFLOAT16_V8HF_UQI)
@@ -1590,61 +1590,61 @@ BDESC (OPTION_MASK_ISA_AVX512F, 
OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_round
 BDESC (OPTION_MASK_ISA_AVX512DQ, 0, 

[PATCH 16/18] Support -mevex512 for AVX512{IFMA, VBMI, VNNI, BF16, VPOPCNTDQ, VBMI2, BITALG, VP2INTERSECT}, VAES, GFNI, VPCLMULQDQ intrins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/sse.md (VI1_AVX512VL): Add TARGET_EVEX512.
(VI8_FVL): Ditto.
(VI1_AVX512F): Ditto.
(VI1_AVX512VNNI): Ditto.
(VI1_AVX512VL_F): Ditto.
(VI12_VI48F_AVX512VL): Ditto.
(*avx512f_permvar_truncv32hiv32qi_1): Ditto.
(sdot_prod): Ditto.
(VEC_PERM_AVX2): Ditto.
(VPERMI2): Ditto.
(VPERMI2I): Ditto.
(vpmadd52v8di): Ditto.
(usdot_prod): Ditto.
(vpdpbusd_v16si): Ditto.
(vpdpbusds_v16si): Ditto.
(vpdpwssd_v16si): Ditto.
(vpdpwssds_v16si): Ditto.
(VI48_AVX512VP2VL): Ditto.
(avx512vp2intersect_2intersectv16si): Ditto.
(VF_AVX512BF16VL): Ditto.
(VF1_AVX512_256): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr90096.c: Adjust error message.

Co-authored-by: Hu, Lin1 
---
 gcc/config/i386/sse.md  | 56 +
 gcc/testsuite/gcc.target/i386/pr90096.c |  2 +-
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e59f6bf4410..a5a95b9de66 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,7 +298,7 @@
(V32BF "TARGET_EVEX512") (V16BF "TARGET_AVX512VL") (V8BF 
"TARGET_AVX512VL")])
 
 (define_mode_iterator VI1_AVX512VL
-  [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
+  [(V64QI "TARGET_EVEX512") (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
 
 ;; All vector modes
 (define_mode_iterator V
@@ -531,7 +531,7 @@
   [(V8DI "TARGET_AVX512F && TARGET_EVEX512") (V4DI "TARGET_AVX") V2DI])
 
 (define_mode_iterator VI8_FVL
-  [(V8DI "TARGET_AVX512F") V4DI (V2DI "TARGET_AVX512VL")])
+  [(V8DI "TARGET_AVX512F && TARGET_EVEX512") V4DI (V2DI "TARGET_AVX512VL")])
 
 (define_mode_iterator VI8_AVX512VL
   [(V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
@@ -546,10 +546,10 @@
   [(V64QI "TARGET_AVX512BW && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
 
 (define_mode_iterator VI1_AVX512F
-  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI])
+  [(V64QI "TARGET_AVX512F && TARGET_EVEX512") (V32QI "TARGET_AVX") V16QI])
 
 (define_mode_iterator VI1_AVX512VNNI
-  [(V64QI "TARGET_AVX512VNNI") (V32QI "TARGET_AVX2") V16QI])
+  [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
 
 (define_mode_iterator VI12_256_512_AVX512VL
   [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL")
@@ -599,7 +599,7 @@
V8DI ])
 
 (define_mode_iterator VI1_AVX512VL_F
-  [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F")])
+  [V32QI (V16QI "TARGET_AVX512VL") (V64QI "TARGET_AVX512F && TARGET_EVEX512")])
 
 (define_mode_iterator VI8_AVX2_AVX512BW
   [(V8DI "TARGET_AVX512BW && TARGET_EVEX512") (V4DI "TARGET_AVX2") V2DI])
@@ -923,8 +923,8 @@
(V4DI "TARGET_AVX512VL") (V4DF "TARGET_AVX512VL")
(V4SI "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
(V2DI "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")
-   V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
-   V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")])
+   (V64QI "TARGET_EVEX512") (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")
+   (V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL") (V8HI 
"TARGET_AVX512VL")])
 
 (define_mode_iterator VI48F_256 [V8SI V8SF V4DI V4DF])
 
@@ -14217,7 +14217,7 @@
 (const_int 26) (const_int 27)
 (const_int 28) (const_int 29)
 (const_int 30) (const_int 31)])))]
-  "TARGET_AVX512VBMI && ix86_pre_reload_split ()"
+  "TARGET_AVX512VBMI && TARGET_EVEX512 && ix86_pre_reload_split ()"
   "#"
   "&& 1"
   [(set (match_dup 0)
@@ -16040,7 +16040,7 @@
   "TARGET_SSE2"
 {
   /* Try with vnni instructions.  */
-  if ((<MODE_SIZE> == 64 && TARGET_AVX512VNNI)
+  if ((<MODE_SIZE> == 64 && TARGET_AVX512VNNI && TARGET_EVEX512)
   || (<MODE_SIZE> < 64
  && ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI)))
 {
@@ -17320,7 +17320,8 @@
(V8DF "TARGET_AVX512F && TARGET_EVEX512")
(V16SI "TARGET_AVX512F && TARGET_EVEX512")
(V8DI "TARGET_AVX512F && TARGET_EVEX512")
-   (V32HI "TARGET_AVX512BW && TARGET_EVEX512") (V64QI "TARGET_AVX512VBMI")
+  

[PATCH 08/18] [PATCH 2/5] Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* config/i386/i386-builtin.def (BDESC): Add
OPTION_MASK_ISA2_EVEX512.
---
 gcc/config/i386/i386-builtin.def | 94 
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 0cc526383db..7a0dec9bc8b 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2408,37 +2408,37 @@ BDESC (OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_avx512vl_cmpv2df3_mask, "__builtin_
 BDESC (OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_avx512vl_cmpv4sf3_mask, 
"__builtin_ia32_cmpps128_mask", IX86_BUILTIN_CMPPS128_MASK, UNKNOWN, (int) 
UQI_FTYPE_V4SF_V4SF_INT_UQI)
 
 /* AVX512DQ.  */
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16sf_mask, 
"__builtin_ia32_broadcastf32x2_512_mask", IX86_BUILTIN_BROADCASTF32x2_512, 
UNKNOWN, (int) V16SF_FTYPE_V4SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16si_mask, 
"__builtin_ia32_broadcasti32x2_512_mask", IX86_BUILTIN_BROADCASTI32x2_512, 
UNKNOWN, (int) V16SI_FTYPE_V4SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv8df_mask_1, 
"__builtin_ia32_broadcastf64x2_512_mask", IX86_BUILTIN_BROADCASTF64X2_512, 
UNKNOWN, (int) V8DF_FTYPE_V2DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv8di_mask_1, 
"__builtin_ia32_broadcasti64x2_512_mask", IX86_BUILTIN_BROADCASTI64X2_512, 
UNKNOWN, (int) V8DI_FTYPE_V2DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16sf_mask_1, 
"__builtin_ia32_broadcastf32x8_512_mask", IX86_BUILTIN_BROADCASTF32X8_512, 
UNKNOWN, (int) V16SF_FTYPE_V8SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_broadcastv16si_mask_1, 
"__builtin_ia32_broadcasti32x8_512_mask", IX86_BUILTIN_BROADCASTI32X8_512, 
UNKNOWN, (int) V16SI_FTYPE_V8SI_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextractf64x2_mask, 
"__builtin_ia32_extractf64x2_512_mask", IX86_BUILTIN_EXTRACTF64X2_512, UNKNOWN, 
(int) V2DF_FTYPE_V8DF_INT_V2DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextractf32x8_mask, 
"__builtin_ia32_extractf32x8_mask", IX86_BUILTIN_EXTRACTF32X8, UNKNOWN, (int) 
V8SF_FTYPE_V16SF_INT_V8SF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextracti64x2_mask, 
"__builtin_ia32_extracti64x2_512_mask", IX86_BUILTIN_EXTRACTI64X2_512, UNKNOWN, 
(int) V2DI_FTYPE_V8DI_INT_V2DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vextracti32x8_mask, 
"__builtin_ia32_extracti32x8_mask", IX86_BUILTIN_EXTRACTI32X8, UNKNOWN, (int) 
V8SI_FTYPE_V16SI_INT_V8SI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv8df_mask, 
"__builtin_ia32_reducepd512_mask", IX86_BUILTIN_REDUCEPD512_MASK, UNKNOWN, 
(int) V8DF_FTYPE_V8DF_INT_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv16sf_mask, 
"__builtin_ia32_reduceps512_mask", IX86_BUILTIN_REDUCEPS512_MASK, UNKNOWN, 
(int) V16SF_FTYPE_V16SF_INT_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_mulv8di3_mask, 
"__builtin_ia32_pmullq512_mask", IX86_BUILTIN_PMULLQ512, UNKNOWN, (int) 
V8DI_FTYPE_V8DI_V8DI_V8DI_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_xorv8df3_mask, 
"__builtin_ia32_xorpd512_mask", IX86_BUILTIN_XORPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_xorv16sf3_mask, 
"__builtin_ia32_xorps512_mask", IX86_BUILTIN_XORPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_iorv8df3_mask, 
"__builtin_ia32_orpd512_mask", IX86_BUILTIN_ORPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_iorv16sf3_mask, 
"__builtin_ia32_orps512_mask", IX86_BUILTIN_ORPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_andv8df3_mask, 
"__builtin_ia32_andpd512_mask", IX86_BUILTIN_ANDPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_andv16sf3_mask, 
"__builtin_ia32_andps512_mask", IX86_BUILTIN_ANDPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512f_andnotv8df3_mask, 
"__builtin_ia32_andnpd512_mask", IX86_BUILTIN_ANDNPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_V8DF_UQI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512f_andnotv16sf3_mask, 
"__builtin_ia32_andnps512_mask", IX86_BUILTIN_ANDNPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vinsertf32x8_mask, 
"__builtin_ia32_insertf32x8_mask", IX86_BUILTIN_INSERTF32X8, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_vinserti32x8_mask, 
"__builtin_ia32_inserti32x8_mask", IX86_BUILTIN_INSERTI32X8, UNKNOWN, (int) 
V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512DQ, 0, 

[PATCH 00/18] Support -mevex512 for AVX512

2023-09-21 Thread Hu, Lin1
Hi all,

After previous discussion, instead of supporting option -mavx10.1, we
will first introduce option -m[no-]evex512, which will enable/disable
512 bit registers and 64 bit mask registers.

It will not change the current option behavior since if AVX512F is
enabled with no evex512 option specified, it will automatically enable
512 bit register and 64 bit mask register.

The patches are organized as follows:

Patch 1 added initial support for option -mevex512.

Patch 2-6 refined current intrin file to push evex512 target for all
512 bit intrins. Those scalar intrins remained untouched.

Patch 7-11 added OPTION_MASK_ISA2_EVEX512 for all related builtins.

Patch 12 disabled zmm register, 512 bit libmvec call for no-evex512,
also requested evex512 for vectorization when using 512 bit register.

Patch 13-17 supported evex512 in related patterns.

Patch 18 added testcases for -mno-evex512 and allowed its usage.

The patches currently cause scan-asm failures for pr89229-{5,6,7}b.c since
we now emit scalar vmovss there. When trying to use x/ymm 16+ w/o
avx512vl but with avx512f+evex512, I suppose we could emit either scalar
or zmm instructions. It is quite a rare case on HW since there is no
HW w/o avx512vl but with avx512f, so I prefer not to add maintenance
effort here just to get a slight perf improvement. But it could be changed
back to the former behavior.

Discussions are welcomed for all the patches.

Thx,
Haochen

Haochen Jiang (18):
  Initial support for -mevex512
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Push evex512 target for 512 bit intrins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Add OPTION_MASK_ISA2_EVEX512 for 512 bit builtins
  Disable zmm register and 512 bit libmvec call when !TARGET_EVEX512
  Support -mevex512 for AVX512F intrins
  Support -mevex512 for AVX512DQ intrins
  Support -mevex512 for AVX512BW intrins
  Support -mevex512 for

AVX512{IFMA,VBMI,VNNI,BF16,VPOPCNTDQ,VBMI2,BITALG,VP2INTERSECT},VAES,GFNI,VPCLMULQDQ
intrins
  Support -mevex512 for AVX512FP16 intrins
  Allow -mno-evex512 usage

 gcc/common/config/i386/i386-common.cc   |15 +
 gcc/config.gcc  |19 +-
 gcc/config/i386/avx5124fmapsintrin.h| 2 +-
 gcc/config/i386/avx5124vnniwintrin.h| 2 +-
 gcc/config/i386/avx512bf16intrin.h  |31 +-
 gcc/config/i386/avx512bitalgintrin.h|   155 +-
 gcc/config/i386/avx512bitalgvlintrin.h  |   180 +
 gcc/config/i386/avx512bwintrin.h|   291 +-
 gcc/config/i386/avx512dqintrin.h|  1840 +-
 gcc/config/i386/avx512erintrin.h| 2 +-
 gcc/config/i386/avx512fintrin.h | 19663 +-
 gcc/config/i386/avx512fp16intrin.h  |  8925 
 gcc/config/i386/avx512ifmaintrin.h  | 4 +-
 gcc/config/i386/avx512pfintrin.h| 2 +-
 gcc/config/i386/avx512vbmi2intrin.h | 4 +-
 gcc/config/i386/avx512vbmiintrin.h  | 4 +-
 gcc/config/i386/avx512vnniintrin.h  | 4 +-
 gcc/config/i386/avx512vp2intersectintrin.h  | 4 +-
 gcc/config/i386/avx512vpopcntdqintrin.h | 4 +-
 gcc/config/i386/gfniintrin.h|76 +-
 gcc/config/i386/i386-builtin.def|  1312 +-
 gcc/config/i386/i386-builtins.cc|96 +-
 gcc/config/i386/i386-c.cc   | 2 +
 gcc/config/i386/i386-expand.cc  |18 +-
 gcc/config/i386/i386-options.cc |33 +-
 gcc/config/i386/i386.cc |   168 +-
 gcc/config/i386/i386.h  | 7 +-
 gcc/config/i386/i386.md |   127 +-
 gcc/config/i386/i386.opt| 4 +
 gcc/config/i386/immintrin.h | 2 +
 gcc/config/i386/predicates.md   | 3 +-
 gcc/config/i386/sse.md  |   854 +-
 gcc/config/i386/vaesintrin.h| 4 +-
 gcc/config/i386/vpclmulqdqintrin.h  | 4 +-
 gcc/testsuite/gcc.target/i386/noevex512-1.c |13 +
 gcc/testsuite/gcc.target/i386/noevex512-2.c |13 +
 gcc/testsuite/gcc.target/i386/noevex512-3.c |13 +
 gcc/testsuite/gcc.target/i386/pr89229-5b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr89229-6b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr89229-7b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr90096.c | 2 +-
 41 files changed, 17170 insertions(+), 16738 deletions(-)
 create mode 100644 gcc/config/i386/avx512bitalgvlintrin.h
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/noevex512-3.c

-- 
2.31.1



[PATCH 01/18] Initial support for -mevex512

2023-09-21 Thread Hu, Lin1
From: Haochen Jiang 

gcc/ChangeLog:

* common/config/i386/i386-common.cc
(OPTION_MASK_ISA2_EVEX512_SET): New.
(OPTION_MASK_ISA2_EVEX512_UNSET): Ditto.
(ix86_handle_option): Handle EVEX512.
* config/i386/i386-c.cc (ix86_target_macros_internal): Ditto.
* config/i386/i386-options.cc (isa2_opts): Ditto.
(ix86_valid_target_attribute_inner_p): Ditto.
(ix86_option_override_internal): Set EVEX512 target if it is not
explicitly set when AVX512 is enabled. Disable
AVX512{PF,ER,4VNNIW,4FMAPS} for -mno-evex512.
* config/i386/i386.opt: Add mevex512. Temporarily RejectNegative.
---
 gcc/common/config/i386/i386-common.cc | 15 +++
 gcc/config/i386/i386-c.cc |  2 ++
 gcc/config/i386/i386-options.cc   | 19 ++-
 gcc/config/i386/i386.opt  |  4 
 4 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 95468b7c405..8cc59e08d06 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -123,6 +123,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_SM3_SET OPTION_MASK_ISA2_SM3
 #define OPTION_MASK_ISA2_SHA512_SET OPTION_MASK_ISA2_SHA512
 #define OPTION_MASK_ISA2_SM4_SET OPTION_MASK_ISA2_SM4
+#define OPTION_MASK_ISA2_EVEX512_SET OPTION_MASK_ISA2_EVEX512
 
 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2.  */
@@ -309,6 +310,7 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_SM3_UNSET OPTION_MASK_ISA2_SM3
 #define OPTION_MASK_ISA2_SHA512_UNSET OPTION_MASK_ISA2_SHA512
 #define OPTION_MASK_ISA2_SM4_UNSET OPTION_MASK_ISA2_SM4
+#define OPTION_MASK_ISA2_EVEX512_UNSET OPTION_MASK_ISA2_EVEX512
 
 /* SSE4 includes both SSE4.1 and SSE4.2.  -mno-sse4 should the same
as -mno-sse4.1. */
@@ -1341,6 +1343,19 @@ ix86_handle_option (struct gcc_options *opts,
}
   return true;
 
+case OPT_mevex512:
+  if (value)
+   {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_EVEX512_SET;
+   }
+  else
+   {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_EVEX512_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_EVEX512_UNSET;
+   }
+  return true;
+
 case OPT_mfma:
   if (value)
{
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 47768fa0940..93154efa7ff 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -707,6 +707,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
 def_or_undef (parse_in, "__SHA512__");
   if (isa_flag2 & OPTION_MASK_ISA2_SM4)
 def_or_undef (parse_in, "__SM4__");
+  if (isa_flag2 & OPTION_MASK_ISA2_EVEX512)
+def_or_undef (parse_in, "__EVEX512__");
   if (TARGET_IAMCU)
 {
   def_or_undef (parse_in, "__iamcu");
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index e47f9ed5d5f..a1a7a92da9f 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -250,7 +250,8 @@ static struct ix86_target_opts isa2_opts[] =
   { "-mavxvnniint16",  OPTION_MASK_ISA2_AVXVNNIINT16 },
   { "-msm3",   OPTION_MASK_ISA2_SM3 },
   { "-msha512",OPTION_MASK_ISA2_SHA512 },
-  { "-msm4",OPTION_MASK_ISA2_SM4 }
+  { "-msm4",OPTION_MASK_ISA2_SM4 },
+  { "-mevex512",OPTION_MASK_ISA2_EVEX512 }
 };
 static struct ix86_target_opts isa_opts[] =
 {
@@ -1109,6 +1110,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree 
args, char *p_strings[],
 IX86_ATTR_ISA ("sm3", OPT_msm3),
 IX86_ATTR_ISA ("sha512", OPT_msha512),
 IX86_ATTR_ISA ("sm4", OPT_msm4),
+IX86_ATTR_ISA ("evex512", OPT_mevex512),
 
 /* enum options */
 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
@@ -2559,6 +2561,21 @@ ix86_option_override_internal (bool main_args_p,
   &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
   & ~opts->x_ix86_isa_flags_explicit);
 
+  /* Set EVEX512 target if it is not explicitly set
+ when AVX512 is enabled.  */
+  if (TARGET_AVX512F_P(opts->x_ix86_isa_flags)
+  && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_EVEX512))
+opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512;
+
+  /* Disable AVX512{PF,ER,4VNNIW,4FAMPS} for -mno-evex512.  */
+  if (!TARGET_EVEX512_P(opts->x_ix86_isa_flags2))
+{
+  opts->x_ix86_isa_flags
+   &= ~(OPTION_MASK_ISA_AVX512PF | OPTION_MASK_ISA_AVX512ER);
+  opts->x_ix86_isa_flags2
+   &= ~(OPTION_MASK_ISA2_AVX5124FMAPS | OPTION_MASK_ISA2_AVX5124VNNIW);
+}
+
   /* Validate -mpreferred-stack-boundary= value or default it to
  PREFERRED_STACK_BOUNDARY_DEFAULT.  */
   ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
diff 

[r14-4046 Regression] FAIL: 23_containers/vector/bool/110807.cc -std=gnu++17 (test for excess errors) on Linux/x86_64

2023-09-17 Thread Hu, Lin1 via Gcc-patches


On Linux/x86_64,

3a0e01f6bb1d6ec444001f2caea6ef43a4a83e3a is the first bad commit commit 
3a0e01f6bb1d6ec444001f2caea6ef43a4a83e3a
Author: Jonathan Wakely 
Date:   Fri Sep 1 21:27:57 2023 +0100

libstdc++: Add support for running tests with multiple -std options

caused

FAIL: 23_containers/vector/bool/110807.cc  -std=gnu++17 (test for excess errors)

with GCC configured with

../../gcc/configure 
--prefix=/export/users/haochenj/src/gcc-bisect/master/master/r14-4046/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check 
RUNTESTFLAGS="conformance.exp=23_containers/vector/bool/110807.cc 
--target_board='unix{-m32}'"
$ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check 
RUNTESTFLAGS="conformance.exp=23_containers/vector/bool/110807.cc 
--target_board='unix{-m32\ -march=cascadelake}'"

(Please do not reply to this email; for questions about this report, contact me
at lin1 dot hu at intel.com.) (If you run into cascadelake-related problems,
disabling AVX512F on the command line might work around them.) (However, please
make sure that there are no potential problems with AVX512.)


[PATCH] Add myself for write after approval

2023-07-31 Thread Hu, Lin1 via Gcc-patches
ChangeLog:

* MAINTAINERS (Write After Approval): Add myself.
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 49aa6bae73b..90e2c81f0c2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -460,6 +460,7 @@ Matthew Hiller  

 Kazu Hirata
 Manfred Hollstein  
 Cong Hou   
+Lin Hu 
 Falk Hueffner  
 Andrew John Hughes 
 Dominique d'Humieres   
-- 
2.31.1



[PATCH] i386: refactor macros.

2023-06-28 Thread Hu, Lin1 via Gcc-patches
Hi, all

This patch aims to refactor macros in case some other thing is added to
AMX_TILE_SET in future. OK for trunk?

BRs,
Lin

gcc/ChangeLog:

* common/config/i386/i386-common.cc (OPTION_MASK_ISA2_AMX_INT8_SET):
Change OPTION_MASK_ISA2_AMX_TILE to OPTION_MASK_ISA2_AMX_TILE_SET.
(OPTION_MASK_ISA2_AMX_FP16_SET): Ditto
(OPTION_MASK_ISA2_AMX_COMPLEX_SET): Ditto
(OPTION_MASK_ISA_ABM_SET):
Change OPTION_MASK_ISA_POPCNT to OPTION_MASK_ISA_POPCNT_SET.
---
 gcc/common/config/i386/i386-common.cc | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index bf126f14073..4f79afba917 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -107,18 +107,18 @@ along with GCC; see the file COPYING3.  If not see
 #define OPTION_MASK_ISA2_AVX512VP2INTERSECT_SET 
OPTION_MASK_ISA2_AVX512VP2INTERSECT
 #define OPTION_MASK_ISA2_AMX_TILE_SET OPTION_MASK_ISA2_AMX_TILE
 #define OPTION_MASK_ISA2_AMX_INT8_SET \
-  (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_INT8)
+  (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_INT8)
 #define OPTION_MASK_ISA2_AMX_BF16_SET \
-  (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_BF16)
+  (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_BF16)
 #define OPTION_MASK_ISA2_AVXVNNIINT8_SET OPTION_MASK_ISA2_AVXVNNIINT8
 #define OPTION_MASK_ISA2_AVXNECONVERT_SET OPTION_MASK_ISA2_AVXNECONVERT
 #define OPTION_MASK_ISA2_CMPCCXADD_SET OPTION_MASK_ISA2_CMPCCXADD
 #define OPTION_MASK_ISA2_AMX_FP16_SET \
-  (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_FP16)
+  (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_FP16)
 #define OPTION_MASK_ISA2_PREFETCHI_SET OPTION_MASK_ISA2_PREFETCHI
 #define OPTION_MASK_ISA2_RAOINT_SET OPTION_MASK_ISA2_RAOINT
 #define OPTION_MASK_ISA2_AMX_COMPLEX_SET \
-  (OPTION_MASK_ISA2_AMX_TILE | OPTION_MASK_ISA2_AMX_COMPLEX)
+  (OPTION_MASK_ISA2_AMX_TILE_SET | OPTION_MASK_ISA2_AMX_COMPLEX)
 
 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
as -msse4.2.  */
@@ -143,7 +143,7 @@ along with GCC; see the file COPYING3.  If not see
   (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2_SET)
 
 #define OPTION_MASK_ISA_ABM_SET \
-  (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
+  (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT_SET)
 
 #define OPTION_MASK_ISA2_PCONFIG_SET OPTION_MASK_ISA2_PCONFIG
 #define OPTION_MASK_ISA2_WBNOINVD_SET OPTION_MASK_ISA2_WBNOINVD
-- 
2.31.1



RE: [PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli}

2023-05-25 Thread Hu, Lin1 via Gcc-patches
OK, I have updated the ChangeLog and adjusted part of the formatting. The
attached file is the new version.

-Original Message-
From: Hongtao Liu  
Sent: Thursday, May 25, 2023 11:40 AM
To: Hu, Lin1 
Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; 
ubiz...@gmail.com
Subject: Re: [PATCH] i386: Fix incorrect intrinsic signature for AVX512 
s{lli|rai|rli}

On Thu, May 25, 2023 at 10:55 AM Hu, Lin1 via Gcc-patches
 wrote:
>
> Hi all,
>
> This patch aims to fix incorrect intrinsic signature for 
> _mm{512|256|}_s{lli|rai|rli}_epi*. And it has been tested on 
> x86_64-pc-linux-gnu. OK for trunk?
>
> BRs,
> Lin
>
> gcc/ChangeLog:
>
> PR target/109173
> PR target/109174
> * config/i386/avx512bwintrin.h (_mm512_srli_epi16): Change type from
> int to const int.
int to unsigned int or const int to const unsigned int.
Others LGTM.
> (_mm512_mask_srli_epi16): Ditto.
> (_mm512_slli_epi16): Ditto.
> (_mm512_mask_slli_epi16): Ditto.
> (_mm512_maskz_slli_epi16): Ditto.
> (_mm512_srai_epi16): Ditto.
> (_mm512_mask_srai_epi16): Ditto.
> (_mm512_maskz_srai_epi16): Ditto.
> * config/i386/avx512vlintrin.h (_mm256_mask_srli_epi32): Ditto.
> (_mm256_maskz_srli_epi32): Ditto.
> (_mm_mask_srli_epi32): Ditto.
> (_mm_maskz_srli_epi32): Ditto.
> (_mm256_mask_srli_epi64): Ditto.
> (_mm256_maskz_srli_epi64): Ditto.
> (_mm_mask_srli_epi64): Ditto.
> (_mm_maskz_srli_epi64): Ditto.
> (_mm256_mask_srai_epi32): Ditto.
> (_mm256_maskz_srai_epi32): Ditto.
> (_mm_mask_srai_epi32): Ditto.
> (_mm_maskz_srai_epi32): Ditto.
> (_mm256_srai_epi64): Ditto.
> (_mm256_mask_srai_epi64): Ditto.
> (_mm256_maskz_srai_epi64): Ditto.
> (_mm_srai_epi64): Ditto.
> (_mm_mask_srai_epi64): Ditto.
> (_mm_maskz_srai_epi64): Ditto.
> (_mm_mask_slli_epi32): Ditto.
> (_mm_maskz_slli_epi32): Ditto.
> (_mm_mask_slli_epi64): Ditto.
> (_mm_maskz_slli_epi64): Ditto.
> (_mm256_mask_slli_epi32): Ditto.
> (_mm256_maskz_slli_epi32): Ditto.
> (_mm256_mask_slli_epi64): Ditto.
> (_mm256_maskz_slli_epi64): Ditto.
> (_mm_mask_srai_epi16): Ditto.
> (_mm_maskz_srai_epi16): Ditto.
> (_mm256_srai_epi16): Ditto.
> (_mm256_mask_srai_epi16): Ditto.
> (_mm_mask_slli_epi16): Ditto.
> (_mm_maskz_slli_epi16): Ditto.
> (_mm256_mask_slli_epi16): Ditto.
> (_mm256_maskz_slli_epi16): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> PR target/109173
> PR target/109174
> * gcc.target/i386/pr109173-1.c: New test.
> * gcc.target/i386/pr109174-1.c: Ditto.
> ---
>  gcc/config/i386/avx512bwintrin.h   |  32 +++---
>  gcc/config/i386/avx512fintrin.h|  58 +++
>  gcc/config/i386/avx512vlbwintrin.h |  36 ---
>  gcc/config/i386/avx512vlintrin.h   | 112 +++--
>  gcc/testsuite/gcc.target/i386/pr109173-1.c |  57 +++
>  gcc/testsuite/gcc.target/i386/pr109174-1.c |  45 +
>  6 files changed, 236 insertions(+), 104 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109173-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109174-1.c
>
> diff --git a/gcc/config/i386/avx512bwintrin.h 
> b/gcc/config/i386/avx512bwintrin.h
> index 89790f7917b..791d4e35f32 100644
> --- a/gcc/config/i386/avx512bwintrin.h
> +++ b/gcc/config/i386/avx512bwintrin.h
> @@ -2880,7 +2880,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, 
> __m512i __B,
>
>  extern __inline __m512i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm512_srli_epi16 (__m512i __A, const int __imm)
> +_mm512_srli_epi16 (__m512i __A, const unsigned int __imm)
>  {
>return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
>   (__v32hi)
> @@ -2891,7 +2891,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm)
>  extern __inline __m512i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
> -   const int __imm)
> +   const unsigned int __imm)
>  {
>return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
>   (__v32hi) __W,
> @@ -2910,7 +2910,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, 
> const int __imm)
>
>  extern __inline __m512i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)

[PATCH] i386: Fix incorrect intrinsic signature for AVX512 s{lli|rai|rli}

2023-05-24 Thread Hu, Lin1 via Gcc-patches
Hi all,

This patch aims to fix incorrect intrinsic signature for 
_mm{512|256|}_s{lli|rai|rli}_epi*. And it has been tested on 
x86_64-pc-linux-gnu. OK for trunk?

BRs,
Lin

gcc/ChangeLog:

PR target/109173
PR target/109174
* config/i386/avx512bwintrin.h (_mm512_srli_epi16): Change type from
int to unsigned int or const int to const unsigned int.
(_mm512_mask_srli_epi16): Ditto.
(_mm512_slli_epi16): Ditto.
(_mm512_mask_slli_epi16): Ditto.
(_mm512_maskz_slli_epi16): Ditto.
(_mm512_srai_epi16): Ditto.
(_mm512_mask_srai_epi16): Ditto.
(_mm512_maskz_srai_epi16): Ditto.
* config/i386/avx512vlintrin.h (_mm256_mask_srli_epi32): Ditto.
(_mm256_maskz_srli_epi32): Ditto.
(_mm_mask_srli_epi32): Ditto.
(_mm_maskz_srli_epi32): Ditto.
(_mm256_mask_srli_epi64): Ditto.
(_mm256_maskz_srli_epi64): Ditto.
(_mm_mask_srli_epi64): Ditto.
(_mm_maskz_srli_epi64): Ditto.
(_mm256_mask_srai_epi32): Ditto.
(_mm256_maskz_srai_epi32): Ditto.
(_mm_mask_srai_epi32): Ditto.
(_mm_maskz_srai_epi32): Ditto.
(_mm256_srai_epi64): Ditto.
(_mm256_mask_srai_epi64): Ditto.
(_mm256_maskz_srai_epi64): Ditto.
(_mm_srai_epi64): Ditto.
(_mm_mask_srai_epi64): Ditto.
(_mm_maskz_srai_epi64): Ditto.
(_mm_mask_slli_epi32): Ditto.
(_mm_maskz_slli_epi32): Ditto.
(_mm_mask_slli_epi64): Ditto.
(_mm_maskz_slli_epi64): Ditto.
(_mm256_mask_slli_epi32): Ditto.
(_mm256_maskz_slli_epi32): Ditto.
(_mm256_mask_slli_epi64): Ditto.
(_mm256_maskz_slli_epi64): Ditto.
(_mm_mask_srai_epi16): Ditto.
(_mm_maskz_srai_epi16): Ditto.
(_mm256_srai_epi16): Ditto.
(_mm256_mask_srai_epi16): Ditto.
(_mm_mask_slli_epi16): Ditto.
(_mm_maskz_slli_epi16): Ditto.
(_mm256_mask_slli_epi16): Ditto.
(_mm256_maskz_slli_epi16): Ditto.

gcc/testsuite/ChangeLog:

PR target/109173
PR target/109174
* gcc.target/i386/pr109173-1.c: New test.
* gcc.target/i386/pr109174-1.c: Ditto.
---
 gcc/config/i386/avx512bwintrin.h   |  32 +++---
 gcc/config/i386/avx512fintrin.h|  58 +++
 gcc/config/i386/avx512vlbwintrin.h |  36 ---
 gcc/config/i386/avx512vlintrin.h   | 112 +++--
 gcc/testsuite/gcc.target/i386/pr109173-1.c |  57 +++
 gcc/testsuite/gcc.target/i386/pr109174-1.c |  45 +
 6 files changed, 236 insertions(+), 104 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109173-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109174-1.c

diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h
index 89790f7917b..791d4e35f32 100644
--- a/gcc/config/i386/avx512bwintrin.h
+++ b/gcc/config/i386/avx512bwintrin.h
@@ -2880,7 +2880,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, 
__m512i __B,
 
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_srli_epi16 (__m512i __A, const int __imm)
+_mm512_srli_epi16 (__m512i __A, const unsigned int __imm)
 {
   return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
  (__v32hi)
@@ -2891,7 +2891,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm)
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-   const int __imm)
+   const unsigned int __imm)
 {
   return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
  (__v32hi) __W,
@@ -2910,7 +2910,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, 
const int __imm)
 
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_slli_epi16 (__m512i __A, const int __B)
+_mm512_slli_epi16 (__m512i __A, const unsigned int __B)
 {
   return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
  (__v32hi)
@@ -2921,7 +2921,7 @@ _mm512_slli_epi16 (__m512i __A, const int __B)
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
-   const int __B)
+   const unsigned int __B)
 {
   return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
  (__v32hi) __W,
@@ -2930,7 +2930,7 @@ _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, 
__m512i __A,
 
 extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B)
+_mm512_maskz_slli_epi16 (__mmask32 __U, 

RE: [PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics

2023-04-18 Thread Hu, Lin1 via Gcc-patches
More details: the Intrinsics Guide adds these 128/256-bit intrinsics, as follows:
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=reduce__expand=5814.

So we intend to enable these intrinsics for GCC-14.

-Original Message-
From: Gcc-patches  On Behalf 
Of Hu, Lin1 via Gcc-patches
Sent: Tuesday, April 18, 2023 3:03 PM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao ; ubiz...@gmail.com
Subject: [PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics

Hi all,

The patch aims to support reduce_*_ep[i|u][8|16] series intrinsics, and has 
been tested on x86_64-pc-linux-gnu. OK for trunk?

BRs,
Lin

gcc/ChangeLog:

* config/i386/avx2intrin.h
(_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro.
(_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
(_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto.
(_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
(_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
(_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
(_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
(_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
(_mm_reduce_add_epi16): New intrinsics.
(_mm_reduce_mul_epi16): Ditto.
(_mm_reduce_and_epi16): Ditto.
(_mm_reduce_or_epi16): Ditto.
(_mm_reduce_max_epi16): Ditto.
(_mm_reduce_max_epu16): Ditto.
(_mm_reduce_min_epi16): Ditto.
(_mm_reduce_min_epu16): Ditto.
(_mm256_reduce_add_epi16): Ditto.
(_mm256_reduce_mul_epi16): Ditto.
(_mm256_reduce_and_epi16): Ditto.
(_mm256_reduce_or_epi16): Ditto.
(_mm256_reduce_max_epi16): Ditto.
(_mm256_reduce_max_epu16): Ditto.
(_mm256_reduce_min_epi16): Ditto.
(_mm256_reduce_min_epu16): Ditto.
(_mm_reduce_add_epi8): Ditto.
(_mm_reduce_mul_epi8): Ditto.
(_mm_reduce_and_epi8): Ditto.
(_mm_reduce_or_epi8): Ditto.
(_mm_reduce_max_epi8): Ditto.
(_mm_reduce_max_epu8): Ditto.
(_mm_reduce_min_epi8): Ditto.
(_mm_reduce_min_epu8): Ditto.
(_mm256_reduce_add_epi8): Ditto.
(_mm256_reduce_mul_epi8): Ditto.
(_mm256_reduce_and_epi8): Ditto.
(_mm256_reduce_or_epi8): Ditto.
(_mm256_reduce_max_epi8): Ditto.
(_mm256_reduce_max_epu8): Ditto.
(_mm256_reduce_min_epi8): Ditto.
(_mm256_reduce_min_epu8): Ditto.
* config/i386/avx512vlbwintrin.h:
(_mm_mask_reduce_add_epi16): Ditto.
(_mm_mask_reduce_mul_epi16): Ditto.
(_mm_mask_reduce_and_epi16): Ditto.
(_mm_mask_reduce_or_epi16): Ditto.
(_mm_mask_reduce_max_epi16): Ditto.
(_mm_mask_reduce_max_epu16): Ditto.
(_mm_mask_reduce_min_epi16): Ditto.
(_mm_mask_reduce_min_epu16): Ditto.
(_mm256_mask_reduce_add_epi16): Ditto.
(_mm256_mask_reduce_mul_epi16): Ditto.
(_mm256_mask_reduce_and_epi16): Ditto.
(_mm256_mask_reduce_or_epi16): Ditto.
(_mm256_mask_reduce_max_epi16): Ditto.
(_mm256_mask_reduce_max_epu16): Ditto.
(_mm256_mask_reduce_min_epi16): Ditto.
(_mm256_mask_reduce_min_epu16): Ditto.
(_mm_mask_reduce_add_epi8): Ditto.
(_mm_mask_reduce_mul_epi8): Ditto.
(_mm_mask_reduce_and_epi8): Ditto.
(_mm_mask_reduce_or_epi8): Ditto.
(_mm_mask_reduce_max_epi8): Ditto.
(_mm_mask_reduce_max_epu8): Ditto.
(_mm_mask_reduce_min_epi8): Ditto.
(_mm_mask_reduce_min_epu8): Ditto.
(_mm256_mask_reduce_add_epi8): Ditto.
(_mm256_mask_reduce_mul_epi8): Ditto.
(_mm256_mask_reduce_and_epi8): Ditto.
(_mm256_mask_reduce_or_epi8): Ditto.
(_mm256_mask_reduce_max_epi8): Ditto.
(_mm256_mask_reduce_max_epu8): Ditto.
(_mm256_mask_reduce_min_epi8): Ditto.
(_mm256_mask_reduce_min_epu8): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vlbw-reduce-op-1.c: New test.
---
 gcc/config/i386/avx2intrin.h  | 347 ++
 gcc/config/i386/avx512vlbwintrin.h| 256 +
 .../gcc.target/i386/avx512vlbw-reduce-op-1.c  | 206 +++
 3 files changed, 809 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c

diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 
1b9c8169a96..9b8c13b7233 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const 
*__base,
   (int) (SCALE))
 #endif  /* __OPTIMIZE__ */
 
+#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \
+  __v8hi __T1 = (__v8hi)__W; \
+  __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 
+6, 7); \
+  __v8hi __T3 = __T1 op __T2; \
+  __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 
+6, 7); \
+  __v8hi __T5 = __T3 op __T4; \
+  __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1

[PATCH] i386: Add reduce_*_ep[i|u][8|16] series intrinsics

2023-04-18 Thread Hu, Lin1 via Gcc-patches
Hi all,

The patch aims to support reduce_*_ep[i|u][8|16] series intrinsics, and
has been tested on x86_64-pc-linux-gnu. OK for trunk?

BRs,
Lin

gcc/ChangeLog:

* config/i386/avx2intrin.h
(_MM_REDUCE_OPERATOR_BASIC_EPI16): New macro.
(_MM_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
(_MM256_REDUCE_OPERATOR_BASIC_EPI16): Ditto.
(_MM256_REDUCE_OPERATOR_MAX_MIN_EP16): Ditto.
(_MM_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
(_MM_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
(_MM256_REDUCE_OPERATOR_BASIC_EPI8): Ditto.
(_MM256_REDUCE_OPERATOR_MAX_MIN_EP8): Ditto.
(_mm_reduce_add_epi16): New intrinsics.
(_mm_reduce_mul_epi16): Ditto.
(_mm_reduce_and_epi16): Ditto.
(_mm_reduce_or_epi16): Ditto.
(_mm_reduce_max_epi16): Ditto.
(_mm_reduce_max_epu16): Ditto.
(_mm_reduce_min_epi16): Ditto.
(_mm_reduce_min_epu16): Ditto.
(_mm256_reduce_add_epi16): Ditto.
(_mm256_reduce_mul_epi16): Ditto.
(_mm256_reduce_and_epi16): Ditto.
(_mm256_reduce_or_epi16): Ditto.
(_mm256_reduce_max_epi16): Ditto.
(_mm256_reduce_max_epu16): Ditto.
(_mm256_reduce_min_epi16): Ditto.
(_mm256_reduce_min_epu16): Ditto.
(_mm_reduce_add_epi8): Ditto.
(_mm_reduce_mul_epi8): Ditto.
(_mm_reduce_and_epi8): Ditto.
(_mm_reduce_or_epi8): Ditto.
(_mm_reduce_max_epi8): Ditto.
(_mm_reduce_max_epu8): Ditto.
(_mm_reduce_min_epi8): Ditto.
(_mm_reduce_min_epu8): Ditto.
(_mm256_reduce_add_epi8): Ditto.
(_mm256_reduce_mul_epi8): Ditto.
(_mm256_reduce_and_epi8): Ditto.
(_mm256_reduce_or_epi8): Ditto.
(_mm256_reduce_max_epi8): Ditto.
(_mm256_reduce_max_epu8): Ditto.
(_mm256_reduce_min_epi8): Ditto.
(_mm256_reduce_min_epu8): Ditto.
* config/i386/avx512vlbwintrin.h:
(_mm_mask_reduce_add_epi16): Ditto.
(_mm_mask_reduce_mul_epi16): Ditto.
(_mm_mask_reduce_and_epi16): Ditto.
(_mm_mask_reduce_or_epi16): Ditto.
(_mm_mask_reduce_max_epi16): Ditto.
(_mm_mask_reduce_max_epu16): Ditto.
(_mm_mask_reduce_min_epi16): Ditto.
(_mm_mask_reduce_min_epu16): Ditto.
(_mm256_mask_reduce_add_epi16): Ditto.
(_mm256_mask_reduce_mul_epi16): Ditto.
(_mm256_mask_reduce_and_epi16): Ditto.
(_mm256_mask_reduce_or_epi16): Ditto.
(_mm256_mask_reduce_max_epi16): Ditto.
(_mm256_mask_reduce_max_epu16): Ditto.
(_mm256_mask_reduce_min_epi16): Ditto.
(_mm256_mask_reduce_min_epu16): Ditto.
(_mm_mask_reduce_add_epi8): Ditto.
(_mm_mask_reduce_mul_epi8): Ditto.
(_mm_mask_reduce_and_epi8): Ditto.
(_mm_mask_reduce_or_epi8): Ditto.
(_mm_mask_reduce_max_epi8): Ditto.
(_mm_mask_reduce_max_epu8): Ditto.
(_mm_mask_reduce_min_epi8): Ditto.
(_mm_mask_reduce_min_epu8): Ditto.
(_mm256_mask_reduce_add_epi8): Ditto.
(_mm256_mask_reduce_mul_epi8): Ditto.
(_mm256_mask_reduce_and_epi8): Ditto.
(_mm256_mask_reduce_or_epi8): Ditto.
(_mm256_mask_reduce_max_epi8): Ditto.
(_mm256_mask_reduce_max_epu8): Ditto.
(_mm256_mask_reduce_min_epi8): Ditto.
(_mm256_mask_reduce_min_epu8): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vlbw-reduce-op-1.c: New test.
---
 gcc/config/i386/avx2intrin.h  | 347 ++
 gcc/config/i386/avx512vlbwintrin.h| 256 +
 .../gcc.target/i386/avx512vlbw-reduce-op-1.c  | 206 +++
 3 files changed, 809 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vlbw-reduce-op-1.c

diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 1b9c8169a96..9b8c13b7233 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -1915,6 +1915,353 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const 
*__base,
   (int) (SCALE))
 #endif  /* __OPTIMIZE__ */
 
+#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) \
+  __v8hi __T1 = (__v8hi)__W; \
+  __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); \
+  __v8hi __T3 = __T1 op __T2; \
+  __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); \
+  __v8hi __T5 = __T3 op __T4; \
+  __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); \
+  __v8hi __T7 = __T5 op __T6; \
+  return __T7[0]
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_add_epi16 (__m128i __W)
+{
+  _MM_REDUCE_OPERATOR_BASIC_EPI16 (+);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_mul_epi16 (__m128i __W)
+{
+  _MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, 

[PATCH] i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm

2023-04-18 Thread Hu, Lin1 via Gcc-patches
Hi, all

The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128.
It has been regtested on x86_64-pc-linux-gnu. OK for trunk?

Thanks.
Lin

vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk.
We can optimize them to vblend or vmovaps when there's no cross-lane movement.

gcc/ChangeLog:

* config/i386/sse.md: Modify insn vperm{i,f}
and vshuf{i,f}.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test.
* gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-1.c: New test.
* gcc.target/i386/opt-vperm-vshuf-2.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-3.c: Ditto.
---
 gcc/config/i386/sse.md| 36 --
 .../gcc.target/i386/avx512vl-vshuff32x4-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshuff64x2-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshufi32x4-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshufi64x2-1.c   |  2 +-
 .../gcc.target/i386/opt-vperm-vshuf-1.c   | 51 ++
 .../gcc.target/i386/opt-vperm-vshuf-2.c   | 68 +++
 .../gcc.target/i386/opt-vperm-vshuf-3.c   | 63 +
 8 files changed, 218 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 513960e8f33..5b6b2427460 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18437,6 +18437,8 @@
   mask = INTVAL (operands[3]) / 2;
   mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
   operands[3] = GEN_INT (mask);
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
   return "vshuf64x2\t{%3, %2, %1, 
%0|%0, %1, %2, %3}";
 }
   [(set_attr "type" "sselog")
@@ -18595,6 +18597,9 @@
   mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
   operands[3] = GEN_INT (mask);
 
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+
   return "vshuf32x4\t{%3, %2, %1, 
%0|%0, %1, %2, %3}";
 }
   [(set_attr "type" "sselog")
@@ -25663,7 +25668,28 @@
   (match_operand:SI 3 "const_0_to_255_operand")]
  UNSPEC_VPERMTI))]
   "TARGET_AVX2"
-  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  {
+int mask = INTVAL (operands[3]);
+if ((mask & 0xbb) == 16)
+  {
+   if (rtx_equal_p (operands[0], operands[1]))
+ return "";
+   else
+ return "vmovaps\t{%1, %0|%0, %1}";
+  }
+if ((mask & 0xbb) == 50)
+  {
+   if (rtx_equal_p (operands[0], operands[2]))
+ return "";
+   else
+ return "vmovaps\t{%2, %0|%0, %2}";
+  }
+if ((mask & 0xbb) == 18)
+  return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+if ((mask & 0xbb) == 48)
+  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+  }
   [(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
@@ -26226,9 +26252,11 @@
&& avx_vperm2f128_parallel (operands[3], mode)"
 {
   int mask = avx_vperm2f128_parallel (operands[3], mode) - 1;
-  if (mask == 0x12)
-return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
-  if (mask == 0x20)
+  if ((mask & 0xbb) == 0x12)
+return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+  if ((mask & 0xbb) == 0x30)
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+  if ((mask & 0xbb) == 0x20)
 return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
   operands[3] = GEN_INT (mask);
   return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}";
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
index 6c2fb2f184a..02aecf4edce 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
@@ -12,7 +12,7 @@ volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_f32x4 (x, x, 2);
+  x = _mm256_shuffle_f32x4 (x, x, 3);
   x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
index 1191b400134..563ded5d9df 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
@@ -12,7 +12,7 @@ volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_f64x2 (x, x, 2);
+  x = _mm256_shuffle_f64x2 (x, x, 3);
   x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2);
 }
diff --git 

RE: [PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins

2023-03-14 Thread Hu, Lin1 via Gcc-patches
It has been regtested on x86_64-pc-linux-gnu. OK for trunk?

Thanks.
Lin

-Original Message-
From: Uros Bizjak  
Sent: Tuesday, March 14, 2023 3:05 PM
To: Hu, Lin1 
Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
Subject: Re: [PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in 
i386-builtin.def for VAES builtins

On Tue, Mar 14, 2023 at 7:2 AM Hu, Lin1  wrote:
>
> The implementation of these builtins requires support for both 
> AVX512VL and VAES. However, the builtins didn't request AVX512VL. As a 
> result, compiling pr109117-1.c with the options -mvaes -mno-avx512vl caused 
> an ICE.
>
> This patch aims to fix the bug.
>
> gcc/ChangeLog:
>
> PR target/109117
> * config/i386/i386-builtin.def (__builtin_ia32_vaesdec_v16qi,
> __builtin_ia32_vaesdeclast_v16qi,__builtin_ia32_vaesenc_v16qi,
> __builtin_ia32_vaesenclast_v16qi): Require OPTION_MASK_ISA_AVX512VL.
>
> gcc/testsuite/ChangeLog:
>
> PR target/109117
> * gcc.target/i386/pr109117-1.c: New test.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-builtin.def   |  8 
>  gcc/testsuite/gcc.target/i386/pr109117-1.c | 14 ++
>  2 files changed, 18 insertions(+), 4 deletions(-)  create mode 100644 
> gcc/testsuite/gcc.target/i386/pr109117-1.c
>
> diff --git a/gcc/config/i386/i386-builtin.def 
> b/gcc/config/i386/i386-builtin.def
> index f1c295c34f6..17dfe40fac7 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -2797,16 +2797,16 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, 
> CODE_FOR_avx5124vnniw_vp4dpwssds_mask,
>  BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, 
> "__builtin_ia32_rdpid", IX86_BUILTIN_RDPID, UNKNOWN, (int) 
> UNSIGNED_FTYPE_VOID)
>
>  /* VAES.  */
> -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, 
> "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) 
> V16QI_FTYPE_V16QI_V16QI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
> +CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", 
> +IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
>  BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, 
> "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) 
> V32QI_FTYPE_V32QI_V32QI)  BDESC (0, OPTION_MASK_ISA2_VAES, 
> CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", 
> IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC 
> (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, 
> "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, 
> UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
> +CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", 
> +IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
>  BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, 
> "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, 
> UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)  BDESC (0, 
> OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, 
> "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, 
> UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC (0, 
> OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, 
> "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) 
> V16QI_FTYPE_V16QI_V16QI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
> +CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", 
> +IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
>  BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, 
> "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) 
> V32QI_FTYPE_V32QI_V32QI)  BDESC (0, OPTION_MASK_ISA2_VAES, 
> CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", 
> IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) -BDESC 
> (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, 
> "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, 
> UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
> +CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", 
> +IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
>  BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, 
> "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, 
> UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)  BDESC (0, 
> OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, 
> "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, 
> UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
>
> diff --git a/gcc/

[PATCH] i386:Add missing OPTION_MASK_ISA_AVX512VL in i386-builtin.def for VAES builtins

2023-03-14 Thread Hu, Lin1 via Gcc-patches
The implementation of these builtins requires support for both AVX512VL and
VAES. However, the builtins didn't request AVX512VL. As a result, compiling
pr109117-1.c with the options -mvaes -mno-avx512vl caused an ICE.

This patch aims to fix the bug.

gcc/ChangeLog:

PR target/109117
* config/i386/i386-builtin.def (__builtin_ia32_vaesdec_v16qi,
__builtin_ia32_vaesdeclast_v16qi,__builtin_ia32_vaesenc_v16qi,
__builtin_ia32_vaesenclast_v16qi): Require OPTION_MASK_ISA_AVX512VL.

gcc/testsuite/ChangeLog:

PR target/109117
* gcc.target/i386/pr109117-1.c: New test.
---
 gcc/config/i386/i386-builtin.def   |  8 
 gcc/testsuite/gcc.target/i386/pr109117-1.c | 14 ++
 2 files changed, 18 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109117-1.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index f1c295c34f6..17dfe40fac7 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2797,16 +2797,16 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, 
CODE_FOR_avx5124vnniw_vp4dpwssds_mask,
 BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, "__builtin_ia32_rdpid", 
IX86_BUILTIN_RDPID, UNKNOWN, (int) UNSIGNED_FTYPE_VOID)
 
 /* VAES.  */
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, 
"__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, 
UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, 
"__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, 
"__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, 
"__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", 
IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, 
"__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, 
"__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, 
"__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, 
UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, 
"__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, 
"__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, 
"__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI)
+BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, 
CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", 
IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, 
"__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
 BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, 
"__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) 
V64QI_FTYPE_V64QI_V64QI)
 
diff --git a/gcc/testsuite/gcc.target/i386/pr109117-1.c 
b/gcc/testsuite/gcc.target/i386/pr109117-1.c
new file mode 100644
index 000..87a5c0e7fc9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109117-1.c
@@ -0,0 +1,14 @@
+/* PR target/109117 */
+/* { dg-do compile } */
+/* { dg-options "-mvaes -mno-avx512vl" } */
+
+typedef char __v16qi __attribute__ ((__vector_size__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16), 
__aligned__(16)));
+volatile __v16qi x, y;
+volatile __m128i res;
+
+void
+foo (void)
+{
+  res = __builtin_ia32_vaesdec_v16qi (x, y); /* { dg-warning "implicit 
declaration of function" } */
+} /* { dg-error "incompatible types when assigning to type" "" { target 
*-*-* } .-1 } */
-- 
2.31.1



RE: [PATCH] loading float member of parameter stored via int registers

2023-01-03 Thread Hu, Lin1 via Gcc-patches
Sorry for sending this mail. I entered the wrong command line.

-Original Message-
From: Gcc-patches  On Behalf 
Of Segher Boessenkool
Sent: Tuesday, January 3, 2023 5:00 PM
To: Andrew Pinski 
Cc: Jiufu Guo ; Jiufu Guo via Gcc-patches 
; Richard Biener ; Richard 
Biener ; dje@gmail.com; li...@gcc.gnu.org; 
jeffreya...@gmail.com
Subject: Re: [PATCH] loading float member of parameter stored via int registers

Hi!

On Fri, Dec 30, 2022 at 12:30:04AM -0800, Andrew Pinski wrote:
> On Thu, Dec 29, 2022 at 11:45 PM Segher Boessenkool 
>  wrote:
> > Ah!  This simply shows rs6000_modes_tieable_p is decidedly non-optimal:
> > it does not allow tying a scalar float to anything else.  No such 
> > thing is required, or good apparently.  I wonder why we have such 
> > restrictions at all in rs6000; is it just unfortunate history, was 
> > it good at one point in time?
> 
> The documentation for TARGET_MODES_TIEABLE_P says the following:
> If TARGET_HARD_REGNO_MODE_OK (r, mode1) and TARGET_HARD_REGNO_MODE_OK 
> (r, mode2) are always the same for any r, then TARGET_MODES_TIEABLE_P 
> (mode1, mode2) should be true. If they differ for any r, you should 
> define this hook to return false unless some other mechanism ensures 
> the accessibility of the value in a narrower mode.
> 
> even though rs6000_hard_regno_mode_ok_uncached's comment has the following:
>   /* The float registers (except for VSX vector modes) can only hold floating
>  modes and DImode.  */

That comment is incorrect.  See fctiw for example, which defines only the 
SImode part of the result (the other bits are undefined).

> TARGET_P8_VECTOR and TARGET_P9_VECTOR has special cased different modes now:
>   if (TARGET_P8_VECTOR && (mode == SImode))
> return 1;
> 
>   if (TARGET_P9_VECTOR && (mode == QImode || mode == HImode))
> return 1;
> Which I suspect that means rs6000_modes_tieable_p should return true 
> for SImode and SFmode if TARGET_P8_VECTOR is true. Likewise for 
> TARGET_P9_VECTOR and SFmode and QImode/HImode too.

It means that older CPUs do not have as many instructions to do scalar integer 
operations in vector registers, making it (almost) always a losing proposition 
to put scalar integers there.  On newer CPUs it is not quite as bad, there is a 
full(er) complement of instructions to do such things in vector regs, just a 
bit slower than on GPRs.

But yeah we might need to fix hard_regno_mode_ok if we change tieable.


Segher


RE: [PATCH 2/4] Initial Emeraldrapids Support

2023-01-03 Thread Hu, Lin1 via Gcc-patches
"PATCH 2 Initial Emeraldrapids Support" aims to support Emeraldrapids for GCC. 
The omission of its description was my mistake.

-Original Message-
From: Liu, Hongtao  
Sent: Tuesday, January 3, 2023 4:48 PM
To: Hu, Lin1 ; gcc-patches@gcc.gnu.org
Cc: ubiz...@gmail.com
Subject: RE: [PATCH 2/4] Initial Emeraldrapids Support

There are actually only two patches, not four, and the subject *Patch 2/4* 
should be a typo.

> -Original Message-----
> From: Hu, Lin1 
> Sent: Tuesday, January 3, 2023 4:37 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Liu, Hongtao ; ubiz...@gmail.com
> Subject: [PATCH 2/4] Initial Emeraldrapids Support
> 
> gcc/ChangeLog:
> 
>   * common/config/i386/cpuinfo.h (get_intel_cpu): Handle Emeraldrapids.
>   * common/config/i386/i386-common.cc: Add Emeraldrapids.
> ---
>  gcc/common/config/i386/cpuinfo.h  | 2 ++
>  gcc/common/config/i386/i386-common.cc | 2 ++
>  2 files changed, 4 insertions(+)
> 
> diff --git a/gcc/common/config/i386/cpuinfo.h
> b/gcc/common/config/i386/cpuinfo.h
> index bde231c07ee..3729b0f14a5 100644
> --- a/gcc/common/config/i386/cpuinfo.h
> +++ b/gcc/common/config/i386/cpuinfo.h
> @@ -551,6 +551,8 @@ get_intel_cpu (struct __processor_model *cpu_model,
>break;
>  case 0x8f:
>/* Sapphire Rapids.  */
> +case 0xcf:
> +  /* Emerald Rapids.  */
>cpu = "sapphirerapids";
>CHECK___builtin_cpu_is ("corei7");
>CHECK___builtin_cpu_is ("sapphirerapids"); diff --git 
> a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386- 
> common.cc index 7751265aff4..026926d8b41 100644
> --- a/gcc/common/config/i386/i386-common.cc
> +++ b/gcc/common/config/i386/i386-common.cc
> @@ -2465,6 +2465,8 @@ const pta processor_alias_table[] =
>  M_CPU_SUBTYPE (INTEL_COREI7_COOPERLAKE), P_PROC_AVX512F},
>{"sapphirerapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, 
> PTA_SAPPHIRERAPIDS,
>  M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F},
> +  {"emeraldrapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL,
> PTA_SAPPHIRERAPIDS,
> +M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F},
>{"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
>  M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
>{"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
> --
> 2.18.2



[PATCH 2/4] Initial Emeraldrapids Support

2023-01-03 Thread Hu, Lin1 via Gcc-patches
gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_intel_cpu): Handle Emeraldrapids.
* common/config/i386/i386-common.cc: Add Emeraldrapids.
---
 gcc/common/config/i386/cpuinfo.h  | 2 ++
 gcc/common/config/i386/i386-common.cc | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index bde231c07ee..3729b0f14a5 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -551,6 +551,8 @@ get_intel_cpu (struct __processor_model *cpu_model,
   break;
 case 0x8f:
   /* Sapphire Rapids.  */
+case 0xcf:
+  /* Emerald Rapids.  */
   cpu = "sapphirerapids";
   CHECK___builtin_cpu_is ("corei7");
   CHECK___builtin_cpu_is ("sapphirerapids");
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 7751265aff4..026926d8b41 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2465,6 +2465,8 @@ const pta processor_alias_table[] =
 M_CPU_SUBTYPE (INTEL_COREI7_COOPERLAKE), P_PROC_AVX512F},
   {"sapphirerapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, PTA_SAPPHIRERAPIDS,
 M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F},
+  {"emeraldrapids", PROCESSOR_SAPPHIRERAPIDS, CPU_HASWELL, PTA_SAPPHIRERAPIDS,
+M_CPU_SUBTYPE (INTEL_COREI7_SAPPHIRERAPIDS), P_PROC_AVX512F},
   {"alderlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
 M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
   {"raptorlake", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
-- 
2.18.2



[PATCH 1/4] i386: Remove Meteorlake's family_model

2023-01-03 Thread Hu, Lin1 via Gcc-patches
Hi all,

This patch aims to modify meteorlake's family_model.

Regtested on x86_64-pc-linux-gnu. Ok for trunk?

BRs,
Lin

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_intel_cpu): Remove case 0xb5
for meteorlake.
---
 gcc/common/config/i386/cpuinfo.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 099a02467e6..bde231c07ee 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -540,7 +540,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
   /* Alder Lake.  */
 case 0xb7:
   /* Raptor Lake.  */
-case 0xb5:
 case 0xaa:
 case 0xac:
   /* Meteor Lake.  */
-- 
2.18.2



[PATCH] testsuite: Fix up avx256-unaligned-store-3.c test.

2022-09-25 Thread Hu, Lin1 via Gcc-patches
Hi all,

This patch aims to fix a problem where the avx256-unaligned-store-3.c test reports 
two unexpected failures under "-march=cascadelake".

Regtested on x86_64-pc-linux-gnu. Ok for trunk?

BRs,
Lin

gcc/testsuite/ChangeLog:

PR target/94962
* gcc.target/i386/avx256-unaligned-store-3.c: Add -mno-avx512f
---
 gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c 
b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
index f909099bcb1..67635fb9e66 100644
--- a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store -mtune=generic 
-fno-common" } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store -mtune=generic 
-fno-common -mno-avx512f" } */
 
 #define N 1024
 
-- 
2.18.2



RE: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))

2022-09-22 Thread Hu, Lin1 via Gcc-patches
Hi, Hongtao

I have modified this patch and regtested it on x86_64-pc-linux-gnu.

BRs.
Lin

-Original Message-
From: Hongtao Liu  
Sent: Friday, September 23, 2022 9:48 AM
To: Hu, Lin1 
Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
Subject: Re: [PATCH] i386: Optimize code generation of 
__mm256_zextsi128_si256(__mm_set1_epi8(-1))

On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches 
 wrote:
>
> Hi all,
>
> This patch aims to optimize code generation of 
> __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of 
> instructions required to achieve the final result.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
>
> BRs,
> Lin
>
> gcc/ChangeLog:
>
> PR target/94962
> * config/i386/constraints.md (BH): New define_constraint.
> * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when 
> operand matches new predicate.
> (standard_sse_constant_opcode): Add new alternative branch to return 
> "vpcmpeqd".
> * config/i386/predicates.md 
> (vector_all_ones_zero_extend_half_operand): New define_predicate.
> (vector_all_ones_zero_extend_quarter_operand): Ditto.
> * config/i386/sse.md: Add constraint to insn "mov_internal".
(mov_internal): Add new constraint BH.
Put the insn name at first.
>
> gcc/testsuite/ChangeLog:
>
> PR target/94962
> * gcc.target/i386/avx256-unaligned-load-1.c: Modify test.
> * gcc.target/i386/avx256-unaligned-store-1.c: Ditto.
> * gcc.target/i386/avx256-unaligned-store-2.c: Ditto.
> * gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
> * gcc.target/i386/pr94962-1.c: New test.
> * gcc.target/i386/pr94962-2.c: Ditto.
> * gcc.target/i386/pr94962-3.c: Ditto.
> * gcc.target/i386/pr94962-4.c: Ditto.
> ---
>  gcc/config/i386/constraints.md|  8 +++
>  gcc/config/i386/i386.cc   | 26 +++-
>  gcc/config/i386/predicates.md | 49 ++
>  gcc/config/i386/sse.md|  8 +--
>  .../gcc.target/i386/avx256-unaligned-load-1.c |  4 +-
>  .../i386/avx256-unaligned-store-1.c   |  4 +-
>  .../i386/avx256-unaligned-store-2.c   |  4 +-
>  .../i386/avx256-unaligned-store-3.c   |  4 +-
>  gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 
>  gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 +
>  gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++
>  gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++
>  12 files changed, 235 insertions(+), 13 deletions(-)  create mode 
> 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c
>
> diff --git a/gcc/config/i386/constraints.md 
> b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644
> --- a/gcc/config/i386/constraints.md
> +++ b/gcc/config/i386/constraints.md
> @@ -168,6 +168,9 @@
>  ;;  z  Constant call address operand.
>  ;;  C  Integer SSE constant with all bits set operand.
>  ;;  F  Floating-point SSE constant with all bits set operand.
> +;;  H  Integer SSE constant that is 128/256bit all ones
> +;; and zero-extand to 256/512bit, or 128bit all ones
> +;; and zero-extend to 512bit.
>  ;;  M  x86-64 memory operand.
>
>  (define_constraint "Bf"
> @@ -233,6 +236,11 @@
>(and (match_test "TARGET_SSE")
> (match_operand 0 "float_vector_all_ones_operand")))
>
> +(define_constraint "BH"
> +  "@internal integer constant with last half/quarter bits set operand."
> +  (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand")
> +   (match_operand 0 
> +"vector_all_ones_zero_extend_quarter_operand")))
> +
>  ;; NB: Similar to 'm', but don't use define_memory_constraint on 
> x86-64  ;; to prevent LRA from converting the operand to the form '(mem (reg 
> X))'
>  ;; where X is a base register.
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
> dadf453d6c0..ca799da5d7e 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx)
>XFmode);  }
>
> -/* Return 1 if X is all bits 0 and 2 if X is all bits 1
> +/* Return 1 if X is all bits 0, 2 if X is all bits 1
> +   and 3 if X is all bits 1 with zero extend
> in supported SSE/AVX vector mode.  */
>
>  int
> @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode)
>   

[PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))

2022-09-22 Thread Hu, Lin1 via Gcc-patches
Hi all,

This patch aims to optimize code generation of 
__mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of instructions 
required to achieve the final result.

Regtested on x86_64-pc-linux-gnu. Ok for trunk?

BRs,
Lin

gcc/ChangeLog:

PR target/94962
* config/i386/constraints.md (BH): New define_constraint.
* config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when 
operand matches new predicate.
(standard_sse_constant_opcode): Add new alternative branch to return 
"vpcmpeqd".
* config/i386/predicates.md (vector_all_ones_zero_extend_half_operand): 
New define_predicate.
(vector_all_ones_zero_extend_quarter_operand): Ditto.
* config/i386/sse.md: Add constraint to insn "mov_internal".

gcc/testsuite/ChangeLog:

PR target/94962
* gcc.target/i386/avx256-unaligned-load-1.c: Modify test.
* gcc.target/i386/avx256-unaligned-store-1.c: Ditto.
* gcc.target/i386/avx256-unaligned-store-2.c: Ditto.
* gcc.target/i386/avx256-unaligned-store-3.c: Ditto.
* gcc.target/i386/pr94962-1.c: New test.
* gcc.target/i386/pr94962-2.c: Ditto.
* gcc.target/i386/pr94962-3.c: Ditto.
* gcc.target/i386/pr94962-4.c: Ditto.
---
 gcc/config/i386/constraints.md|  8 +++
 gcc/config/i386/i386.cc   | 26 +++-
 gcc/config/i386/predicates.md | 49 ++
 gcc/config/i386/sse.md|  8 +--
 .../gcc.target/i386/avx256-unaligned-load-1.c |  4 +-
 .../i386/avx256-unaligned-store-1.c   |  4 +-
 .../i386/avx256-unaligned-store-2.c   |  4 +-
 .../i386/avx256-unaligned-store-3.c   |  4 +-
 gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 
 gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 +
 gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++
 gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++
 12 files changed, 235 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7361687632f..95b2b142d41 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -168,6 +168,9 @@
 ;;  z  Constant call address operand.
 ;;  C  Integer SSE constant with all bits set operand.
 ;;  F  Floating-point SSE constant with all bits set operand.
+;;  H  Integer SSE constant that is 128/256bit all ones
+;; zero-extended to 256/512bit, or 128bit all ones
+;; zero-extended to 512bit.
 ;;  M  x86-64 memory operand.
 
 (define_constraint "Bf"
@@ -233,6 +236,11 @@
   (and (match_test "TARGET_SSE")
(match_operand 0 "float_vector_all_ones_operand")))
 
+(define_constraint "BH"
+  "@internal integer constant with last half/quarter bits set operand."
+  (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand")
+   (match_operand 0 "vector_all_ones_zero_extend_quarter_operand")))
+
 ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64
 ;; to prevent LRA from converting the operand to the form '(mem (reg X))'
 ;; where X is a base register.
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dadf453d6c0..ca799da5d7e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx)
   XFmode);
 }
 
-/* Return 1 if X is all bits 0 and 2 if X is all bits 1
+/* Return 1 if X is all bits 0, 2 if X is all bits 1
+   and 3 if X is all bits 1 zero-extended from half/quarter width
in supported SSE/AVX vector mode.  */
 
 int
@@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode)
}
 }
 
+  if (vector_all_ones_zero_extend_half_operand (x, mode)
+  || vector_all_ones_zero_extend_quarter_operand (x, mode))
+return 3;
+
   return 0;
 }
 
@@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
  gcc_unreachable ();
}
}
+  else if (vector_all_ones_zero_extend_half_operand (x, mode))
+{
+  if (GET_MODE_SIZE (mode) == 64)
+   {
+ gcc_assert (TARGET_AVX512F);
+ return "vpcmpeqd \t %t0, %t0, %t0";
+   }
+  else if (GET_MODE_SIZE (mode) == 32)
+   {
+ gcc_assert (TARGET_AVX);
+ return "vpcmpeqd \t %x0, %x0, %x0";
+   }
+  gcc_unreachable ();
+}
+  else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
+{
+  gcc_assert (TARGET_AVX512F);
+  return "vpcmpeqd \t %x0, %x0, %x0";
+}
 
   gcc_unreachable ();
 }
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 4f16bb748b5..655eabf793b 100644
--- a/gcc/config/i386/predicates.md
+++