Re: [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-05-23 Thread Hongtao Liu
CC for review.

On Tue, May 21, 2024 at 1:12 PM liuhongt  wrote:
>
> When the mask is ((1 << (prec - imm)) - 1), which is used to clear the upper bits
> of A, then it can be simplified to LSHIFTRT.
>
> i.e. simplify
> (and:v8hi
>   (ashiftrt:v8hi A 8)
>   (const_vector 0xff x8))
> to
> (lshiftrt:v8hi A 8)
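For illustration only (not part of the patch), a minimal scalar sketch of the
same equivalence, assuming 16-bit elements; the function name is made up:

    /* (x >> 8) & 0xff computes the same value as (unsigned short) x >> 8,
       because the mask drops exactly the bits filled in by the arithmetic
       shift's sign copies.  */
    unsigned short
    sra_then_mask (short x)
    {
      return (x >> 8) & 0xff;
    }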
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/114428
> * simplify-rtx.cc
> (simplify_context::simplify_binary_operation_1):
> Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
> specific mask.
> ---
>  gcc/simplify-rtx.cc | 25 +
>  1 file changed, 25 insertions(+)
>
> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
> index 53f54d1d392..6c91409200e 100644
> --- a/gcc/simplify-rtx.cc
> +++ b/gcc/simplify-rtx.cc
> @@ -4021,6 +4021,31 @@ simplify_context::simplify_binary_operation_1 
> (rtx_code code,
> return tem;
> }
>
> +  /* (and:v4si
> +  (ashiftrt:v4si A 16)
> > +  (const_vector: 0xffff x4))
> +is just (lshiftrt:v4si A 16).  */
> +  if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
> + && (CONST_INT_P (XEXP (op0, 1))
> + || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
> > + && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))))
> + && GET_CODE (op1) == CONST_VECTOR
> + && CONST_VECTOR_DUPLICATE_P (op1))
> +   {
> + unsigned HOST_WIDE_INT shift_count
> +   = (CONST_INT_P (XEXP (op0, 1))
> +  ? UINTVAL (XEXP (op0, 1))
> +  : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
> + unsigned HOST_WIDE_INT inner_prec
> +   = GET_MODE_PRECISION (GET_MODE_INNER (mode));
> +
> + /* Avoid UD shift count.  */
> + if (shift_count < inner_prec
> + && (UINTVAL (XVECEXP (op1, 0, 0))
> + == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
> > +   return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
> +   }
> +
>tem = simplify_byte_swapping_operation (code, mode, op0, op1);
>if (tem)
> return tem;
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-23 Thread Hongtao Liu
On Thu, May 23, 2024 at 3:17 PM Hu, Lin1  wrote:
>
> > -Original Message-
> > From: Hongtao Liu 
> > Sent: Thursday, May 23, 2024 2:42 PM
> > To: Hu, Lin1 
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > ubiz...@gmail.com; rguent...@suse.de
> > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> >
> > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1  wrote:
> > >
> > > gcc/ChangeLog:
> > >
> > > PR 107432
> > > * config/i386/i386-expand.cc 
> > > (ix86_expand_trunc_with_avx2_noavx512f):
> > > New function to generate a series of suitable insns.
> > > * config/i386/i386-protos.h 
> > > (ix86_expand_trunc_with_avx2_noavx512f):
> > > Define new function.
> > > * config/i386/sse.md: Extend trunc2 for x86-64-v3.
> > I have some concerns about this patch because of
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069; let's hold off on this
> > patch.
>
> OK, maybe we need to modify ix86_expand_vec_perm_const_1 to let it emit
> better code, maybe like clang does (https://godbolt.org/z/rTKPq9oj5).
> Or we can disable some of the optimizations via vpermq. In pr107432-8.c,
> there are only 5 tests that use vpermq.
On second thought, we may go ahead with this patch: for PR115069 there is
an alternative that avoids the cross-lane truncation, but for this one there
is no alternative. Although a cross-lane permutation is not very efficient,
it should still be better than the original code.
>
> BRs,
> Lin
>
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR 107432
> > > * gcc.target/i386/pr107432-8.c: New test.
> > > * gcc.target/i386/pr107432-9.c: Ditto.
> > > * gcc.target/i386/pr92645-4.c: Modify test.
> > > ---
> > >  gcc/config/i386/i386-expand.cc |  47 +++-
> > >  gcc/config/i386/i386-protos.h  |   3 +
> > >  gcc/config/i386/sse.md |  87 +++
> > >  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
> > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
> > >  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
> > >  6 files changed, 304 insertions(+), 29 deletions(-)  create mode
> > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc
> > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
> > >emit_insn (gen_xorv4si3 (value, value, large));  }
> > >
> > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> > > -machine_mode mode, rtx 
> > > target,
> > > -rtx var, int one_var);
> > > -
> > >  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> > > Expects the 64-bit DImode to be supplied in a pair of integral
> > > registers.  Requires SSE2; will use SSE3 if available.  For
> > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool
> > mmx_ok, machine_mode mode,
> > > whose ONE_VAR element is VAR, and other elements are zero.  Return 
> > > true
> > > if successful.  */
> > >
> > > -static bool
> > > +bool
> > >  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
> > >  rtx target, rtx var, int one_var)
> > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
> > >return ret;
> > >  }
> > >
> > > > +/* Truncate a vector to a narrower vector, like v4di -> v4si.  */
> > > +
> > > +bool
> > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) {
> > > +  machine_mode out_mode = GET_MODE (output);
> > > +  machine_mode in_mode = GET_MODE (input);
> > > +  int len = GET_MODE_SIZE (in_mode);
> > > +  gcc_assert (len == 16 || len == 32);
> > > +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> > > +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> > > +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> > > +
> > > +  struct expand_vec_

Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-23 Thread Hongtao Liu
On Thu, May 23, 2024 at 2:38 PM Hu, Lin1  wrote:
>
> gcc/ChangeLog:
>
> PR 107432
> * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
> New function to generate a series of suitable insns.
> * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
> Define new function.
> * config/i386/sse.md: Extend trunc2 for x86-64-v3.
I have some concerns about this patch because of
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069; let's hold off on
this patch.
> gcc/testsuite/ChangeLog:
>
> PR 107432
> * gcc.target/i386/pr107432-8.c: New test.
> * gcc.target/i386/pr107432-9.c: Ditto.
> * gcc.target/i386/pr92645-4.c: Modify test.
> ---
>  gcc/config/i386/i386-expand.cc |  47 +++-
>  gcc/config/i386/i386-protos.h  |   3 +
>  gcc/config/i386/sse.md |  87 +++
>  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
>  gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
>  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
>  6 files changed, 304 insertions(+), 29 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..bca8b85c9d1 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
>emit_insn (gen_xorv4si3 (value, value, large));
>  }
>
> -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> -machine_mode mode, rtx 
> target,
> -rtx var, int one_var);
> -
>  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> Expects the 64-bit DImode to be supplied in a pair of integral
> registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
> @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
> machine_mode mode,
> whose ONE_VAR element is VAR, and other elements are zero.  Return true
> if successful.  */
>
> -static bool
> +bool
>  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
>  rtx target, rtx var, int one_var)
>  {
> @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
>return ret;
>  }
>
> +/* Truncate a vector to a narrower vector, like v4di -> v4si.  */
> +
> +bool
> +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input)
> +{
> +  machine_mode out_mode = GET_MODE (output);
> +  machine_mode in_mode = GET_MODE (input);
> +  int len = GET_MODE_SIZE (in_mode);
> +  gcc_assert (len == 16 || len == 32);
> +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> +
> +  struct expand_vec_perm_d d;
> +  d.target = gen_reg_rtx (cvt_mode);
> +  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode);
> +  d.op1 = d.op0;
> +  d.vmode = cvt_mode;
> +  d.nelt = len;
> +  d.testing_p = false;
> +  d.one_operand_p = true;
> +
> +  /* Init perm. Put the needed bits of input in order and
> + fill the rest of bits by default.  */
> +  int tot = 0;
> +  for (int i = 0; i < len; ++i)
> +{
> +  d.perm[i] = i;
> +  if ((i % in_innersize) < out_innersize)
> +   d.perm[tot++] = i;
> +}
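As an aside, a standalone model of the perm[] initialisation above (not part
of the patch; the constants are the ones a v4di -> v4si truncation would give,
i.e. len = 32, in_innersize = 8, out_innersize = 4):

    #include <stdio.h>

    int
    main (void)
    {
      int len = 32, in_innersize = 8, out_innersize = 4;
      unsigned char perm[32];
      int tot = 0;
      for (int i = 0; i < len; ++i)
        {
          perm[i] = i;
          if ((i % in_innersize) < out_innersize)
            perm[tot++] = i;
        }
      /* The first 16 entries select the low 4 bytes of each 8-byte element:
         0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27; the remaining entries
         keep the identity filler 16..31.  */
      for (int i = 0; i < len; ++i)
        printf ("%d ", perm[i]);
      printf ("\n");
      return 0;
    }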
> +
> +  if (ix86_expand_vec_perm_const_1 (&d))
> +{
> +  emit_move_insn (output, gen_lowpart (out_mode, d.target));
> +  return true;
> +}
> +
> +  return false;
> +}
> +
>  #include "gt-i386-expand.h"
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index dbc861fb1ea..ac29fb34028 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, 
> rtx, enum rtx_code,
>  extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
>   bool, rtx_code_label *);
>  extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
> +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
>  extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
>  extern bool ix86_memtag_can_tag_addresses (void);
>
> @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
>  extern void ix86_expand_sse2_abs (rtx, rtx);
>  extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
>rtx);
> +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx,
> +rtx, int);
>  extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
>
>  /* In i386-c.cc  */
> diff --git 

Re: [PATCH] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-05-22 Thread Hongtao Liu
On Wed, May 22, 2024 at 3:59 PM Jakub Jelinek  wrote:
>
> On Wed, May 22, 2024 at 09:46:41AM +0200, Richard Biener wrote:
> > On Wed, May 22, 2024 at 3:58 AM liuhongt  wrote:
> > >
> > > According to the IEEE standard, for conversions from floating point to
> > > integer: when a NaN or infinite operand cannot be represented in the
> > > destination format and this cannot otherwise be indicated, the invalid
> > > operation exception shall be signaled; when a numeric operand would
> > > convert to an integer outside the range of the destination format, the
> > > invalid operation exception shall be signaled if this situation cannot
> > > otherwise be indicated.
> > >
> > > The patch prevents simplification of the conversion from floating point
> > > to integer for NaN/INF/out-of-range constants when flag_trapping_math.
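As a minimal sketch of the behaviour being preserved (illustrative only, not
taken from the patch submission): with -ftrapping-math the conversion below
may not be folded away at compile time, so FE_INVALID is still raised at run
time.

    #include <fenv.h>
    #include <stdio.h>

    int
    main (void)
    {
      feclearexcept (FE_ALL_EXCEPT);
      double d = __builtin_inf ();
      /* Out-of-range float-to-int conversion: IEEE invalid operation.
         The resulting integer value is unspecified.  */
      int i = (int) d;
      printf ("%d: FE_INVALID %s\n", i,
              fetestexcept (FE_INVALID) ? "raised" : "not raised");
      return 0;
    }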
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> > > Ok for trunk?
> >
> > OK if there are no further comments today.
>
> As I wrote in the PR, I don't think this is the right fix for the PR,
> the simplify-rtx.cc change is the right thing to do, the C standard
> in F.4 says that the out of range conversions to integers should raise
> exceptions, but still says that the resulting value in those cases is
> unspecified.
> So, for the C part we should verify that with -ftrapping-math we don't
> constant fold it and cover it both by pure C and perhaps backend specific
> testcases which just search asm for the conversion instructions
> or even runtime test which tests that the exceptions are triggered,
> verify that we don't fold it either during GIMPLE opts or RTL opts
> (dunno whether they can be folded in e.g. C constant initializers or not).
>
There are lots of Wconversion warnings that rely on GIMPLE folding, so there
are lots of new failures after the patch below.
Should we also add -fno-trapping-math to those testcases?

-  t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  /* According to IEEE standard, for conversions from floating point to
+ integer. When a NaN or infinite operand cannot be represented in the
+ destination format and this cannot otherwise be indicated, the invalid
+ operation exception shall be signaled. When a numeric operand would
+ convert to an integer outside the range of the destination format, the
+ invalid operation exception shall be signaled if this situation cannot
+ otherwise be indicated.  */
+  if (!flag_trapping_math || !overflow)
+t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  else
+t = NULL_TREE;
+
   return t;
 }


g++: c-c++-common/Wconversion-1.c  -std=gnu++14  (test for warnings, line 13)
g++: c-c++-common/Wconversion-1.c  -std=gnu++17  (test for warnings, line 13)
g++: c-c++-common/Wconversion-1.c  -std=gnu++20  (test for warnings, line 13)
g++: c-c++-common/Wconversion-1.c  -std=gnu++98  (test for warnings, line 13)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 26)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 30)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 34)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 39)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 43)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 47)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 51)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 55)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14  (test for warnings, line 59)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++14 execution test
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 26)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 30)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 34)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 39)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 43)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 47)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 51)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 55)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17  (test for warnings, line 59)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++17 execution test
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++20  (test for warnings, line 26)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++20  (test for warnings, line 30)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++20  (test for warnings, line 34)
g++: c-c++-common/dfp/convert-int-saturate.c  -std=c++20  (test for warnings, line 39)
g++: 

Re: [V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli.

2024-05-22 Thread Hongtao Liu
On Wed, May 22, 2024 at 1:07 PM liuhongt  wrote:
>
> >> Hard to find a default value satisfying all testcases.
> >> some require loop unroll with 7 insns increment, some don't want loop
> >> unroll w/ 5 insn increment.
> >> The original 2/3 reduction happened to meet all those testcases(or the
> >> testcases are constructed based on the old 2/3).
> >> Can we define the parameter as the size of the loop, below the size we
> >> still do the reduction, so the small loop can be unrolled?
>
> >Yeah, that's also a sensible possibility.  Does it work to have a parameter
> >for the unrolled body size?  Thus, amend the existing
> >--param max-completely-peeled-insns with a --param
> >max-completely-peeled-insns-nogrowth?
>
> Update V2:
> It's still hard to find a default value for the loop body size, so I moved
> the 2 / 3 reduction from estimated_unrolled_size to try_unroll_loop_completely.
> For the check that the body size shrinks, the 2 / 3 reduction is added, so
> small loops can still be unrolled.
> For the comparison of the body size against param_max_completely_peeled_insns,
> the 2 / 3 factor is applied conditionally for loop->inner || !cunrolli.
> This way the patch avoids GCC testsuite regressions and also prevents big
> inner loops from being completely unrolled at cunrolli.
The patch regressed arm-*-eabi for

FAIL: 3 regressions



regressions.sum:

=== gcc tests ===



Running gcc:gcc.dg/tree-ssa/tree-ssa.exp ...

FAIL: gcc.dg/tree-ssa/pr83403-1.c scan-tree-dump-times lim2 "Executing
store motion of" 10

FAIL: gcc.dg/tree-ssa/pr83403-2.c scan-tree-dump-times lim2 "Executing
store motion of" 10

=== gfortran tests ===



Running gfortran:gfortran.dg/dg.exp ...

FAIL: gfortran.dg/reassoc_4.f -O   scan-tree-dump-times reassoc1 "[0-9] \\* " 22

For 32-bit arm, estimate_num_insns_seq returns a larger size for loads/stores of double.

The loop in pr83403-1.c
 198Estimating sizes for loop 4
 199 BB: 6, after_exit: 0
 200  size:   2 if (m_23 != 10)
 201   Exit condition will be eliminated in peeled copies.
 202   Exit condition will be eliminated in last copy.
 203   Constant conditional.
 204 BB: 5, after_exit: 1
 205  size:   1 _5 = n_24 * 10;
 206  size:   1 _6 = _5 + m_23;
 207  size:   1 _7 = _6 * 8;
 208  size:   1 _8 = C_35 + _7;
 209  size:   2 _9 = *_8;
 210  size:   1 _10 = k_25 * 20;
 211  size:   1 _11 = _10 + m_23;
 212  size:   1 _12 = _11 * 8;
 213  size:   1 _13 = A_31 + _12;
 214  size:   2 _14 = *_13;
 215  size:   1 _15 = n_24 * 20;
 216  size:   1 _16 = _15 + k_25;
 217  size:   1 _17 = _16 * 8;
 218  size:   1 _18 = B_33 + _17;
 219  size:   2 _19 = *_18;
 220  size:   1 _20 = _14 * _19;
 221  size:   1 _21 = _9 + _20;
 222  size:   2 *_8 = _21;
 223  size:   1 m_40 = m_23 + 1;
 224   Induction variable computation will be folded away.
 225size: 25-3, last_iteration: 2-2
 226  Loop size: 25
 227  Estimated size after unrolling: 220

For aarch64 and x86, it's ok

 198Estimating sizes for loop 4
 199 BB: 6, after_exit: 0
 200  size:   2 if (m_27 != 10)
 201   Exit condition will be eliminated in peeled copies.
 202   Exit condition will be eliminated in last copy.
 203   Constant conditional.
 204 BB: 5, after_exit: 1
 205  size:   1 _6 = n_28 * 10;
 206  size:   1 _7 = _6 + m_27;
 207  size:   0 _8 = (long unsigned int) _7;
 208  size:   1 _9 = _8 * 8;
 209  size:   1 _10 = C_39 + _9;
 210  size:   1 _11 = *_10;
 211  size:   1 _12 = k_29 * 20;
 212  size:   1 _13 = _12 + m_27;
 213  size:   0 _14 = (long unsigned int) _13;
 214  size:   1 _15 = _14 * 8;
 215  size:   1 _16 = A_35 + _15;
 216  size:   1 _17 = *_16;
 217  size:   1 _18 = n_28 * 20;
 218  size:   1 _19 = _18 + k_29;
 219  size:   0 _20 = (long unsigned int) _19;
 220  size:   1 _21 = _20 * 8;
 221  size:   1 _22 = B_37 + _21;
 222  size:   1 _23 = *_22;
 223  size:   1 _24 = _17 * _23;
 224  size:   1 _25 = _11 + _24;
 225  size:   1 *_10 = _25;
 226  size:   1 m_44 = m_27 + 1;
 227   Induction variable computation will be folded away.
 228size: 21-3, last_iteration: 2-2
 229  Loop size: 21
 230  Estimated size after unrolling: 180
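Making the arithmetic explicit: without the 2/3 reduction the estimate is 220
insns on arm versus 180 on aarch64/x86.  Against the default --param
max-completely-peeled-insns=200, the arm loop is therefore no longer
completely unrolled (220 > 200), while it was under the old scaling
(220 * 2 / 3 = 146 <= 200); the 180-insn estimate passes the limit either
way, which is why only arm regresses.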

>
> --
>
> For the innermost loop, after complete unrolling it will most likely not be
> possible to reduce the body size to 2/3.  The current 2/3 reduction makes
> some of the larger loops completely unrolled during cunrolli, which then
> prevents them from being vectorized.  It also increases register pressure.
> The patch moves the 2/3 reduction from estimated_unrolled_size to
> try_unroll_loop_completely.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR tree-optimization/112325
> * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
> 2 / 3 loop body size reduction to ..
> (try_unroll_loop_completely): .. here, add it for the check of
> body size shrink, and the check of comparison against
> param_max_completely_peeled_insns when
> 

Re: [PATCH v2] i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW

2024-05-21 Thread Hongtao Liu
On Tue, May 21, 2024 at 3:14 PM Haochen Jiang  wrote:
>
> Hi all,
>
> This is the v2 patch to fix PR115069. The new testcase has passed.
>
> Changes in v2:
>   - Added a testcase.
>   - Change the comment for the early exit.
>
> Thx,
> Haochen
>
> Since vpermq is really slow, we should avoid using it for permutation
> when vpmovwb is not available (needs AVX512BW) for ix86_expand_vecop_qihi2
> and fall back to ix86_expand_vecop_qihi.
>
> gcc/ChangeLog:
>
> PR target/115069
> * config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
> Do not enable the optimization when AVX512BW is not enabled.
>
> gcc/testsuite/ChangeLog:
>
> PR target/115069
> * gcc.target/i386/pr115069.c: New.
> ---
>  gcc/config/i386/i386-expand.cc   |  7 +++
>  gcc/testsuite/gcc.target/i386/pr115069.c | 78 
>  2 files changed, 85 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr115069.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index a6132911e6a..f7939761879 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -24323,6 +24323,13 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx 
> dest, rtx op1, rtx op2)
>bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
>bool uns_p = code != ASHIFTRT;
>
> +  /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
> + generic permutation to merge the data back into the right place.  This
> + permutation results in VPERMQ, which is slow, so better fall back to
> + ix86_expand_vecop_qihi.  */
> +  if (!TARGET_AVX512BW)
> +return false;
> +
>if ((qimode == V16QImode && !TARGET_AVX2)
>|| (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
>/* There are no V64HImode instructions.  */
> diff --git a/gcc/testsuite/gcc.target/i386/pr115069.c 
> b/gcc/testsuite/gcc.target/i386/pr115069.c
> new file mode 100644
> index 000..c4b48b602ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr115069.c
> @@ -0,0 +1,78 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2" } */
> +/* { dg-final { scan-assembler-not "vpermq" } } */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +typedef int8_t  stress_vint8_t  __attribute__ ((vector_size (16)));
No need for such a big testcase,

typedef char v16qi __attribute__((vector_size(16)));
v16qi
foo (v16qi a, v16qi b)
{
return a * b;
}

should be enough, with -mavx2 -mno-avx512f
> +
> +#define OPS(a, b, c, s, v23, v3) \
> +do {   \
> +   a += b; \
> +   a |= b; \
> +   a -= b; \
> +   a &= ~b;\
> +   a *= c; \
> +   a = ~a; \
> +   a *= s; \
> +   a ^= c; \
> +   a <<= 1;\
> +   b >>= 1;\
> +   b += c; \
> +   a %= v23;   \
> +   c /= v3;\
> +   b = b ^ c;  \
> +   c = b ^ c;  \
> +   b = b ^ c;  \
> +} while (0)
> +
> +volatile uint8_t csum8_put;
> +
> +void stress_vecmath(void)
> +{
> +   const stress_vint8_t v23_8 = {
> +   0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
> +   0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17
> +   };
> +   const stress_vint8_t v3_8 = {
> +   0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
> +   0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
> +   };
> +   stress_vint8_t a8 = {
> +   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
> +   };
> +   stress_vint8_t b8 = {
> +   0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
> +   0x0f, 0x1e, 0x2d, 0x3c, 0x4b, 0x5a, 0x69, 0x78
> +   };
> +   stress_vint8_t c8 = {
> +   0x01, 0x02, 0x03, 0x02, 0x01, 0x02, 0x03, 0x02,
> +   0x03, 0x02, 0x01, 0x02, 0x03, 0x02, 0x01, 0x02
> +   };
> +   stress_vint8_t s8 = {
> +   0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02,
> +   0x01, 0x01, 0x02, 0x02, 0x01, 0x01, 0x02, 0x02,
> +   };
> +   const uint8_t csum8_val =  (uint8_t)0x1b;
> +   int i;
> +   uint8_t csum8;
> +
> +   for (i = 1000; i; i--) {
> +   OPS(a8, b8, c8, s8, v23_8, v3_8);
> +   OPS(a8, b8, c8, s8, v23_8, v3_8);
> +   OPS(a8, b8, c8, s8, v23_8, v3_8);
> +   OPS(a8, b8, c8, s8, v23_8, v3_8);
> +   OPS(a8, b8, c8, s8, v23_8, v3_8);
> +   OPS(a8, b8, c8, s8, v23_8, v3_8);
> +   }
> +
> +   csum8 = a8[0]  ^ a8[1]  ^ a8[2]  ^ a8[3]  ^
> +   a8[4]  ^ a8[5]  ^ a8[6]  ^ a8[7]  ^
> +   a8[8]  ^ a8[9]  ^ a8[10] ^ a8[11] ^
> +   a8[12] 

Re: [PATCH] i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW

2024-05-21 Thread Hongtao Liu
On Tue, May 21, 2024 at 2:16 PM Haochen Jiang  wrote:
>
> Hi all,
>
> Since vpermq is really slow, we should avoid using it when it is the only
> instruction that could be used for ix86_expand_vecop_qihi2.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu. Ok for trunk?
Please add a testcase for it.
>
> Thx,
> Haochen
>
> gcc/ChangeLog:
>
> PR target/115069
> * config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
> Do not enable the optimization when AVX512BW is not enabled.
> ---
>  gcc/config/i386/i386-expand.cc | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index a6132911e6a..f24c800bb4f 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -24323,6 +24323,11 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx 
> dest, rtx op1, rtx op2)
>bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
>bool uns_p = code != ASHIFTRT;
>
> +  /* vpermq is slow and we should not fall into the optimization when
> + it is the only instruction to be selected.  */
> +  if (!TARGET_AVX512BW)
> +return false;
> +
>if ((qimode == V16QImode && !TARGET_AVX2)
>|| (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
>/* There are no V64HImode instructions.  */
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.

2024-05-20 Thread Hongtao Liu
On Wed, May 15, 2024 at 5:24 PM Richard Biener
 wrote:
>
> On Wed, May 15, 2024 at 4:15 AM Hongtao Liu  wrote:
> >
> > On Mon, May 13, 2024 at 3:40 PM Richard Biener
> >  wrote:
> > >
> > > On Mon, May 13, 2024 at 4:29 AM liuhongt  wrote:
> > > >
> > > > As shown by the testcase in the PR, at -O3 cunrolli may prevent
> > > > vectorization of the innermost loop and increase register pressure.
> > > > The patch removes the 1/3 reduction of unr_insns for the innermost loop
> > > > for UL_ALL.  The ul != UL_ALL condition is needed since some small-loop
> > > > complete unrolling at -O2 relies on the reduction.
> > > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > No big impact for SPEC2017.
> > > > Ok for trunk?
> > >
> > > This removes the 1/3 reduction when unrolling a loop nest (the case I was
> > > concerned about).  Unrolling of a nest is by iterating in
> > > tree_unroll_loops_completely
> > > so the to be unrolled loop appears innermost.  So I think you need a new
> > > parameter on tree_unroll_loops_completely_1 indicating whether we're in 
> > > the
> > > first iteration (or whether to assume inner most loops will "simplify").
> > yes, it would be better.
> > >
> > > Few comments below
> > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR tree-optimization/112325
> > > > * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > > > new parameters: loop and ul, and remove unr_insns reduction
> > > > for innermost loop.
> > > > (try_unroll_loop_completely): Pass loop and ul to
> > > > estimated_unrolled_size.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * gcc.dg/tree-ssa/pr112325.c: New test.
> > > > * gcc.dg/vect/pr69783.c: Add extra option --param
> > > > max-completely-peeled-insns=300.
> > > > ---
> > > >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 
> > > >  gcc/testsuite/gcc.dg/vect/pr69783.c  |  2 +-
> > > >  gcc/tree-ssa-loop-ivcanon.cc | 16 +--
> > > >  3 files changed, 71 insertions(+), 4 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > >
> > > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c 
> > > > b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > new file mode 100644
> > > > index 000..14208b3e7f8
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > > > @@ -0,0 +1,57 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > > > +
> > > > +typedef unsigned short ggml_fp16_t;
> > > > +static float table_f32_f16[1 << 16];
> > > > +
> > > > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > > > +unsigned short s;
> > > > +__builtin_memcpy(&s, &f, sizeof(unsigned short));
> > > > +return table_f32_f16[s];
> > > > +}
> > > > +
> > > > +typedef struct {
> > > > +ggml_fp16_t d;
> > > > +ggml_fp16_t m;
> > > > +unsigned char qh[4];
> > > > +unsigned char qs[32 / 2];
> > > > +} block_q5_1;
> > > > +
> > > > +typedef struct {
> > > > +float d;
> > > > +float s;
> > > > +char qs[32];
> > > > +} block_q8_1;
> > > > +
> > > > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const 
> > > > void * restrict vx, const void * restrict vy) {
> > > > +const int qk = 32;
> > > > +const int nb = n / qk;
> > > > +
> > > > +const block_q5_1 * restrict x = vx;
> > > > +const block_q8_1 * restrict y = vy;
> > > > +
> > > > +float sumf = 0.0;
> > > > +
> > > > +for (int i = 0; i < nb; i++) {
> > > > +unsigned qh;
> > > > +__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > > > +
> > > > +int sumi = 0;
> > > > +
> > > > +for (int j = 0; j < qk/2; ++j) {
> > > > +const unsigned

Re: [PATCH 0/2] Align tight loops to solve cross cacheline issue

2024-05-19 Thread Hongtao Liu
On Wed, May 15, 2024 at 11:30 AM Jiang, Haochen  wrote:
>
> Also cc Honza and Richard since we touched generic tune.
>
> Thx,
> Haochen
>
> > -Original Message-
> > From: Haochen Jiang 
> > Sent: Wednesday, May 15, 2024 11:04 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Liu, Hongtao ; ubiz...@gmail.com
> > Subject: [PATCH 0/2] Align tight loops to solve cross cacheline issue
> >
> > Hi all,
> >
> > Recently, we have encountered several seemingly random performance
> > regressions in benchmarks from commit to commit.  They are caused by a
> > cross-cacheline issue for tight loops.
> >
> > We are trying to solve the issue by two patches. One is adjusting the loop
> > alignment for generic tune, the other is aligning tight and hot loops more
> > aggressively.
> >
> > For SPECINT, we get a 0.85% improvement overall in rates, under option
> > -O2 -march=x86-64-v3 -mtune=generic on Emerald Rapids.
> >
> > BenchMarks  EMR Rates
> > 500.perlbench_r -1.21%
> > 502.gcc_r   0.78%
> > 505.mcf_r   0.00%
> > 520.omnetpp_r   0.41%
> > 523.xalancbmk_r 1.33%
> > 525.x264_r  2.83%
> > 531.deepsjeng_r 1.11%
> > 541.leela_r 0.00%
> > 548.exchange2_r 2.36%
> > 557.xz_r0.98%
> > Geomean-int 0.85%
> >
> > The side effect is a 1.40% increase in code size.
> >
> > BenchMarks  EMR Codesize
> > 500.perlbench_r 0.70%
> > 502.gcc_r   0.67%
> > 505.mcf_r   3.26%
> > 520.omnetpp_r   0.31%
> > 523.xalancbmk_r 1.15%
> > 525.x264_r  1.11%
> > 531.deepsjeng_r 1.40%
> > 541.leela_r 1.31%
> > 548.exchange2_r 3.06%
> > 557.xz_r1.04%
> > Geomean-int 1.40%
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu.
> >
> > After this has been on trunk for a month, if nothing unexpected happens,
> > we plan to backport it to GCC 14.2.
> >
> > Thx,
> > Haochen
> >
> > Haochen Jiang (1):
> >   Adjust generic loop alignment from 16:11:8 to 16 for Intel processors
For this one, the current znver{1,2,3,4,5}_cost already sets the loop
alignment to 16, so I think it should be fine to use the same value in generic_cost.
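(A gloss on the notation, per the documented -falign-loops semantics: 16:11:8
requests a 16-byte loop-head alignment only when it costs fewer than 11 bytes
of padding, falling back to an 8-byte boundary otherwise, while a bare 16
always pads up to the next 16-byte boundary.)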
> >
> > liuhongt (1):
> >   Align tight loop without considering max skipping bytes.
For this one, although we have seen similar growth on AMD processors, it
would still be nice to have someone from AMD look at it to see if it is
what they need.
> >
> >  gcc/config/i386/i386.cc  | 148 ++-
> >  gcc/config/i386/i386.md  |  10 ++-
> >  gcc/config/i386/x86-tune-costs.h |   2 +-
> >  3 files changed, 154 insertions(+), 6 deletions(-)
> >
> > --
> > 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] i386: Rename sat_plusminus expanders to standard names [PR112600]

2024-05-19 Thread Hongtao Liu
On Fri, May 17, 2024 at 3:55 PM Uros Bizjak  wrote:
>
> Rename _3 expander to a standard ssadd,
> usadd, sssub and ussub name to enable corresponding optab expansion.
>
> Also add named expander for MMX modes.
LGTM.
>
> PR middle-end/112600
>
> gcc/ChangeLog:
>
> * config/i386/mmx.md (3): New expander.
> * config/i386/sse.md
> (_3):
> Rename expander to 3.
> (3): Update for rename.
> * config/i386/i386-builtin.def (BDESC): Update for rename.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr112600-1a.c: New test.
> * gcc.target/i386/pr112600-1b.c: New test.
>
> Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.
>
> Uros.



-- 
BR,
Hongtao


Re: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer

2024-05-16 Thread Hongtao Liu
> >
> Sorry to chime in, for x86 backend, we defined usdot_prodv16hi, and
> 2-way dot_prod operations can be generated
>
Here is the link: https://godbolt.org/z/hcWr64vx3.  x86 defines
udot_prodv16qi/udot_prod8hi, and both 2-way and 4-way dot_prod
instructions are generated.


-- 
BR,
Hongtao


Re: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer

2024-05-16 Thread Hongtao Liu
On Thu, May 16, 2024 at 10:40 PM Victor Do Nascimento
 wrote:
>
> From: Victor Do Nascimento 
>
> At present, the compiler offers the `{u|s|us}dot_prod_optab' direct
> optabs for dealing with vectorizable dot product code sequences.  The
> consequence of using a direct optab for this is that backend-pattern
> selection is only ever able to match against one datatype - Either
> that of the operands or of the accumulated value, never both.
>
> With the introduction of the 2-way (un)signed dot-product insn [1][2]
> in AArch64 SVE2, the existing direct opcode approach is no longer
> sufficient for full specification of all the possible dot product
> machine instructions to be matched to the code sequence; a dot product
> resulting in VNx4SI may result from either dot products on VNx16QI or
> VNx8HI values for the 4- and 2-way dot product operations, respectively.
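Put as a formula (illustrative restatement, not from the patch): the 2-way
form accumulates pairs, acc[i] += x[2*i] * y[2*i] + x[2*i+1] * y[2*i+1], so
16-bit inputs feed a 32-bit accumulator, while the existing 4-way form
accumulates four 8-bit products per 32-bit lane.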
>
> This means that the following example fails autovectorization:
>
> uint32_t foo(int n, uint16_t* data) {
>   uint32_t sum = 0;
>   for (int i=0; i<n; ++i) {
>     sum += data[i] * data[i];
>   }
>   return sum;
> }
>
Sorry to chime in: for the x86 backend we defined usdot_prodv16hi, and
2-way dot_prod operations can be generated.

> To remedy the issue a new optab is added, tentatively named
> `udot_prod_twoway_optab', whose selection is dependent upon checking
> of both input and output types involved in the operation.
>
> In order to minimize changes to the existing codebase,
> `optab_for_tree_code' is renamed `optab_for_tree_code_1' and a new
> argument is added to its signature - `const_tree otype', allowing type
> information to be specified for both input and output types.  The
> existing interface is retained by defining a new `optab_for_tree_code',
> which serves as a shim to `optab_for_tree_code_1', passing old
> parameters as-is and setting the new `optype' argument to `NULL_TREE'.
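A minimal sketch of that shim (the parameter list is assumed from the
existing optab_for_tree_code interface; only the NULL_TREE forwarding is
taken from the description above):

    optab
    optab_for_tree_code (enum tree_code code, const_tree type,
                         enum optab_subtype subtype)
    {
      /* Forward to the renamed worker with no output type specified.  */
      return optab_for_tree_code_1 (code, type, subtype, NULL_TREE);
    }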
>
> For DOT_PROD_EXPR tree codes, we can call `optab_for_tree_code_1'
> directly, passing it both types, adding the internal logic to the
> function to distinguish between competing optabs.
>
> Finally, necessary changes are made to `expand_widen_pattern_expr' to
> ensure the new icode can be correctly selected, given the new optab.
>
> [1] 
> https://developer.arm.com/documentation/ddi0602/2024-03/SVE-Instructions/UDOT--2-way--vectors---Unsigned-integer-dot-product-
> [2] 
> https://developer.arm.com/documentation/ddi0602/2024-03/SVE-Instructions/SDOT--2-way--vectors---Signed-integer-dot-product-
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-sve2.md (@aarch64_sve_dotvnx4sivnx8hi):
> renamed to `dot_prod_twoway_vnx8hi'.
> * config/aarch64/aarch64-sve-builtins-base.cc (svdot_impl.expand):
> update icodes used in line with above rename.
> * optabs-tree.cc (optab_for_tree_code_1): Renamed
> `optab_for_tree_code' and added new argument.
> (optab_for_tree_code): Now a call to `optab_for_tree_code_1'.
> * optabs-tree.h (optab_for_tree_code_1): New.
> * optabs.cc (expand_widen_pattern_expr): Expand support for
> DOT_PROD_EXPR patterns.
> * optabs.def (udot_prod_twoway_optab): New.
> (sdot_prod_twoway_optab): Likewise.
> * tree-vect-patterns.cc (vect_supportable_direct_optab_p): Add
> support for misc optabs that use two modes.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-dotprod-twoway.c: New.
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc  |  4 ++--
>  gcc/config/aarch64/aarch64-sve2.md|  2 +-
>  gcc/optabs-tree.cc| 23 --
>  gcc/optabs-tree.h |  2 ++
>  gcc/optabs.cc |  2 +-
>  gcc/optabs.def|  2 ++
>  .../gcc.dg/vect/vect-dotprod-twoway.c | 24 +++
>  gcc/tree-vect-patterns.cc |  2 +-
>  8 files changed, 54 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 0d2edf3f19e..e457db09f66 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -764,8 +764,8 @@ public:
>icode = (e.type_suffix (0).float_p
>? CODE_FOR_aarch64_sve_fdotvnx4sfvnx8hf
>: e.type_suffix (0).unsigned_p
> -  ? CODE_FOR_aarch64_sve_udotvnx4sivnx8hi
> -  : CODE_FOR_aarch64_sve_sdotvnx4sivnx8hi);
> +  ? CODE_FOR_udot_prod_twoway_vnx8hi
> +  : CODE_FOR_sdot_prod_twoway_vnx8hi);
>  return e.use_unpred_insn (icode);
>}
>  };
> diff --git a/gcc/config/aarch64/aarch64-sve2.md 
> b/gcc/config/aarch64/aarch64-sve2.md
> index 934e57055d3..5677de7108d 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -2021,7 +2021,7 @@ 

Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.

2024-05-15 Thread Hongtao Liu
C  -std=gnu++14 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++17 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++20 note (test for warnings, line 66)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 LP64 note (test for warnings, line 56)
> > g++: g++.dg/warn/Warray-bounds-20.C  -std=gnu++98 note (test for warnings, line 66)
>
> This seems to expect unrolling for an init loop rolling 1 times.  I don't
> see 1/3 of the stmts vanishing but it's definitely an interesting corner
> case.  That's why I was thinking of maybe adding a --param specifying
> an absolute growth we consider "no growth" - but of course that's
> ugly as well but it would cover these small loops.
>
> How do the sizes play out here after your change?  Before it's
>
> size: 13-3, last_iteration: 2-2
>   Loop size: 13
>   Estimated size after unrolling: 13
After:
size: 13-3, last_iteration: 2-2
  Loop size: 13
  Estimated size after unrolling: 20
Not unrolling loop 1: size would grow.
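Spelling that out: the old code scaled the raw 20-insn estimate by 2/3 down
to 13, which does not exceed the loop size of 13, so under UL_NO_GROWTH the
loop used to be unrolled; the unscaled estimate of 20 now counts as growth.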

>
> and the init is quite complex with virtual pointer inits.  We do have
>
>   size:   1 _14 = _5 + -1;
>Induction variable computation will be folded away.
>   size:   1 _15 = _4 + 40;
>  BB: 3, after_exit: 1
>
> where we don't realize the + 40 of _15 will be folded into the dereferences
> but that would only subtract 1.
>
>   size:   3 C::C (_23,   [(void *)&_ZTT2D1 + 48B]);
>
> that's the biggest cost.
>
> To diagnose the array bound issue we rely on early unrolling since we avoid
> -Warray-bounds after late unrolling due to false positives.
>
> This is definitely not an unrolling that preserves code size.
>
> > gcc: gcc.dg/Warray-bounds-68.c  (test for warnings, line 18)
> >
> > gcc: gcc.dg/graphite/interchange-8.c execution test
>
> An execute fail is bad ... can we avoid this (but file a bugreport!) when
It's PR115101
> placing #pragma GCC unroll before the innermost loop?  We should
> probably honor that in early unrolling (not sure if we do).
>
> > gcc: gcc.dg/tree-prof/update-cunroll-2.c scan-tree-dump-not optimized
> > "Invalid sum"
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "Last
> > iteration exit edge was proved true."
> >
> > gcc: gcc.dg/tree-ssa/cunroll-1.c scan-tree-dump cunrolli "loop with 2
> > iterations completely unrolled"
>
> again the current estimate is the same before/after unrolling, here
> we expect to retain one compare & branch.
>
> > gcc: gcc.dg/tree-ssa/dump-6.c scan-tree-dump store-merging "MEM
> >  \\[\\(char \\*\\)\\] = "
> >
> > gcc: gcc.dg/tree-ssa/loop-36.c scan-tree-dump-not dce3 "c.array"
>
> again the 2/3 scaling is difficult to warrant.  The goal of the early 
> unrolling
> pass was abstraction penalty removal which works for low trip-count loops.
> So maybe that new --param for allowed growth should scale but instead
> of scaling by the loop size as 2/3 does it should scale by the number of
> times we peel which means offsetting the body size estimate by a constant.
>
> Honza?  Any idea how to go forward here?
>
> Thanks,
> Richard.
>
> > gcc: gcc.dg/tree-ssa/ssa-dom-cse-5.c scan-tree-dump-times dom2 "return 3;" 1
> >
> > gcc: gcc.dg/tree-ssa/update-cunroll.c scan-tree-dump-times optimized
> > "Invalid sum" 0
> >
> > gcc: gcc.dg/tree-ssa/vrp88.c scan-tree-dump vrp1 "Folded into: if.*"
> >
> > gcc: gcc.dg/vect/no-vfa-vect-dv-2.c scan-tree-dump-times vect
> > "vectorized 3 loops" 1
> >
> > >
> > > If we need some extra leeway for UL_NO_GROWTH for what we expect
> > > to unroll it might be better to add sth like --param
> > > nogrowth-completely-peeled-insns
> > > specifying a fixed surplus size?  Or we need to look at what's the problem
> > > with the testcases regressing or the one you are trying to fix.
> > >
> > > I did experiment with better estimating cleanup done at some point
> > > (see attached),
> > > but didn't get to finishing that (and as said, as we're running VN on the 
> > > result
> > > we'd ideally do that as part of the estimation somehow).
> > >
> > > Richard.
> > >
> > > > +unr_insns = unr_insns * 2 / 3;
> > > > +
> > > >if (unr_insns <= 0)
> > > >  unr_insns = 1;
> > > >
> > > > @@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
> > > >
> > > >   unsigned HOST_WIDE_INT ninsns = size.overall;
> > > >   unsigned HOST_WIDE_INT unr_insns
> > > > -   = estimated_unrolled_size (, n_unroll);
> > > > +   = estimated_unrolled_size (, n_unroll, ul, loop);
> > > >   if (dump_file && (dump_flags & TDF_DETAILS))
> > > > {
> > > >  

Re: [PATCH] Don't reduce estimated unrolled size for innermost loop.

2024-05-14 Thread Hongtao Liu
On Mon, May 13, 2024 at 3:40 PM Richard Biener
 wrote:
>
> On Mon, May 13, 2024 at 4:29 AM liuhongt  wrote:
> >
> > As shown by the testcase in the PR, at -O3 cunrolli may prevent
> > vectorization of the innermost loop and increase register pressure.
> > The patch removes the 1/3 reduction of unr_insns for the innermost loop
> > for UL_ALL.  The ul != UL_ALL condition is needed since some small-loop
> > complete unrolling at -O2 relies on the reduction.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > No big impact for SPEC2017.
> > Ok for trunk?
>
> This removes the 1/3 reduction when unrolling a loop nest (the case I was
> concerned about).  Unrolling of a nest is by iterating in
> tree_unroll_loops_completely
> so the to be unrolled loop appears innermost.  So I think you need a new
> parameter on tree_unroll_loops_completely_1 indicating whether we're in the
> first iteration (or whether to assume inner most loops will "simplify").
yes, it would be better.
>
> Few comments below
>
> > gcc/ChangeLog:
> >
> > PR tree-optimization/112325
> > * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
> > new parameters: loop and ul, and remove unr_insns reduction
> > for innermost loop.
> > (try_unroll_loop_completely): Pass loop and ul to
> > estimated_unrolled_size.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/tree-ssa/pr112325.c: New test.
> > * gcc.dg/vect/pr69783.c: Add extra option --param
> > max-completely-peeled-insns=300.
> > ---
> >  gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 
> >  gcc/testsuite/gcc.dg/vect/pr69783.c  |  2 +-
> >  gcc/tree-ssa-loop-ivcanon.cc | 16 +--
> >  3 files changed, 71 insertions(+), 4 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > new file mode 100644
> > index 000..14208b3e7f8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
> > @@ -0,0 +1,57 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
> > +
> > +typedef unsigned short ggml_fp16_t;
> > +static float table_f32_f16[1 << 16];
> > +
> > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> > +unsigned short s;
> > +__builtin_memcpy(&s, &f, sizeof(unsigned short));
> > +return table_f32_f16[s];
> > +}
> > +
> > +typedef struct {
> > +ggml_fp16_t d;
> > +ggml_fp16_t m;
> > +unsigned char qh[4];
> > +unsigned char qs[32 / 2];
> > +} block_q5_1;
> > +
> > +typedef struct {
> > +float d;
> > +float s;
> > +char qs[32];
> > +} block_q8_1;
> > +
> > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
> > restrict vx, const void * restrict vy) {
> > +const int qk = 32;
> > +const int nb = n / qk;
> > +
> > +const block_q5_1 * restrict x = vx;
> > +const block_q8_1 * restrict y = vy;
> > +
> > +float sumf = 0.0;
> > +
> > +for (int i = 0; i < nb; i++) {
> > +unsigned qh;
> > +__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> > +
> > +int sumi = 0;
> > +
> > +for (int j = 0; j < qk/2; ++j) {
> > +const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> > +const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> > +
> > +const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> > +const int x1 = (x[i].qs[j] >> 4) | xh_1;
> > +
> > +sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> > +}
> > +
> > +sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
> > ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> > +}
> > +
> > +*s = sumf;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param 
> > max-completely-peel-times limit reached} "cunrolli"} } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c 
> > b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > index 5df95d0ce4e..a1f75514d72 100644
> > --- a/gcc/testsuite/gcc.dg/vect/pr69783.c
> > +++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
> > @@ -1,6 +1,6 @@
> >  /* { dg-do compile } */
> >  /* { dg-require-effective-target vect_float } */
> > -/* { dg-additional-options "-Ofast -funroll-loops" } */
> > +/* { dg-additional-options "-Ofast -funroll-loops --param 
> > max-completely-peeled-insns=300" } */
>
> If we rely on unrolling of a loop can you put #pragma unroll [N]
> before the respective loop
> instead?
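For instance (an illustrative sketch only; both the unroll factor and the
loop shape are placeholders), placed immediately before the loop that needs
to stay fully unrolled:

    #pragma GCC unroll 8
    for (int i = 0; i < n; i++)
      use (i);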
>
> >  #define NXX 516
> >  #define NYY 516
> > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > index bf017137260..5e0eca647a1 100644
> > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > @@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, 
> > edge edge_to_cancel,
> >
> >  static unsigned HOST_WIDE_INT
> >  estimated_unrolled_size (struct loop_size *size,
> > - 

Re: [x86 SSE] Improve handling of ternlog instructions in i386/sse.md

2024-05-14 Thread Hongtao Liu
On Mon, May 13, 2024 at 5:57 AM Roger Sayle  wrote:
>
>
> This patch improves the way that the x86 backend recognizes and
> expands AVX512's bitwise ternary logic (vpternlog) instructions.
I like the patch.
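As a quick reminder of how the vpternlog immediate is formed (a worked
example; the a = 0xf0, b = 0xcc, c = 0xaa convention matches the 0xaa that
ix86_ternlog_idx returns for the third operand): evaluate the bitwise
expression on those 8-bit truth-table constants and the result is the
immediate, e.g. ~a & c = 0x0f & 0xaa = 0x0a, ~a & b = 0x0f & 0xcc = 0x0c
(the 0x0a/0x0c cases below), and (a & b) | (~a & c) = 0xc0 | 0x0a = 0xca.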

1 file changed, 25 insertions(+), 1 deletion(-)
gcc/config/i386/i386-expand.cc | 26 +-

modified   gcc/config/i386/i386-expand.cc
@@ -25601,6 +25601,7 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x)
 int
 ix86_ternlog_idx (rtx op, rtx *args)
 {
+  /* Nice dynamic programming:)  */
   int idx0, idx1;

   if (!op)
@@ -25651,6 +25652,7 @@ ix86_ternlog_idx (rtx op, rtx *args)
return 0xaa;
  }
   /* Maximum of one volatile memory reference per expression.  */
+  /* According to comments, it should be && ?  */
   if (side_effects_p (op) || side_effects_p (args[2]))
  return -1;
   if (rtx_equal_p (op, args[2]))
@@ -25666,6 +25668,8 @@ ix86_ternlog_idx (rtx op, rtx *args)

 case SUBREG:
   if (!VECTOR_MODE_P (GET_MODE (SUBREG_REG (op)))
+   /* It could be TI/OI/XImode since it's just bit operations,
+  So no need for VECTOR_MODE_P?  */
|| GET_MODE_SIZE (GET_MODE (SUBREG_REG (op)))
   != GET_MODE_SIZE (GET_MODE (op)))
  return -1;
@@ -25701,7 +25705,7 @@ ix86_ternlog_idx (rtx op, rtx *args)
 case UNSPEC:
   if (XINT (op, 1) != UNSPEC_VTERNLOG
|| XVECLEN (op, 0) != 4
-   || CONST_INT_P (XVECEXP (op, 0, 3)))
+   || !CONST_INT_P (XVECEXP (op, 0, 3)))
  return -1;

   /* TODO: Handle permuted operands.  */
@@ -25778,10 +25782,13 @@ ix86_ternlog_operand_p (rtx op)
   /* Prefer pxor.  */
   if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
&& (ix86_ternlog_leaf_p (op1, mode)
+   /* Add some comments, it's because we already have
one_cmpl<mode>2.  */
|| vector_all_ones_operand (op1, mode)))
  return false;
   break;

+  /* Wouldn't pternlog match (SUBREG: (REG))???, and it should
also be excluded.
+Similar for SUBREG: (AND/IOR/XOR)?   */
 default:
   break;
 }
@@ -25865,25 +25872,35 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,

 case 0x0a: /* ~a&c */
   if ((!op1 || !side_effects_p (op1))
+   /* shouldn't op1 always be register_operand with no side effects
when it exists?
+  _vternlog_mask only supports register_operand for op1.
+  ix86_ternlog_idx only assigns REG to args[1].
+  Ditto for op0, also we should add op2 && register_operand (op2, mode)
+  to avoid a segmentation fault?   */
&& register_operand (op0, mode)
&& register_operand (op2, mode))
  return ix86_expand_ternlog_andnot (mode, op0, op1, target);
+  /* op2 instead of op1??? */
   break;

 case 0x0c: /* ~a&b */
   if ((!op2 || !side_effects_p (op2))
&& register_operand (op0, mode)
&& register_operand (op1, mode))
+ /* If op0 and op1 exist, they must be register_operand? So just op0
&& op1?  */
  return ix86_expand_ternlog_andnot (mode, op0, op1, target);
   break;

 case 0x0f:  /* ~a */
   if ((!op1 || !side_effects_p (op1))
+   /* No need for !side_effects for op1?  */
+   /* Ditto.  */
&& (!op2 || !side_effects_p (op2)))
  {
if (GET_MODE (op0) != mode)
  op0 = gen_lowpart (mode, op0);
if (!TARGET_64BIT && !register_operand (op0, mode))
+ /* It must be register_operand for op0 when it exists, no? */
  op0 = force_reg (mode, op0);
emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
return target;
@@ -25894,6 +25911,7 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
   if ((!op0 || !side_effects_p (op0))
&& register_operand (op1, mode)
&& register_operand (op2, mode))
+ /* op1 && op2 && register_operand (op2, mode)??  */
  return ix86_expand_ternlog_andnot (mode, op1, op2, target);
   break;

@@ -25901,12 +25919,14 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
   if ((!op2 || !side_effects_p (op2))
&& register_operand (op0, mode)
&& register_operand (op1, mode))
+ /* op0 && op1? */
  return ix86_expand_ternlog_andnot (mode, op1, op0, target);
   break;

 case 0x33:  /* ~b */
   if ((!op0 || !side_effects_p (op0))
&& (!op2 || !side_effects_p (op2)))
+ /* op1 && (!op2 || !side_effects_p (op2)) ?  */
  {
if (GET_MODE (op1) != mode)
  op1 = gen_lowpart (mode, op1);
@@ -26051,6 +26071,10 @@ ix86_expand_ternlog (machine_mode mode, rtx
op0, rtx op1, rtx op2, int idx,
   tmp2 = ix86_gen_bcst_mem (mode, op2);
   if (!tmp2)
  tmp2 = validize_mem (force_const_mem (mode, op2));
+  /* Can we use ix86_expand_vector_move here?  It will try to move the
integer to a gpr,
+ and broadcast the gpr to the vector register.
+ It should be faster than a constant pool, and PR115021 should be solved in
+ another way instead of with this workaround.  */
 }
   else
 tmp2 = op2;




-- 
BR,
Hongtao


Re: [x86 PATCH] Improve V[48]QI shifts on AVX512

2024-05-10 Thread Hongtao Liu
On Fri, May 10, 2024 at 3:41 PM Roger Sayle  wrote:
>
>
> Many thanks for the speedy review and correction/improvement.
> It's interesting that you spotted the ternlog "spill"...
> I have a patch that rewrites ternlog handling that's been
> waiting for stage 1, that would also fix this mem operand
> issue.  I hope to submit it for review this weekend.
I opened a PR for that. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115021
>
> Thanks again,
> Roger
>
> > From: Hongtao Liu 
> > On Fri, May 10, 2024 at 6:26 AM Roger Sayle 
> > wrote:
> > >
> > >
> > > The following one line patch improves the code generated for V8QI and
> > > V4QI shifts when AVX512BW and AVX512VL functionality is available.
> > +  /* With AVX512 its cheaper to do vpmovsxbw/op/vpmovwb.  */
> > +  && !(TARGET_AVX512BW && TARGET_AVX512VL && TARGET_SSE4_1)
> >&& ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2)) I 
> > think
> > TARGET_SSE4_1 is enough, it's always better w/ sse4.1 and above when not 
> > going
> > into ix86_expand_vec_shift_qihi_constant.
> > Others LGTM.
> > >
> > > For the testcase (from gcc.target/i386/vect-shiftv8qi.c):
> > >
> > > typedef signed char v8qi __attribute__ ((__vector_size__ (8))); v8qi
> > > foo (v8qi x) { return x >> 5; }
> > >
> > > GCC with -O2 -march=cascadelake currently generates:
> > >
> foo:    movl    $67372036, %eax
> > > vpsraw  $5, %xmm0, %xmm2
> > > vpbroadcastd%eax, %xmm1
> > > movl$117901063, %eax
> > > vpbroadcastd%eax, %xmm3
> > > vmovdqa %xmm1, %xmm0
> > > vmovdqa %xmm3, -24(%rsp)
> > > vpternlogd  $120, -24(%rsp), %xmm2, %xmm0
> > It looks like a missed optimization under AVX512, but it's a separate issue.
> > > vpsubb  %xmm1, %xmm0, %xmm0
> > > ret
> > >
> > > with this patch we now generate the much improved:
> > >
> foo:    vpmovsxbw   %xmm0, %xmm0
> > > vpsraw  $5, %xmm0, %xmm0
> > > vpmovwb %xmm0, %xmm0
> > > ret
> > >
> > > This patch also fixes the FAILs of gcc.target/i386/vect-shiftv[48]qi.c
> > > when run with the additional -march=cascadelake flag, by splitting
> > > these tests into two; one form testing code generation with -msse2
> > > (and
> > > -mno-avx512vl) as originally intended, and the other testing AVX512
> > > code generation with an explicit -march=cascadelake.
> > >
> > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > > and make -k check, both with and without --target_board=unix{-m32}
> > > with no new failures.  Ok for mainline?
> > >
> > >
> > > 2024-05-09  Roger Sayle  
> > >
> > > gcc/ChangeLog
> > > * config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
> > > Don't attempt ix86_expand_vec_shift_qihi_constant on AVX512.
> > >
> > > gcc/testsuite/ChangeLog
> > > * gcc.target/i386/vect-shiftv4qi.c: Specify -mno-avx512vl.
> > > * gcc.target/i386/vect-shiftv8qi.c: Likewise.
> > > * gcc.target/i386/vect-shiftv4qi-2.c: New test case.
> > > * gcc.target/i386/vect-shiftv8qi-2.c: Likewise.
> > >
> > >
> > > Thanks in advance,
> > > Roger
> > > --
> > >
> > --
> > BR,
> > Hongtao
>


-- 
BR,
Hongtao


Re: [x86 PATCH] Improve V[48]QI shifts on AVX512

2024-05-09 Thread Hongtao Liu
On Fri, May 10, 2024 at 6:26 AM Roger Sayle  wrote:
>
>
> The following one line patch improves the code generated for V8QI and V4QI
> shifts when AVX512BW and AVX512VL functionality is available.
+  /* With AVX512 its cheaper to do vpmovsxbw/op/vpmovwb.  */
+  && !(TARGET_AVX512BW && TARGET_AVX512VL && TARGET_SSE4_1)
   && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
I think TARGET_SSE4_1 is enough, it's always better w/ sse4.1 and
above when not going into ix86_expand_vec_shift_qihi_constant.
Others LGTM.
>
> For the testcase (from gcc.target/i386/vect-shiftv8qi.c):
>
> typedef signed char v8qi __attribute__ ((__vector_size__ (8)));
> v8qi foo (v8qi x) { return x >> 5; }
>
> GCC with -O2 -march=cascadelake currently generates:
>
> foo:movl$67372036, %eax
> vpsraw  $5, %xmm0, %xmm2
> vpbroadcastd%eax, %xmm1
> movl$117901063, %eax
> vpbroadcastd%eax, %xmm3
> vmovdqa %xmm1, %xmm0
> vmovdqa %xmm3, -24(%rsp)
> vpternlogd  $120, -24(%rsp), %xmm2, %xmm0
It looks like a missed optimization under AVX512, but it's a separate issue.
> vpsubb  %xmm1, %xmm0, %xmm0
> ret
>
> with this patch we now generate the much improved:
>
> foo:vpmovsxbw   %xmm0, %xmm0
> vpsraw  $5, %xmm0, %xmm0
> vpmovwb %xmm0, %xmm0
> ret
>
> This patch also fixes the FAILs of gcc.target/i386/vect-shiftv[48]qi.c
> when run with the additional -march=cascadelake flag, by splitting these
> tests into two; one form testing code generation with -msse2 (and
> -mno-avx512vl) as originally intended, and the other testing AVX512
> code generation with an explicit -march=cascadelake.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2024-05-09  Roger Sayle  
>
> gcc/ChangeLog
> * config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
> Don't attempt ix86_expand_vec_shift_qihi_constant on AVX512.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/vect-shiftv4qi.c: Specify -mno-avx512vl.
> * gcc.target/i386/vect-shiftv8qi.c: Likewise.
> * gcc.target/i386/vect-shiftv4qi-2.c: New test case.
> * gcc.target/i386/vect-shiftv8qi-2.c: Likewise.
>
>
> Thanks in advance,
> Roger
> --
>


-- 
BR,
Hongtao


Re: [PATCH] i386: Fix some intrinsics without alignment requirements.

2024-05-08 Thread Hongtao Liu
On Wed, May 8, 2024 at 10:13 AM Hu, Lin1  wrote:
>
> Hi all,
>
> This patch aims to fix some intrinsics that have no alignment requirement but
> currently raise runtime errors.
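A minimal standalone sketch of the technique the patch relies on (the names
below are illustrative, not the actual header definitions): a may_alias
typedef with __aligned__ (1) makes the dereference an unaligned access, so the
load does not assume the natural 8-byte alignment.

  typedef double dbl_unaligned
    __attribute__ ((__may_alias__, __aligned__ (1)));

  /* Load 8 bytes from P without assuming 8-byte alignment.  */
  static inline double
  load_double_unaligned (const void *p)
  {
    return *(const dbl_unaligned *) p;
  }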
>
> Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?
Ok.
>
> BRs,
> Lin
>
> gcc/ChangeLog:
>
> PR target/84508
> * config/i386/emmintrin.h
> (_mm_load_sd): Remove alignment requirement.
> (_mm_store_sd): Ditto.
> (_mm_loadh_pd): Ditto.
> (_mm_loadl_pd): Ditto.
> (_mm_storel_pd): Add alignment requirement.
> * config/i386/xmmintrin.h
> (_mm_loadh_pi): Remove alignment requirement.
> (_mm_loadl_pi): Ditto.
> (_mm_load_ss): Ditto.
> (_mm_store_ss): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> PR target/84508
> * gcc.target/i386/pr84508-1.c: New test.
> * gcc.target/i386/pr84508-2.c: Ditto.
> ---
>  gcc/config/i386/emmintrin.h   | 11 ++-
>  gcc/config/i386/xmmintrin.h   |  9 +
>  gcc/testsuite/gcc.target/i386/pr84508-1.c | 11 +++
>  gcc/testsuite/gcc.target/i386/pr84508-2.c | 11 +++
>  4 files changed, 33 insertions(+), 9 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr84508-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr84508-2.c
>
> diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
> index 915a5234c38..d7fc1af9687 100644
> --- a/gcc/config/i386/emmintrin.h
> +++ b/gcc/config/i386/emmintrin.h
> @@ -56,6 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ 
> (16), __may_alias__));
>  /* Unaligned version of the same types.  */
>  typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
> __may_alias__, __aligned__ (1)));
>  typedef double __m128d_u __attribute__ ((__vector_size__ (16), 
> __may_alias__, __aligned__ (1)));
> +typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1)));
>
>  /* Create a selector for use with the SHUFPD instruction.  */
>  #define _MM_SHUFFLE2(fp1,fp0) \
> @@ -145,7 +146,7 @@ _mm_load1_pd (double const *__P)
>  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_load_sd (double const *__P)
>  {
> -  return _mm_set_sd (*__P);
> +  return __extension__ (__m128d){ *(double_u *)__P, 0.0 };
>  }
>
>  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
> @@ -180,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
>  extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_store_sd (double *__P, __m128d __A)
>  {
> -  *__P = ((__v2df)__A)[0];
> +  *(double_u *)__P = ((__v2df)__A)[0] ;
>  }
>
>  extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
> @@ -192,7 +193,7 @@ _mm_cvtsd_f64 (__m128d __A)
>  extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_storel_pd (double *__P, __m128d __A)
>  {
> -  _mm_store_sd (__P, __A);
> +  *__P = ((__v2df)__A)[0];
>  }
>
>  /* Stores the upper DPFP value.  */
> @@ -973,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
>  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_loadh_pd (__m128d __A, double const *__B)
>  {
> -  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
> +  return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B };
>  }
>
>  extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_loadl_pd (__m128d __A, double const *__B)
>  {
> -  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
> +  return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] };
>  }
>
>  extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
> diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
> index 71b9955b843..9e20f262839 100644
> --- a/gcc/config/i386/xmmintrin.h
> +++ b/gcc/config/i386/xmmintrin.h
> @@ -73,6 +73,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
> __may_alias__));
>
>  /* Unaligned version of the same type.  */
>  typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, 
> __aligned__ (1)));
> +typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1)));
>
>  /* Internal data types for implementing the intrinsics.  */
>  typedef float __v4sf __attribute__ ((__vector_size__ (16)));
> @@ -774,7 +775,7 @@ _mm_unpacklo_ps (__m128 __A, __m128 __B)
>  /* Sets the upper two SPFP values with 64-bits of data loaded from P;
> the lower two values are passed through from A.  */
>  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
> -_mm_loadh_pi (__m128 __A, __m64 const *__P)
> +_mm_loadh_pi (__m128 __A, __m64_u const *__P)
>  {
>return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
>  }
> @@ -803,7 

Re: [PATCH] i386: fix ix86_hardreg_mov_ok with lra_in_progress

2024-05-07 Thread Hongtao Liu
On Mon, May 6, 2024 at 3:40 PM Kong, Lingling  wrote:
>
> Hi,
> Originally eliminate_regs_in_insn will transform
> (parallel [
>   (set (reg:QI 130)
> (plus:QI (subreg:QI (reg:DI 19 frame) 0)
>   (const_int 96)))
>   (clobber (reg:CC 17 flag))]) {*addqi_1}
> to
> (set (reg:QI 130)
>   (subreg:QI (reg:DI 19 frame) 0)) {*movqi_internal}
> when verify_changes.
>
> But with No Flags add, it transforms
> (set (reg:QI 5 di)
>   (plus:QI (subreg:QI (reg:DI 19 frame) 0)
>(const_int 96))) {*addqi_1_nf}
> to
> (set (reg:QI 5 di)
>  (subreg:QI (reg:DI 19 frame) 0)) {*addqi_1_nf}.
> There are no extra clobbers at the end, and its dest reg is just a hard reg.
> For ix86_hardreg_mov_ok, it returns false.  So it fails to update the insn and
> causes the ICE when transforming to movqi_internal.
>
> But actually it is ok and safe for ix86_hardreg_mov_ok when lra_in_progress.
>
> I tested SPEC2017, and the performance was not affected.
> Bootstrapped and regtested on x86_64-pc-linux-gnu. OK for trunk?
Ok.
>
> gcc/ChangeLog:
>
> * config/i386/i386.cc (ix86_hardreg_mov_ok): Relax
> hard reg mov restriction when lra in progress.
> ---
>  gcc/config/i386/i386.cc | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
> 4d6b2b98761..ca4348a18bf 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20357,7 +20357,8 @@ ix86_hardreg_mov_ok (rtx dst, rtx src)
>? standard_sse_constant_p (src, GET_MODE (dst))
>: x86_64_immediate_operand (src, GET_MODE (dst)))
>&& ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))
> -  && !reload_completed)
> +  && !reload_completed
> +  && !lra_in_progress)
>  return false;
>return true;
>  }
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] x86: Fix cmov cost model issue [PR109549]

2024-05-05 Thread Hongtao Liu
CC uros.

On Mon, May 6, 2024 at 11:03 AM Kong, Lingling  wrote:
>
> Hi,
> (if_then_else:SI (eq (reg:CCZ 17 flags)
> (const_int 0 [0]))
> (reg/v:SI 101 [ e ])
> (reg:SI 102))
> The cost is 8 for the rtx; the cost for
> (eq (reg:CCZ 17 flags) (const_int 0 [0])) is 4, but this is just an operator
> and we do not need to compute its cost for cmov.
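An illustrative breakdown of the numbers above (assuming the usual
COSTS_N_INSNS (1) == 4): the cmov itself contributes 4 and the (eq ...) operand
another 4, giving 8; with comparison operands skipped, the cmov keeps its
intended cost of 4.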
It looks like a reasonable change to me; for cmov, the first operand
of if_then_else is not a mask.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu.
> OK for trunk?
>
> gcc/ChangeLog:
>
> PR target/109549
> * config/i386/i386.cc (ix86_rtx_costs): The XEXP (x, 0) for cmov
> is an operator do not need to compute cost.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/cmov6.c: Fixed.
> ---
>  gcc/config/i386/i386.cc   | 2 +-
>  gcc/testsuite/gcc.target/i386/cmov6.c | 5 +
>  2 files changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
> 4d6b2b98761..59b4ce3bfbf 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -22237,7 +22237,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
> outer_code_i, int opno,
> {
>   /* cmov.  */
>   *total = COSTS_N_INSNS (1);
> - if (!REG_P (XEXP (x, 0)))
> + if (!COMPARISON_P (XEXP (x, 0)) && !REG_P (XEXP (x, 0)))
> *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
>   if (!REG_P (XEXP (x, 1)))
> *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); diff 
> --git a/gcc/testsuite/gcc.target/i386/cmov6.c 
> b/gcc/testsuite/gcc.target/i386/cmov6.c
> index 5111c8a9099..535326e4c2a 100644
> --- a/gcc/testsuite/gcc.target/i386/cmov6.c
> +++ b/gcc/testsuite/gcc.target/i386/cmov6.c
> @@ -1,9 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -march=k8" } */
> -/* if-converting this sequence would require two cmov
> -   instructions and seems to always cost more independent
> -   of the TUNE_ONE_IF_CONV setting.  */
> -/* { dg-final { scan-assembler-not "cmov\[^6\]" } } */
> +/* { dg-final { scan-assembler "cmov\[^6\]" } } */
>
>  /* Verify that blocks are converted to conditional moves.  */  extern int 
> bar (int, int);
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] Don't assert for IFN_COND_{MIN, MAX} in vect_transform_reduction

2024-04-30 Thread Hongtao Liu
On Tue, Apr 30, 2024 at 3:38 PM Jakub Jelinek  wrote:
>
> On Tue, Apr 30, 2024 at 09:30:00AM +0200, Richard Biener wrote:
> > On Mon, Apr 29, 2024 at 5:30 PM H.J. Lu  wrote:
> > >
> > > On Mon, Apr 29, 2024 at 6:47 AM liuhongt  wrote:
> > > >
> > > > The Fortran standard does not specify what the result of the MAX
> > > > and MIN intrinsics is if one of the arguments is a NaN.  So it
> > > > should be ok to transform the reduction for IFN_COND_MIN with vectorized
> > > > COND_MIN and REDUC_MIN.
> > >
> > > The commit subject isn't very clear.   This patch isn't about "Don't 
> > > assert
> > > for IFN_COND_{MIN,MAX}".  It allows IFN_COND_{MIN,MAX} in
> > > vect_transform_reduction.
> >
> > Well, we allow it elsewhere, we just fail to enumerate all COND_* we allow
> > here correctly.
> >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > Ok for trunk and backport to GCC14?
> >
> > OK for trunk and branch.
>
> Oops, I've just sent the same patch, just with a different testcase
> (reduced and which tests both the min and max).
> I think the reduced testcase is better.
Yes, please commit your patch :)
>
> > > > gcc/ChangeLog:
> > > >
> > > > PR 114883
>
> Missing tree-optimization/
>
> > > > * tree-vect-loop.cc (vect_transform_reduction): Don't assert
> > > > for IFN_COND_{MIN, MAX}.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * gfortran.dg/pr114883.f90: New test.
>
> Jakub
>


-- 
BR,
Hongtao


Re: [PATCH] i386: Fix behavior for both using AVX10.1-256 in options and function attribute

2024-04-24 Thread Hongtao Liu
On Wed, Apr 24, 2024 at 1:46 PM Haochen Jiang  wrote:
>
> Hi all,
>
> When we are using -mavx10.1-256 on the command line and avx10.1-256 in
> the target attribute together, zmm should never be generated.  But current
> GCC will generate zmm since it wrongly enables EVEX512 when AVX512 is not
> explicitly set.  This patch fixes that issue.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
Ok.
>
> gcc/ChangeLog:
>
> * config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
> Check whether AVX512F is explicitly enabled.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx10_1-24.c: New test.
> ---
>  gcc/config/i386/i386-options.cc| 1 +
>  gcc/testsuite/gcc.target/i386/avx10_1-24.c | 7 +++
>  2 files changed, 8 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx10_1-24.c
>
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index 68a2e1c6910..ac48b5c61c4 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -1431,6 +1431,7 @@ ix86_valid_target_attribute_tree (tree fndecl, tree 
> args,
>   scenario.  */
>if ((def->x_ix86_isa_flags2 & OPTION_MASK_ISA2_AVX10_1_256)
>&& (opts->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512F)
> +  && (opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)
>&& !(def->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_EVEX512)
>&& !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA2_EVEX512))
>  opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA2_EVEX512;
> diff --git a/gcc/testsuite/gcc.target/i386/avx10_1-24.c 
> b/gcc/testsuite/gcc.target/i386/avx10_1-24.c
> new file mode 100644
> index 000..2e93f041760
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx10_1-24.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=x86-64 -mavx10.1" } */
> +/* { dg-final { scan-assembler-not "%zmm" } } */
> +
> +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
> +
> +void __attribute__((target("avx10.1-256"))) callee256(__m512 *a, __m512 *b) 
> { *a = *b; }
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] x86: Allow TImode offsettable memory only with 8-bit constant

2024-04-14 Thread Hongtao Liu
On Sat, Apr 13, 2024 at 6:42 AM H.J. Lu  wrote:
>
> The x86 instruction size limit is 15 bytes.  If an NDD instruction has
> a segment prefix byte, a 4-byte opcode prefix, a MODRM byte, a SIB byte,
> a 4-byte displacement and a 4-byte immediate, adding an address size
> prefix will exceed the size limit.  Change TImode ADD, AND, OR and XOR
> to allow offsettable memory only with 8-bit signed integer constant,
> which is encoded with a 1-byte immediate, if the address size prefix
> is used.
Ok.
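An illustrative worst-case byte count for the case described above (using the
field sizes listed in the paragraph):

  1 (segment prefix) + 4 (opcode prefix) + 1 (ModRM) + 1 (SIB)
    + 4 (disp32) + 4 (imm32) = 15 bytes

so one extra address size prefix gives 16 bytes and exceeds the limit, while an
8-bit immediate brings the total down to 12 bytes and leaves room for it.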
>
> gcc/
>
> PR target/114696
> * config/i386/i386.md (isa): Add apx_ndd_64.
> (enabled): Likewise.
> (*add3_doubleword): Change rjO to r,ro,jO with 8-bit
> signed integer constant and enable jO only for apx_ndd_64.
> (*add3_doubleword_cc_overflow_1): Likewise.
> (*and3_doubleword): Likewise.
> (*3_doubleword): Likewise.
>
> gcc/testsuite/
>
> PR target/114696
> * gcc.target/i386/apx-ndd-x32-2a.c: New test.
> * gcc.target/i386/apx-ndd-x32-2b.c: Likewise.
> * gcc.target/i386/apx-ndd-x32-2c.c: Likewise.
> * gcc.target/i386/apx-ndd-x32-2d.c: Likewise.
> ---
>  gcc/config/i386/i386.md   | 36 ++-
>  .../gcc.target/i386/apx-ndd-x32-2a.c  | 13 +++
>  .../gcc.target/i386/apx-ndd-x32-2b.c  |  6 
>  .../gcc.target/i386/apx-ndd-x32-2c.c  |  6 
>  .../gcc.target/i386/apx-ndd-x32-2d.c  |  6 
>  5 files changed, 50 insertions(+), 17 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-x32-2a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-x32-2b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-x32-2c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-x32-2d.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index d4ce3809e6d..adab1ef9e04 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -568,7 +568,7 @@ (define_attr "unit" "integer,i387,sse,mmx,unknown"
>
>  ;; Used to control the "enabled" attribute on a per-instruction basis.
>  (define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx,
> -   x64_avx,x64_avx512bw,x64_avx512dq,apx_ndd,
> +   x64_avx,x64_avx512bw,x64_avx512dq,apx_ndd,apx_ndd_64,
> sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
> 
> avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,avx512f_512,
> noavx512f,avx512bw,avx512bw_512,noavx512bw,avx512dq,
> @@ -968,6 +968,8 @@ (define_attr "enabled" ""
>(symbol_ref "TARGET_VPCLMULQDQ && TARGET_AVX512VL")
>  (eq_attr "isa" "apx_ndd")
>(symbol_ref "TARGET_APX_NDD")
> +(eq_attr "isa" "apx_ndd_64")
> +  (symbol_ref "TARGET_APX_NDD && Pmode == DImode")
>  (eq_attr "isa" "vaes_avx512vl")
>(symbol_ref "TARGET_VAES && TARGET_AVX512VL")
>
> @@ -6302,10 +6304,10 @@ (define_expand "add3"
>  })
>
>  (define_insn_and_split "*add3_doubleword"
> -  [(set (match_operand: 0 "nonimmediate_operand" "=ro,r,,,")
> +  [(set (match_operand: 0 "nonimmediate_operand" "=ro,r,")
> (plus:
> - (match_operand: 1 "nonimmediate_operand" "%0,0,ro,rjO,r")
> - (match_operand: 2 "x86_64_hilo_general_operand" 
> "r,o,r,,r")))
> + (match_operand: 1 "nonimmediate_operand" "%0,0,ro,r,ro,jO,r")
> + (match_operand: 2 "x86_64_hilo_general_operand" 
> "r,o,r,,K,,r")))
> (clobber (reg:CC FLAGS_REG))]
>"ix86_binary_operator_ok (PLUS, mode, operands, TARGET_APX_NDD)"
>"#"
> @@ -6344,7 +6346,7 @@ (define_insn_and_split "*add3_doubleword"
>DONE;
>  }
>  }
> -[(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")])
> +[(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])
>
>  (define_insn_and_split "*add3_doubleword_zext"
>[(set (match_operand: 0 "nonimmediate_operand" "=r,o,,")
> @@ -9515,10 +9517,10 @@ (define_insn_and_split 
> "*add3_doubleword_cc_overflow_1"
>[(set (reg:CCC FLAGS_REG)
> (compare:CCC
>   (plus:
> -   (match_operand: 1 "nonimmediate_operand" "%0,0,ro,rjO,r")
> -   (match_operand: 2 "x86_64_hilo_general_operand" 
> "r,o,r,,o"))
> +   (match_operand: 1 "nonimmediate_operand" "%0,0,ro,r,ro,jO,r")
> +   (match_operand: 2 "x86_64_hilo_general_operand" 
> "r,o,r,,K,,o"))
>   (match_dup 1)))
> -   (set (match_operand: 0 "nonimmediate_operand" "=ro,r,,,")
> +   (set (match_operand: 0 "nonimmediate_operand" "=ro,r,")
> (plus: (match_dup 1) (match_dup 2)))]
>"ix86_binary_operator_ok (PLUS, mode, operands, TARGET_APX_NDD)"
>"#"
> @@ -9560,7 +9562,7 @@ (define_insn_and_split 
> "*add3_doubleword_cc_overflow_1"
>else
>  operands[6] = gen_rtx_ZERO_EXTEND (mode, operands[5]);
>  }
> -[(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")])
> +[(set_attr "isa" 

Re: [PATCH] Prohibit SHA/KEYLOCKER usage of EGPR when APX enabled

2024-04-09 Thread Hongtao Liu
On Tue, Apr 9, 2024 at 3:05 PM Hongyu Wang  wrote:
>
> The latest APX spec announced removal of SHA/KEYLOCKER evex promotion [1],
> which means the SHA/KEYLOCKER insns do not support EGPR when APX is
> enabled.  Update the corresponding constraints to their EGPR-disabled
> counterparts.
>
> Bootstrapped and regtested on x86-64-pc-linux-gnu.
>
> Ok for trunk?
Ok.
>
> [1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html
>
> gcc/ChangeLog:
>
> * config/i386/sse.md (sha1msg1): Use "ja" instead of "Bm" for
> memory constraint.
> (sha1msg2): Likewise.
> (sha1nexte): Likewise.
> (sha1rnds4): Likewise.
> (sha256msg1): Likewise.
> (sha256msg2): Likewise.
> (sha256rnds2): Likewise.
> (aesu8): Use "jm" instead of "m" for memory
> constraint.
> (*aesu8): Likewise.
> (*encodekey128u32): Use "jr" instead of "r" for register
> constraints.
> (*encodekey256u32): Likewise.
> ---
>  gcc/config/i386/sse.md | 26 +-
>  1 file changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 3286d3a4fac..4b8d5342707 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -29104,7 +29104,7 @@ (define_insn "sha1msg1"
>[(set (match_operand:V4SI 0 "register_operand" "=x")
> (unspec:V4SI
>   [(match_operand:V4SI 1 "register_operand" "0")
> -  (match_operand:V4SI 2 "vector_operand" "xBm")]
> +  (match_operand:V4SI 2 "vector_operand" "xja")]
>   UNSPEC_SHA1MSG1))]
>"TARGET_SHA"
>"sha1msg1\t{%2, %0|%0, %2}"
> @@ -29115,7 +29115,7 @@ (define_insn "sha1msg2"
>[(set (match_operand:V4SI 0 "register_operand" "=x")
> (unspec:V4SI
>   [(match_operand:V4SI 1 "register_operand" "0")
> -  (match_operand:V4SI 2 "vector_operand" "xBm")]
> +  (match_operand:V4SI 2 "vector_operand" "xja")]
>   UNSPEC_SHA1MSG2))]
>"TARGET_SHA"
>"sha1msg2\t{%2, %0|%0, %2}"
> @@ -29126,7 +29126,7 @@ (define_insn "sha1nexte"
>[(set (match_operand:V4SI 0 "register_operand" "=x")
> (unspec:V4SI
>   [(match_operand:V4SI 1 "register_operand" "0")
> -  (match_operand:V4SI 2 "vector_operand" "xBm")]
> +  (match_operand:V4SI 2 "vector_operand" "xja")]
>   UNSPEC_SHA1NEXTE))]
>"TARGET_SHA"
>"sha1nexte\t{%2, %0|%0, %2}"
> @@ -29137,7 +29137,7 @@ (define_insn "sha1rnds4"
>[(set (match_operand:V4SI 0 "register_operand" "=x")
> (unspec:V4SI
>   [(match_operand:V4SI 1 "register_operand" "0")
> -  (match_operand:V4SI 2 "vector_operand" "xBm")
> +  (match_operand:V4SI 2 "vector_operand" "xja")
>(match_operand:SI 3 "const_0_to_3_operand")]
>   UNSPEC_SHA1RNDS4))]
>"TARGET_SHA"
> @@ -29150,7 +29150,7 @@ (define_insn "sha256msg1"
>[(set (match_operand:V4SI 0 "register_operand" "=x")
> (unspec:V4SI
>   [(match_operand:V4SI 1 "register_operand" "0")
> -  (match_operand:V4SI 2 "vector_operand" "xBm")]
> +  (match_operand:V4SI 2 "vector_operand" "xja")]
>   UNSPEC_SHA256MSG1))]
>"TARGET_SHA"
>"sha256msg1\t{%2, %0|%0, %2}"
> @@ -29161,7 +29161,7 @@ (define_insn "sha256msg2"
>[(set (match_operand:V4SI 0 "register_operand" "=x")
> (unspec:V4SI
>   [(match_operand:V4SI 1 "register_operand" "0")
> -  (match_operand:V4SI 2 "vector_operand" "xBm")]
> +  (match_operand:V4SI 2 "vector_operand" "xja")]
>   UNSPEC_SHA256MSG2))]
>"TARGET_SHA"
>"sha256msg2\t{%2, %0|%0, %2}"
> @@ -29172,7 +29172,7 @@ (define_insn "sha256rnds2"
>[(set (match_operand:V4SI 0 "register_operand" "=x")
> (unspec:V4SI
>   [(match_operand:V4SI 1 "register_operand" "0")
> -  (match_operand:V4SI 2 "vector_operand" "xBm")
> +  (match_operand:V4SI 2 "vector_operand" "xja")
>(match_operand:V4SI 3 "register_operand" "Yz")]
>   UNSPEC_SHA256RNDS2))]
>"TARGET_SHA"
> @@ -30575,9 +30575,9 @@ (define_expand "encodekey128u32"
>
>  (define_insn "*encodekey128u32"
>[(match_parallel 2 "encodekey128_operation"
> -[(set (match_operand:SI 0 "register_operand" "=r")
> +[(set (match_operand:SI 0 "register_operand" "=jr")
>   (unspec_volatile:SI
> -   [(match_operand:SI   1 "register_operand" "r")
> +   [(match_operand:SI   1 "register_operand" "jr")
>  (reg:V2DI XMM0_REG)]
> UNSPECV_ENCODEKEY128U32))])]
>"TARGET_KL"
> @@ -30632,9 +30632,9 @@ (define_expand "encodekey256u32"
>
>  (define_insn "*encodekey256u32"
>[(match_parallel 2 "encodekey256_operation"
> -[(set (match_operand:SI 0 "register_operand" "=r")
> +[(set (match_operand:SI 0 "register_operand" "=jr")
>   (unspec_volatile:SI
> -   [(match_operand:SI   1 

Re: [PATCH] i386, v2: Fix aes/vaes patterns [PR114576]

2024-04-09 Thread Hongtao Liu
On Tue, Apr 9, 2024 at 5:18 PM Jakub Jelinek  wrote:
>
> On Tue, Apr 09, 2024 at 11:23:40AM +0800, Hongtao Liu wrote:
> > I think we can merge alternative 2 with 3 to
> > *  return TARGET_AES ? \"vaesenc\t{%2, %1, %0|%0, %1, %2}"\" :
> > \"%{evex%} vaesenc\t{%2, %1, %0|%0, %1, %2}\";
> > Then it can handle vaes_avx512vl + -mno-aes case.
>
> Ok, done in the patch below.
>
> > > @@ -30246,44 +30250,60 @@ (define_insn "vpdpwssds__maskz_1"
> > > [(set_attr ("prefix") ("evex"))])
> > >
> > >  (define_insn "vaesdec_"
> > > -  [(set (match_operand:VI1_AVX512VL_F 0 "register_operand" "=v")
> > > +  [(set (match_operand:VI1_AVX512VL_F 0 "register_operand" "=x,v")
> > > (unspec:VI1_AVX512VL_F
> > > - [(match_operand:VI1_AVX512VL_F 1 "register_operand" "v")
> > > -  (match_operand:VI1_AVX512VL_F 2 "vector_operand" "vm")]
> > > + [(match_operand:VI1_AVX512VL_F 1 "register_operand" "x,v")
> > > +  (match_operand:VI1_AVX512VL_F 2 "vector_operand" "xjm,vm")]
> > >   UNSPEC_VAESDEC))]
> > >"TARGET_VAES"
> > > -  "vaesdec\t{%2, %1, %0|%0, %1, %2}"
> > > -)
> > > +{
> > > +  if (which_alternative == 0 && mode == V16QImode)
> > > +return "%{evex%} vaesdec\t{%2, %1, %0|%0, %1, %2}";
> > Similar, but something like
> > *  return TARGET_AES || mode != V16QImode ? \"vaesenc\t{%2, %1,
> > %0|%0, %1, %2}"\" : \"%{evex%} vaesenc\t{%2, %1, %0|%0, %1, %2}\";
>
> For a single alternative, it would need to be
> {
>   return x86_evex_reg_mentioned_p (operands, 3)
>  ? \"vaesenc\t{%2, %1, %0|%0, %1, %2}\"
>  : \"%{evex%} vaesenc\t{%2, %1, %0|%0, %1, %2}\";
> }
> (* return would just mean uselessly too long line).
> Is that what you want instead?  I thought the 2 separate alternatives
> where only the latter covers those cases is more readable...
>
> The following patch just changes the aes* patterns, not the vaes* ones.
Patch LGTM.
>
> 2024-04-09  Jakub Jelinek  
>
> PR target/114576
> * config/i386/i386.md (isa): Remove aes, add vaes_avx512vl.
> (enabled): Remove aes isa check, add vaes_avx512vl.
> * config/i386/sse.md (aesenc, aesenclast, aesdec, aesdeclast): Use
> jm instead of m for second alternative and emit {evex} prefix
> for it if !TARGET_AES.  Use noavx,avx,vaes_avx512vl isa attribute.
> (vaesdec_, vaesdeclast_, vaesenc_,
> vaesenclast_): Add second alternative with x instead of v
> and jm instead of m.
>
> * gcc.target/i386/aes-pr114576.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2024-04-09 08:12:29.259451422 +0200
> +++ gcc/config/i386/i386.md 2024-04-09 10:53:24.965516804 +0200
> @@ -568,13 +568,14 @@ (define_attr "unit" "integer,i387,sse,mm
>
>  ;; Used to control the "enabled" attribute on a per-instruction basis.
>  (define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx,
> -   x64_avx,x64_avx512bw,x64_avx512dq,aes,apx_ndd,
> +   x64_avx,x64_avx512bw,x64_avx512dq,apx_ndd,
> sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx,
> 
> avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,avx512f_512,
> noavx512f,avx512bw,avx512bw_512,noavx512bw,avx512dq,
> noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
> avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
> -   avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl"
> +   avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
> +   vaes_avx512vl"
>(const_string "base"))
>
>  ;; The (bounding maximum) length of an instruction immediate.
> @@ -915,7 +916,6 @@ (define_attr "enabled" ""
>(symbol_ref "TARGET_64BIT && TARGET_AVX512BW")
>  (eq_attr "isa" "x64_avx512dq")
>(symbol_ref "TARGET_64BIT && TARGET_AVX512DQ")
> -(eq_attr "isa" "aes") (symbol_ref "TARGET_AES")
>  (eq_attr "isa" "sse_noavx")
>(symbol_ref "TARGET_SSE && !TARGET_AVX")
>  (eq_attr "isa" "sse2") (symbol_ref "T

Re: [PATCH] i386: Fix aes/vaes patterns [PR114576]

2024-04-08 Thread Hongtao Liu
On Thu, Apr 4, 2024 at 4:42 PM Jakub Jelinek  wrote:
>
> On Wed, Apr 19, 2023 at 02:40:59AM +, Jiang, Haochen via Gcc-patches 
> wrote:
> > > >  (define_insn "aesenc"
> > > > -  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
> > > > -   (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
> > > > -  (match_operand:V2DI 2 "vector_operand" "xBm,xm")]
> > > > +  [(set (match_operand:V2DI 0 "register_operand" "=x,x,v")
> > > > +   (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x,v")
> > > > +  (match_operand:V2DI 2 "vector_operand"
> > > > + "xBm,xm,vm")]
> > > >   UNSPEC_AESENC))]
> > > > -  "TARGET_AES"
> > > > +  "TARGET_AES || (TARGET_VAES && TARGET_AVX512VL)"
> > > >"@
> > > > aesenc\t{%2, %0|%0, %2}
> > > > +   vaesenc\t{%2, %1, %0|%0, %1, %2}
> > > > vaesenc\t{%2, %1, %0|%0, %1, %2}"
> > > > -  [(set_attr "isa" "noavx,avx")
> > > > +  [(set_attr "isa" "noavx,aes,avx512vl")
> > > Shouldn't it be vaes_avx512vl and then remove " || (TARGET_VAES &&
> > > TARGET_AVX512VL)" from condition.
> >
> > Since VAES should not imply AES, we need that "|| (TARGET_VAES &&
> > TARGET_AVX512VL)"
> >
> > And there is no need to add vaes_avx512vl since the last alternative will 
> > only
> > be hit when there is no aes. When there is no aes, the pattern will need 
> > vaes
> > and avx512vl both or we could not use this pattern. avx512vl here is just 
> > like
> > a placeholder.
>
> As the following testcase shows, the above change was incorrect.
>
> Using aes isa for the second alternative is obviously wrong, aes is enabled
> whenever -maes is, regardless of -mavx or -mno-avx, so the above change
> means that for -maes -mno-avx RA can choose, either it matches the first
> alternative with the dup operand, or it matches the second one (but that
> is of course wrong because vaesenc VEX encoded insn needs AES & AVX CPUID).
>
> The big question is if "Since VAES should not imply AES" is the case or not.
> Looking around at what LLVM does on godbolt, seems since clang 6 which added
> -mvaes support -mvaes there implies -maes, but GCC treats those two
> independent.
>
> Now, if we'd take the LLVM path of making -mvaes imply -maes and -mno-aes
> imply -mno-vaes, then we should probably just revert the above patch and
> tweak common/config/i386/ to do the implications (+ add the testcase from
> this patch).
>
> If we keep the current behavior, where AES and VAES are completely
> independent extensions, then we need to do more changes as the following
> patch attempts to do.
> We should use the aesenc etc. insns for noavx as before, we know at that
> point that TARGET_AES must be true because (TARGET_VAES && TARGET_AVX512VL)
> won't be true when !TARGET_AVX - TARGET_AVX512VL implies TARGET_AVX.
> For the second alternative, i.e. the AVX AES VEX encoded case, the patch
> uses aes_avx isa which requires both.  Now, for the third one we can't
> use avx512vl isa attribute, because one could compile with
> -maes -mavx512vl -mno-vaes and in that case we want VEX encoded vaesenc
> which can't use %xmm16+ (nor EGPRs), so we need vaes_avx512vl isa to
> ensure it is enabled only for -mvaes -mavx512vl.  And there is another
> problem, with -mno-aes -mvaes -mavx512vl we could emit VEX encoded vaesenc
> which requires AES and AVX ISAs rather than the VAES and AVX512VL which
> are enabled.  So the patch uses the {evex} prefix for those cases.
> And similarly for the vaes*_ instructions, if they aren't 128-bit
> or use %xmm16+ registers, the current case is fine, but if they are 128-bit
> and use only %xmm0-15 registers, assembler would again emit VEX encoded insn
> which needs AES & AVX CPUID, rather than the EVEX encoded ones which need
> VAES & AVX512VL CPUIDs.
> Still, I wonder if -mvaes shouldn't imply at least -mavx512f and
> -mno-avx512f shouldn't imply -mno-vaes, because otherwise can't see how
> it could use 512-bit registers (this part not done in the patch).
>
> The following patch has been successfully bootstrapped/regtested on
> x86_64-linux and i686-linux.
>
> 2024-04-04  Jakub Jelinek  
>
> PR target/114576
> * config/i386/i386.md (isa): Remove aes, add aes_avx, vaes_avx512vl.
> (enabled): Remove aes isa check, add aes_avx and vaes_avx512vl.
> * config/i386/sse.md (aesenc, aesenclast, aesdec, aesdeclast): Add
> 4th alternative, emit {evex} prefix for the third one, use
> noavx,aes_avx,vaes_avx512vl,vaes_avx512vl isa attribute, use jm
> rather than m constraint on the 2nd and 3rd alternative input.
> (vaesdec_, vaesdeclast_, vaesenc_,
> vaesenclast_): Add second alternative with x instead of v
> and jm instead of m.
>
> * gcc.target/i386/aes-pr114576.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2024-03-18 22:15:43.165839479 +0100
> +++ gcc/config/i386/i386.md 2024-04-04 00:48:46.575511556 +0200
> @@ -568,13 +568,14 @@ 

Re: [PATCH v2] x86: Define __APX_INLINE_ASM_USE_GPR32__

2024-04-08 Thread Hongtao Liu
On Tue, Apr 9, 2024 at 9:58 AM H.J. Lu  wrote:
>
> Define __APX_INLINE_ASM_USE_GPR32__ for -mapx-inline-asm-use-gpr32.
> When __APX_INLINE_ASM_USE_GPR32__ is defined, inline asm statements
> should contain only instructions compatible with r16-r31.
Ok.
>
> gcc/
>
> PR target/114587
> * config/i386/i386-c.cc (ix86_target_macros_internal): Define
> __APX_INLINE_ASM_USE_GPR32__ for -mapx-inline-asm-use-gpr32.
>
> gcc/testsuite/
>
> PR target/114587
> * gcc.target/i386/apx-3.c: Likewise.
> ---
>  gcc/config/i386/i386-c.cc | 2 ++
>  gcc/testsuite/gcc.target/i386/apx-3.c | 6 ++
>  2 files changed, 8 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-3.c
>
> diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
> index 226d277676c..07f4936ba91 100644
> --- a/gcc/config/i386/i386-c.cc
> +++ b/gcc/config/i386/i386-c.cc
> @@ -751,6 +751,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
>  def_or_undef (parse_in, "__AVX10_1_512__");
>if (isa_flag2 & OPTION_MASK_ISA2_APX_F)
>  def_or_undef (parse_in, "__APX_F__");
> +  if (ix86_apx_inline_asm_use_gpr32)
> +def_or_undef (parse_in, "__APX_INLINE_ASM_USE_GPR32__");
>if (TARGET_IAMCU)
>  {
>def_or_undef (parse_in, "__iamcu");
> diff --git a/gcc/testsuite/gcc.target/i386/apx-3.c 
> b/gcc/testsuite/gcc.target/i386/apx-3.c
> new file mode 100644
> index 000..1ba4ac036fc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-3.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mapx-inline-asm-use-gpr32" } */
> +
> +#ifndef __APX_INLINE_ASM_USE_GPR32__
> +# error __APX_INLINE_ASM_USE_GPR32__ not defined
> +#endif
> --
> 2.44.0
>


-- 
BR,
Hongtao


Re: [PATCH] x86: Define macros for APX options

2024-04-08 Thread Hongtao Liu
On Mon, Apr 8, 2024 at 11:44 PM H.J. Lu  wrote:
>
> Define following macros for APX options:
>
> 1. __APX_EGPR__: -mapx-features=egpr.
> 2. __APX_PUSH2POP2__: -mapx-features=push2pop2.
> 3. __APX_NDD__: -mapx-features=ndd.
> 4. __APX_PPX__: -mapx-features=ppx.
For -mapx-features=, we haven't decided to expose this option to users
yet; we want users to just use -mapxf, so I think __APX_F__ should be
enough?
> 5. __APX_INLINE_ASM_USE_GPR32__: -mapx-inline-asm-use-gpr32.
I'm ok for this one.
>
> They can be used to make assembly codes compatible with APX options.
> Some use cases are:
>
> 1. When __APX_PUSH2POP2__ is defined, assembly codes should always align
> the outgoing stack to 16 bytes.
> 2. When __APX_INLINE_ASM_USE_GPR32__ is defined, inline asm statements
> should contain only instructions compatible with r16-r31.
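A minimal usage sketch for case 2 (illustrative only, not part of the patch):
a source file whose inline asm has not been audited for the extended GPRs can
refuse to build when the compiler may hand r16-r31 to its "r" constraints.

  #ifdef __APX_INLINE_ASM_USE_GPR32__
  # error "inline asm in this file has not been audited for r16-r31"
  #endif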
>
> gcc/
>
> PR target/114587
> * config/i386/i386-c.cc (ix86_target_macros_internal): Define
> __APX_XXX__ for APX options.
>
> gcc/testsuite/
>
> PR target/114587
> * gcc.target/i386/apx-3a.c: New test.
> * gcc.target/i386/apx-3b.c: Likewise.
> * gcc.target/i386/apx-3c.c: Likewise.
> * gcc.target/i386/apx-3d.c: Likewise.
> * gcc.target/i386/apx-3e.c: Likewise.
> * gcc.target/i386/apx-4.c: Likewise.
> ---
>  gcc/config/i386/i386-c.cc  | 10 ++
>  gcc/testsuite/gcc.target/i386/apx-3a.c |  6 ++
>  gcc/testsuite/gcc.target/i386/apx-3b.c |  6 ++
>  gcc/testsuite/gcc.target/i386/apx-3c.c |  6 ++
>  gcc/testsuite/gcc.target/i386/apx-3d.c |  6 ++
>  gcc/testsuite/gcc.target/i386/apx-3e.c | 18 ++
>  gcc/testsuite/gcc.target/i386/apx-4.c  |  6 ++
>  7 files changed, 58 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-3c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-3d.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-3e.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-4.c
>
> diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
> index 226d277676c..b8cfba90fdc 100644
> --- a/gcc/config/i386/i386-c.cc
> +++ b/gcc/config/i386/i386-c.cc
> @@ -751,6 +751,16 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
>  def_or_undef (parse_in, "__AVX10_1_512__");
>if (isa_flag2 & OPTION_MASK_ISA2_APX_F)
>  def_or_undef (parse_in, "__APX_F__");
> +  if (TARGET_APX_EGPR)
> +def_or_undef (parse_in, "__APX_EGPR__");
> +  if (TARGET_APX_PUSH2POP2)
> +def_or_undef (parse_in, "__APX_PUSH2POP2__");
> +  if (TARGET_APX_NDD)
> +def_or_undef (parse_in, "__APX_NDD__");
> +  if (TARGET_APX_PPX)
> +def_or_undef (parse_in, "__APX_PPX__");
> +  if (ix86_apx_inline_asm_use_gpr32)
> +def_or_undef (parse_in, "__APX_INLINE_ASM_USE_GPR32__");
>if (TARGET_IAMCU)
>  {
>def_or_undef (parse_in, "__iamcu");
> diff --git a/gcc/testsuite/gcc.target/i386/apx-3a.c 
> b/gcc/testsuite/gcc.target/i386/apx-3a.c
> new file mode 100644
> index 000..86d3ef2061d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-3a.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mapx-features=egpr" } */
> +
> +#ifndef __APX_EGPR__
> +# error __APX_EGPR__ not defined
> +#endif
> diff --git a/gcc/testsuite/gcc.target/i386/apx-3b.c 
> b/gcc/testsuite/gcc.target/i386/apx-3b.c
> new file mode 100644
> index 000..611727a389a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-3b.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mapx-features=push2pop2" } */
> +
> +#ifndef __APX_PUSH2POP2__
> +# error __APX_PUSH2POP2__ not defined
> +#endif
> diff --git a/gcc/testsuite/gcc.target/i386/apx-3c.c 
> b/gcc/testsuite/gcc.target/i386/apx-3c.c
> new file mode 100644
> index 000..52655b6cfa5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-3c.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mapx-features=ndd" } */
> +
> +#ifndef __APX_NDD__
> +# error __APX_NDD__ not defined
> +#endif
> diff --git a/gcc/testsuite/gcc.target/i386/apx-3d.c 
> b/gcc/testsuite/gcc.target/i386/apx-3d.c
> new file mode 100644
> index 000..9b91af1d377
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-3d.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mapx-features=ppx" } */
> +
> +#ifndef __APX_PPX__
> +# error __APX_PPX__ not defined
> +#endif
> diff --git a/gcc/testsuite/gcc.target/i386/apx-3e.c 
> b/gcc/testsuite/gcc.target/i386/apx-3e.c
> new file mode 100644
> index 000..7278428e5c4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-3e.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mapx-features=egpr,push2pop2,ndd,ppx" } */
> +
> +#ifndef __APX_EGPR__

Re: [PATCH] sanitizer: [PR110027] Align asan_vec[0] to MAX (alignb, ASAN_RED_ZONE_SIZE)

2024-03-25 Thread Hongtao Liu
On Tue, Mar 26, 2024 at 11:26 AM Hongtao Liu  wrote:
>
> On Mon, Mar 25, 2024 at 8:51 PM Jakub Jelinek  wrote:
> >
> > On Tue, Mar 12, 2024 at 07:57:59PM +0800, liuhongt wrote:
> > > If alignb > ASAN_RED_ZONE_SIZE and offset[0] is not a multiple of
> > > alignb, (base_align_bias - base_offset) may not be aligned to alignb,
> > > which caused a segment fault.
> > >
> > > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > Ok for trunk and backport to GCC13?
> > >
> > > gcc/ChangeLog:
> > >
> > >   PR sanitizer/110027
> > >   * cfgexpand.cc (expand_stack_vars): Align frame offset to
> > >   MAX (alignb, ASAN_RED_ZONE_SIZE).
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >   * g++.dg/asan/pr110027.C: New test.
> > > ---
> > >  gcc/cfgexpand.cc |  2 +-
> > >  gcc/testsuite/g++.dg/asan/pr110027.C | 20 
> > >  2 files changed, 21 insertions(+), 1 deletion(-)
> > >  create mode 100644 gcc/testsuite/g++.dg/asan/pr110027.C
> > >
> > > diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> > > index 0de299c62e3..92062378d8e 100644
> > > --- a/gcc/cfgexpand.cc
> > > +++ b/gcc/cfgexpand.cc
> > > @@ -1214,7 +1214,7 @@ expand_stack_vars (bool (*pred) (size_t), class 
> > > stack_vars_data *data)
> > >   {
> > > if (data->asan_vec.is_empty ())
> > >   {
> > > -   align_frame_offset (ASAN_RED_ZONE_SIZE);
> > > +   align_frame_offset (MAX (alignb, ASAN_RED_ZONE_SIZE));
> > > prev_offset = frame_offset.to_constant ();
> > >   }
> > > prev_offset = align_base (prev_offset,
> >
> > This doesn't look correct to me.
> > The above is done just once for the first var partition.  And
> > var partitions are sorted by stack_var_cmp, which puts > 
> > MAX_SUPPORTED_STACK_ALIGNMENT
> > alignment vars first (that should be none on x86, the above is quite huge
> > alignment), then on size decreasing and only after that on alignment
> > decreasing.
> >
> > So, try to add some other variable with larger size and smaller alignment
> > to the frame (and make sure it isn't optimized away).
> >
> > alignb above is the alignment of the first partition's var, if
> > align_frame_offset really needs to depend on the var alignment, it probably
> > should be the maximum alignment of all the vars with alignment
> > alignb * BITS_PER_UNIT <= MAX_SUPPORTED_STACK_ALIGNMENT
>
> In asan_emit_stack_protection, when it allocates the fake stack, it assumes
> the bottom of the stack is also aligned to alignb.  And the place that
> violates this is the first var partition, which only gets a 32-byte offset
> alignment; it should be MAX_SUPPORTED_STACK_ALIGNMENT / BITS_PER_UNIT.
> So I think we need to use MAX (MAX_SUPPORTED_STACK_ALIGNMENT /
> BITS_PER_UNIT, ASAN_RED_ZONE_SIZE) for the first var partition.
It should be MAX (BIGGEST_ALIGNMENT / BITS_PER_UNIT, ASAN_RED_ZONE_SIZE).
MAX_SUPPORTED_STACK_ALIGNMENT is huge.
>
> >
> > > diff --git a/gcc/testsuite/g++.dg/asan/pr110027.C 
> > > b/gcc/testsuite/g++.dg/asan/pr110027.C
> > > new file mode 100644
> > > index 000..0067781bc89
> > > --- /dev/null
> > > +++ b/gcc/testsuite/g++.dg/asan/pr110027.C
> > > @@ -0,0 +1,20 @@
> > > +/* PR sanitizer/110027 */
> > > +/* { dg-do run } */
> > > +/* { dg-require-effective-target avx512f_runtime } */
> > > +/* { dg-options "-std=gnu++23 -mavx512f -fsanitize=address -O0 -g 
> > > -fstack-protector-strong" } */
> > > +
> > > +#include 
> > > +#include 
> > > +
> > > +template 
> > > +using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
> > > +
> > > +auto foo() {
> > > +  Vec<8, int64_t> ret{};
> > > +  return ret;
> > > +}
> > > +
> > > +int main() {
> > > +  foo();
> > > +  return 0;
> > > +}
> > > --
> > > 2.31.1
> >
> > Jakub
> >
>
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao


Re: [PATCH] sanitizer: [PR110027] Align asan_vec[0] to MAX (alignb, ASAN_RED_ZONE_SIZE)

2024-03-25 Thread Hongtao Liu
On Mon, Mar 25, 2024 at 8:51 PM Jakub Jelinek  wrote:
>
> On Tue, Mar 12, 2024 at 07:57:59PM +0800, liuhongt wrote:
> > If alignb > ASAN_RED_ZONE_SIZE and offset[0] is not a multiple of
> > alignb, (base_align_bias - base_offset) may not be aligned to alignb,
> > which caused a segment fault.
> >
> > Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > Ok for trunk and backport to GCC13?
> >
> > gcc/ChangeLog:
> >
> >   PR sanitizer/110027
> >   * cfgexpand.cc (expand_stack_vars): Align frame offset to
> >   MAX (alignb, ASAN_RED_ZONE_SIZE).
> >
> > gcc/testsuite/ChangeLog:
> >
> >   * g++.dg/asan/pr110027.C: New test.
> > ---
> >  gcc/cfgexpand.cc |  2 +-
> >  gcc/testsuite/g++.dg/asan/pr110027.C | 20 
> >  2 files changed, 21 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/g++.dg/asan/pr110027.C
> >
> > diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> > index 0de299c62e3..92062378d8e 100644
> > --- a/gcc/cfgexpand.cc
> > +++ b/gcc/cfgexpand.cc
> > @@ -1214,7 +1214,7 @@ expand_stack_vars (bool (*pred) (size_t), class 
> > stack_vars_data *data)
> >   {
> > if (data->asan_vec.is_empty ())
> >   {
> > -   align_frame_offset (ASAN_RED_ZONE_SIZE);
> > +   align_frame_offset (MAX (alignb, ASAN_RED_ZONE_SIZE));
> > prev_offset = frame_offset.to_constant ();
> >   }
> > prev_offset = align_base (prev_offset,
>
> This doesn't look correct to me.
> The above is done just once for the first var partition.  And
> var partitions are sorted by stack_var_cmp, which puts > 
> MAX_SUPPORTED_STACK_ALIGNMENT
> alignment vars first (that should be none on x86, the above is quite huge
> alignment), then on size decreasing and only after that on alignment
> decreasing.
>
> So, try to add some other variable with larger size and smaller alignment
> to the frame (and make sure it isn't optimized away).
>
> alignb above is the alignment of the first partition's var, if
> align_frame_offset really needs to depend on the var alignment, it probably
> should be the maximum alignment of all the vars with alignment
> alignb * BITS_PER_UNIT <= MAX_SUPPORTED_STACK_ALIGNMENT

In asan_emit_stack_protection, when it allocates the fake stack, it assumes
the bottom of the stack is also aligned to alignb.  And the place that
violates this is the first var partition, which only gets a 32-byte offset
alignment; it should be MAX_SUPPORTED_STACK_ALIGNMENT / BITS_PER_UNIT.
So I think we need to use MAX (MAX_SUPPORTED_STACK_ALIGNMENT /
BITS_PER_UNIT, ASAN_RED_ZONE_SIZE) for the first var partition.

>
> > diff --git a/gcc/testsuite/g++.dg/asan/pr110027.C 
> > b/gcc/testsuite/g++.dg/asan/pr110027.C
> > new file mode 100644
> > index 000..0067781bc89
> > --- /dev/null
> > +++ b/gcc/testsuite/g++.dg/asan/pr110027.C
> > @@ -0,0 +1,20 @@
> > +/* PR sanitizer/110027 */
> > +/* { dg-do run } */
> > +/* { dg-require-effective-target avx512f_runtime } */
> > +/* { dg-options "-std=gnu++23 -mavx512f -fsanitize=address -O0 -g 
> > -fstack-protector-strong" } */
> > +
> > +#include 
> > +#include 
> > +
> > +template 
> > +using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
> > +
> > +auto foo() {
> > +  Vec<8, int64_t> ret{};
> > +  return ret;
> > +}
> > +
> > +int main() {
> > +  foo();
> > +  return 0;
> > +}
> > --
> > 2.31.1
>
> Jakub
>


-- 
BR,
Hongtao


Re: [PATCH] Document -fexcess-precision=16.

2024-03-18 Thread Hongtao Liu
On Tue, Mar 19, 2024 at 12:16 AM Joseph Myers  wrote:
>
> On Mon, 18 Mar 2024, liuhongt wrote:
>
> > +If @option{-fexcess-precision=16} is specified, casts and assignments of
> > +@code{_Float16} and @code{bfloat16_t} cause value to be rounded to their
> > +semantic types if they're supported by the target.
>
> Isn't that option about rounding results of all operations, whether or not
> a cast or assignment is involved?  That's certainly what the brief mention
> of this option in extend.texi says, and fits the intent that
> -fexcess-precision=16 corresponds to FLT_EVAL_METHOD == 16.
Yes, how about this:


+If @option{-fexcess-precision=16} is specified, each operation of
+@code{_Float16} and @code{bfloat16_t} causes value to be rounded to their
+semantic types if they're supported by the target.
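An illustrative example of the intended semantics (assuming a target where
_Float16 is supported):

  _Float16
  mul_add (_Float16 a, _Float16 b, _Float16 c)
  {
    /* With -fexcess-precision=16 the product a * b is rounded to _Float16
       before the addition; with a wider evaluation method the intermediate
       could be kept in a wider type and only rounded at the end.  */
    return a * b + c;
  }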

>
> --
> Joseph S. Myers
> josmy...@redhat.com
>


-- 
BR,
Hongtao


Re: [PATCH] i386 [stv]: Handle REG_EH_REGION note [pr111822].

2024-03-18 Thread Hongtao Liu
On Mon, Mar 18, 2024 at 6:59 PM Uros Bizjak  wrote:
>
> On Mon, Mar 18, 2024 at 11:52 AM liuhongt  wrote:
> >
> > Commit r14-9459-g618e34d56cc38e only handles
> > general_scalar_chain::convert_op. The patch also handles
> > timode_scalar_chain::convert_op to avoid potential similar bug.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk and backport to releases/gcc-13 branch?
>
> I have the following patch in testing that merges
> {general,timode}_scalar_chain::convert_op, so in addition to less code
> duplication, it will fix the issue for both chains. WDYT?
That would be better for maintenance; I prefer your patch.
>
> Uros.
>
> >
> > gcc/ChangeLog:
> >
> > PR target/111822
> > * config/i386/i386-features.cc
> > (timode_scalar_chain::convert_op): Handle REG_EH_REGION note.
> > ---
> >  gcc/config/i386/i386-features.cc | 20 +---
> >  1 file changed, 17 insertions(+), 3 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386-features.cc 
> > b/gcc/config/i386/i386-features.cc
> > index c7d7a965901..38f57d96df5 100644
> > --- a/gcc/config/i386/i386-features.cc
> > +++ b/gcc/config/i386/i386-features.cc
> > @@ -1794,12 +1794,26 @@ timode_scalar_chain::convert_op (rtx *op, rtx_insn 
> > *insn)
> >  *op = gen_rtx_SUBREG (V1TImode, *op, 0);
> >else if (MEM_P (*op))
> >  {
> > +  rtx_insn* eh_insn;
> >rtx tmp = gen_reg_rtx (V1TImode);
> > -  emit_insn_before (gen_rtx_SET (tmp,
> > -gen_gpr_to_xmm_move_src (V1TImode, 
> > *op)),
> > -   insn);
> > +  eh_insn
> > +   = emit_insn_before (gen_rtx_SET (tmp,
> > +gen_gpr_to_xmm_move_src (V1TImode,
> > + *op)),
> > +   insn);
> >*op = tmp;
> >
> > +  if (cfun->can_throw_non_call_exceptions)
> > +   {
> > + /* Handle REG_EH_REGION note.  */
> > + rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
> > + if (note)
> > +   {
> > + control_flow_insns.safe_push (eh_insn);
> > + add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
> > +   }
> > +   }
> > +
> >if (dump_file)
> > fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
> >  INSN_UID (insn), REGNO (tmp));
> > --
> > 2.31.1
> >



-- 
BR,
Hongtao


Re: [PATCH] vect: Use xor to invert oversized vector masks

2024-03-14 Thread Hongtao Liu
On Thu, Mar 14, 2024 at 11:42 PM Andrew Stubbs  wrote:
>
> Don't enable excess lanes when inverting vector bit-masks smaller than the
> integer mode.  This is yet another case of wrong-code due to mishandling
> of oversized bitmasks.
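A small illustration of the failure mode (illustrative values, assuming a
4-lane mask kept in a wider scalar integer mode):

  unsigned mask = 0x5;         /* lanes 0 and 2 set                   */
  unsigned bad  = ~mask;       /* 0xfffffffa: excess bits switched on */
  unsigned good = mask ^ 0xf;  /* 0x0000000a: only the 4 live lanes   */

The xor against (1 << nunits) - 1 inverts just the live lanes.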
>
> This issue shows up in vect/tsvc/vect-tsvc-s278.c and
> vect/tsvc/vect-tsvc-s279.c if I set the preferred vector size to V32
> (down from V64) on amdgcn.
>
> OK for mainline?
>
> Andrew
>
> gcc/ChangeLog:
>
> * expr.cc (expand_expr_real_2): Use xor to invert vector masks.
> ---
>  gcc/expr.cc | 11 +++
>  1 file changed, 11 insertions(+)
>
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index 403eeaa108e4..3540327d879e 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -10497,6 +10497,17 @@ expand_expr_real_2 (sepops ops, rtx target, 
> machine_mode tmode,
>immed_wide_int_const (mask, int_mode),
>target, 1, OPTAB_LIB_WIDEN);
> }
> +  /* If it's a vector mask don't enable excess bits.  */
> +  else if (VECTOR_BOOLEAN_TYPE_P (type)
> +  && SCALAR_INT_MODE_P (mode)
> +  && maybe_ne (GET_MODE_PRECISION (mode),
> +   TYPE_VECTOR_SUBPARTS (type).to_constant ()))
> +   {
> + auto nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> + temp = expand_binop (mode, xor_optab, op0,
> +  GEN_INT ((HOST_WIDE_INT_1U << nunits) - 1),
> +  target, true, OPTAB_WIDEN);
> +   }
Not a review, just curious: should the issue be fixed by the commit in PR113576?
Also, I wonder whether, besides cbranch, excess lane bits also matter.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113576#c35
>else
> temp = expand_unop (mode, one_cmpl_optab, op0, target, 1);
>gcc_assert (temp);
> --
> 2.41.0
>


-- 
BR,
Hongtao


Re: [PATCH] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread Hongtao Liu
On Thu, Mar 14, 2024 at 10:46 PM Uros Bizjak  wrote:
>
> On Thu, Mar 14, 2024 at 8:42 AM Uros Bizjak  wrote:
> >
> > On Thu, Mar 14, 2024 at 8:32 AM Hongtao Liu  wrote:
> > >
> > > On Thu, Mar 14, 2024 at 3:22 PM Uros Bizjak  wrote:
> > > >
> > > > On Thu, Mar 14, 2024 at 2:33 AM liuhongt  wrote:
> > > > >
> > > > > When we split
> > > > > (insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
> > > > > (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 
> > > > > MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 
> > > > > A32])) "test.C":22:42 84 {*movdi_internal}
> > > > >  (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
> > > > >
> > > > > into
> > > > >
> > > > > (insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
> > > > > (vec_concat:V2DI (mem:DI (reg/f:SI 98 [ 
> > > > > CallNative_nclosure.0_1 ]) [6 MEM[(struct SQRefCounted 
> > > > > *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
> > > > > (const_int 0 [0]))) "test.C":22:42 -1
> > > > > (nil)))
> > > > > (insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
> > > > > (subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 
> > > > > {movv2di_internal}
> > > > >  (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
> > > > > (nil)))
> > > > >
> > > > > we must copy the REG_EH_REGION note to the first insn and split the 
> > > > > block
> > > > > after the newly added insn.  The REG_EH_REGION on the second insn 
> > > > > will be
> > > > > removed later since it no longer traps.
> > > > >
> > > > > Currently we only handle memory_operand, are there any other insns
> > > > > need to be handled???
> > > >
> > > > I think memory access is the only thing that can trap.
> > > >
> > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} for trunk 
> > > > > and gcc-13/gcc-12 release branch.
> > > > > Ok for trunk and backport?
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > * config/i386/i386-features.cc
> > > > > (general_scalar_chain::convert_op): Handle REG_EH_REGION note.
> > > > > (convert_scalars_to_vector): Ditto.
> > > > > * config/i386/i386-features.h (class scalar_chain): New
> > > > > memeber control_flow_insns.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > * g++.target/i386/pr111822.C: New test.
> > > > > ---
> > > > >  gcc/config/i386/i386-features.cc | 48 
> > > > > ++--
> > > > >  gcc/config/i386/i386-features.h  |  1 +
> > > > >  gcc/testsuite/g++.target/i386/pr111822.C | 45 ++
> > > > >  3 files changed, 90 insertions(+), 4 deletions(-)
> > > > >  create mode 100644 gcc/testsuite/g++.target/i386/pr111822.C
> > > > >
> > > > > diff --git a/gcc/config/i386/i386-features.cc 
> > > > > b/gcc/config/i386/i386-features.cc
> > > > > index 1de2a07ed75..2ed27a9ebdd 100644
> > > > > --- a/gcc/config/i386/i386-features.cc
> > > > > +++ b/gcc/config/i386/i386-features.cc
> > > > > @@ -998,20 +998,36 @@ general_scalar_chain::convert_op (rtx *op, 
> > > > > rtx_insn *insn)
> > > > >  }
> > > > >else if (MEM_P (*op))
> > > > >  {
> > > > > +  rtx_insn* eh_insn, *movabs = NULL;
> > > > >rtx tmp = gen_reg_rtx (GET_MODE (*op));
> > > > >
> > > > >/* Handle movabs.  */
> > > > >if (!memory_operand (*op, GET_MODE (*op)))
> > > > > {
> > > > >   rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
> > > > > + movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
> > > > >
> > > > > - emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
> > > > >   *op = tmp2;
> > > > > }
> > > >
> > > > I may be missing something, but isn't the above a dead code? We have
> > > > if (MEM_p(*op)) and then if (!memory_operand (*op, ...)).
> > > It's PR91814 #c1; memory_operand also checks for invalid memory addresses.
> >
> > Oh, it is even my comment ;)
> >
> > Perhaps the comment should be improved to something like:
> >
> > "Emit MOVABS to load from a 64-bit absolute address to a GPR."
> >
> > LGTM then.
>
> BTW: Do we need to also fix timode_scalar_chain::convert_op ? There we
> also preload operand, so a similar fix should be applied there.
Yes, I'll make another patch. Didn't realize there are 2 of them.
>
> Uros.



-- 
BR,
Hongtao


Re: [PATCH] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread Hongtao Liu
On Thu, Mar 14, 2024 at 3:22 PM Uros Bizjak  wrote:
>
> On Thu, Mar 14, 2024 at 2:33 AM liuhongt  wrote:
> >
> > When we split
> > (insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
> > (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
> > SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 
> > 84 {*movdi_internal}
> >  (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
> >
> > into
> >
> > (insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
> > (vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) 
> > [6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
> > (const_int 0 [0]))) "test.C":22:42 -1
> > (nil)))
> > (insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
> > (subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
> >  (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
> > (nil)))
> >
> > we must copy the REG_EH_REGION note to the first insn and split the block
> > after the newly added insn.  The REG_EH_REGION on the second insn will be
> > removed later since it no longer traps.
> >
> > Currently we only handle memory_operand, are there any other insns
> > need to be handled???
>
> I think memory access is the only thing that can trap.
>
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} for trunk and 
> > gcc-13/gcc-12 release branch.
> > Ok for trunk and backport?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386-features.cc
> > (general_scalar_chain::convert_op): Handle REG_EH_REGION note.
> > (convert_scalars_to_vector): Ditto.
> > * config/i386/i386-features.h (class scalar_chain): New
> > memeber control_flow_insns.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * g++.target/i386/pr111822.C: New test.
> > ---
> >  gcc/config/i386/i386-features.cc | 48 ++--
> >  gcc/config/i386/i386-features.h  |  1 +
> >  gcc/testsuite/g++.target/i386/pr111822.C | 45 ++
> >  3 files changed, 90 insertions(+), 4 deletions(-)
> >  create mode 100644 gcc/testsuite/g++.target/i386/pr111822.C
> >
> > diff --git a/gcc/config/i386/i386-features.cc 
> > b/gcc/config/i386/i386-features.cc
> > index 1de2a07ed75..2ed27a9ebdd 100644
> > --- a/gcc/config/i386/i386-features.cc
> > +++ b/gcc/config/i386/i386-features.cc
> > @@ -998,20 +998,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn 
> > *insn)
> >  }
> >else if (MEM_P (*op))
> >  {
> > +  rtx_insn* eh_insn, *movabs = NULL;
> >rtx tmp = gen_reg_rtx (GET_MODE (*op));
> >
> >/* Handle movabs.  */
> >if (!memory_operand (*op, GET_MODE (*op)))
> > {
> >   rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
> > + movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
> >
> > - emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
> >   *op = tmp2;
> > }
>
> I may be missing something, but isn't the above a dead code? We have
> if (MEM_p(*op)) and then if (!memory_operand (*op, ...)).
It's PR91814 #c1, memory_operand will also check invalid memory addresses.
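As an illustration (my own hypothetical example, not taken from the PR): the
"invalid memory address" case here is typically a MEM whose address is a full
64-bit constant, which is not a valid x86-64 addressing mode and therefore
fails memory_operand even though MEM_P is true:

/* Hypothetical example of such an access: the absolute address below does
   not fit a signed 32-bit displacement, so the MEM is not a memory_operand
   and the chain conversion first copies it through a GPR -- the movabs
   path guarded by the !memory_operand check above.  */
#define MMIO_REG ((volatile long long *) 0x123456789abcdef0ULL)

long long
read_mmio (void)
{
  return *MMIO_REG;
}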
>
> Uros.
>
> >
> > -  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
> > -gen_gpr_to_xmm_move_src (vmode, *op)),
> > -   insn);
> > +  eh_insn
> > +   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
> > +gen_gpr_to_xmm_move_src (vmode, 
> > *op)),
> > +   insn);
> > +
> > +  if (cfun->can_throw_non_call_exceptions)
> > +   {
> > + /* Handle REG_EH_REGION note.  */
> > + rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
> > + if (note)
> > +   {
> > + if (movabs)
> > +   eh_insn = movabs;
> > + control_flow_insns.safe_push (eh_insn);
> > + add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
> > +   }
> > +   }
> > +
> >*op = gen_rtx_SUBREG (vmode, tmp, 0);
> >
> >if (dump_file)
> > @@ -2494,6 +2510,7 @@ convert_scalars_to_vector (bool timode_p)
> >  {
> >basic_block bb;
> >int converted_insns = 0;
> > +  auto_vec control_flow_insns;
> >
> >bitmap_obstack_initialize (NULL);
> >const machine_mode cand_mode[3] = { SImode, DImode, TImode };
> > @@ -2575,6 +2592,11 @@ convert_scalars_to_vector (bool timode_p)
> >  chain->chain_id);
> > }
> >
> > + rtx_insn* iter_insn;
> > + unsigned int ii;
> > + FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
> > +   control_flow_insns.safe_push (iter_insn);
> > +
> >   delete chain;
> > }
> >  }
> > @@ -2643,6 +2665,24 @@ convert_scalars_to_vector (bool timode_p)
> >   DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG 

Re: [PATCH] sanitizer: [PR110027] Align asan_vec[0] to MAX (alignb, ASAN_RED_ZONE_SIZE)

2024-03-12 Thread Hongtao Liu
On Tue, Mar 12, 2024 at 8:00 PM liuhongt  wrote:
>
> If alignb > ASAN_RED_ZONE_SIZE and offset[0] is not a multiple of
> alignb, (base_align_bias - base_offset) may not be aligned to alignb,
> which caused the reported segmentation fault.
>
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> Ok for trunk and backport to GCC13?
CC'ing Jakub; I see the code was added by
https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512313.html
The issue in the PR is similar, but __m512 requires a bigger alignment
(64 > ASAN_RED_ZONE_SIZE (32)); in that case we need to align to
MAX (alignb, ASAN_RED_ZONE_SIZE) instead of ASAN_RED_ZONE_SIZE.
The assumption is that when alignb > ASAN_RED_ZONE_SIZE, alignb must be
a multiple of ASAN_RED_ZONE_SIZE.
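As a worked illustration (my own made-up offsets, not the PR's actual frame
layout), aligning the frame offset only to ASAN_RED_ZONE_SIZE can leave a
64-byte variable's base misaligned, while MAX (alignb, ASAN_RED_ZONE_SIZE)
cannot:

/* Stand-alone sketch of the alignment arithmetic; numbers are illustrative.  */
#include <stdio.h>

int
main (void)
{
  const unsigned red_zone = 32;	/* ASAN_RED_ZONE_SIZE */
  const unsigned alignb = 64;	/* alignment of a __m512 local */
  unsigned offset = 96;		/* 32-byte aligned, but not 64-byte aligned */

  unsigned old_base = (offset + red_zone - 1) & -red_zone;	/* 96 */
  unsigned max_align = alignb > red_zone ? alignb : red_zone;
  unsigned new_base = (offset + max_align - 1) & -max_align;	/* 128 */

  printf ("red-zone-only alignment: %u (mod 64 = %u)\n", old_base, old_base % alignb);
  printf ("MAX-based alignment:     %u (mod 64 = %u)\n", new_base, new_base % alignb);
  return 0;
}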
>
> gcc/ChangeLog:
>
> PR sanitizer/110027
> * cfgexpand.cc (expand_stack_vars): Align frame offset to
> MAX (alignb, ASAN_RED_ZONE_SIZE).
>
> gcc/testsuite/ChangeLog:
>
> * g++.dg/asan/pr110027.C: New test.
> ---
>  gcc/cfgexpand.cc |  2 +-
>  gcc/testsuite/g++.dg/asan/pr110027.C | 20 
>  2 files changed, 21 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/g++.dg/asan/pr110027.C
>
> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> index 0de299c62e3..92062378d8e 100644
> --- a/gcc/cfgexpand.cc
> +++ b/gcc/cfgexpand.cc
> @@ -1214,7 +1214,7 @@ expand_stack_vars (bool (*pred) (size_t), class 
> stack_vars_data *data)
> {
>   if (data->asan_vec.is_empty ())
> {
> - align_frame_offset (ASAN_RED_ZONE_SIZE);
> + align_frame_offset (MAX (alignb, ASAN_RED_ZONE_SIZE));
>   prev_offset = frame_offset.to_constant ();
> }
>   prev_offset = align_base (prev_offset,
> diff --git a/gcc/testsuite/g++.dg/asan/pr110027.C 
> b/gcc/testsuite/g++.dg/asan/pr110027.C
> new file mode 100644
> index 000..0067781bc89
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/asan/pr110027.C
> @@ -0,0 +1,20 @@
> +/* PR sanitizer/110027 */
> +/* { dg-do run } */
> +/* { dg-require-effective-target avx512f_runtime } */
> +/* { dg-options "-std=gnu++23 -mavx512f -fsanitize=address -O0 -g 
> -fstack-protector-strong" } */
> +
> +#include 
> +#include 
> +
> +template 
> +using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
> +
> +auto foo() {
> +  Vec<8, int64_t> ret{};
> +  return ret;
> +}
> +
> +int main() {
> +  foo();
> +  return 0;
> +}
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] i386: Guard noreturn no-callee-saved-registers optimization with -mnoreturn-no-callee-saved-registers [PR38534]

2024-03-04 Thread Hongtao Liu
On Thu, Feb 29, 2024 at 2:20 PM Hongtao Liu  wrote:
>
> On Wed, Feb 28, 2024 at 4:54 PM Jakub Jelinek  wrote:
> >
> > Hi!
> >
> > Adding Hongtao and Honza into the loop as the ones who acked the original
> > patch.
> >
> > The no_callee_saved_registers by default for noreturn functions change can
> > break in-process backtrace(3) or backtraces from debugger or other process
> > (quite often, any time the noreturn function decides to use the bp register
> > and any of the parent frames uses a frame pointer; the unwinder just crashes
> > in the libgcc unwinder case, gdb prints stack corrupted message), so I'd
> > like to save bp register in that case:
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646591.html
> I think this patch makes sense and LGTM, we save and restore frame
> pointer for noreturn.
> >
> > and additionally the no_callee_saved_registers by default for noreturn
> > functions change can make debugging harder, again not localized to the
> > noreturn function, but any of its callers.  So, if say glibc abort function
> > implementation needs a lot of normally callee-saved registers, no matter how
> > users recompile their apps, they will see garbage or optimized out
> > vars/parameters in their code unless they rebuild their glibc with -O0.
> > So, I think we should guard that by a non-default option:
From what has been discussed so far, I am inclined to this proposal.
If there are no additional objections (or concerns) in a few days, OK
for the trunk.
> >
>
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao


Re: [PATCH] i386: Guard noreturn no-callee-saved-registers optimization with -mnoreturn-no-callee-saved-registers [PR38534]

2024-02-28 Thread Hongtao Liu
On Wed, Feb 28, 2024 at 4:54 PM Jakub Jelinek  wrote:
>
> Hi!
>
> Adding Hongtao and Honza into the loop as the ones who acked the original
> patch.
>
> The no_callee_saved_registers by default for noreturn functions change can
> break in-process backtrace(3) or backtraces from debugger or other process
> (quite often, any time the noreturn function decides to use the bp register
> and any of the parent frames uses a frame pointer; the unwinder just crashes
> in the libgcc unwinder case, gdb prints stack corrupted message), so I'd
> like to save bp register in that case:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646591.html
I think this patch makes sense and LGTM, we save and restore frame
pointer for noreturn.
>
> and additionally the no_callee_saved_registers by default for noreturn
> functions change can make debugging harder, again not localized to the
> noreturn function, but any of its callers.  So, if say glibc abort function
> implementation needs a lot of normally callee-saved registers, no matter how
> users recompile their apps, they will see garbage or optimized out
> vars/parameters in their code unless they rebuild their glibc with -O0.
> So, I think we should guard that by a non-default option:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646649.html
So it turns off the optimization for noreturn functions by default;
I'm not sure about this.
Any comments, H.J.?
>
> Plus we need to somehow make sure to emit DW_CFA_undefined for the modified
> but not saved normally callee-saved registers, so that we at least don't get
> garbage in debug info.  H.J. posted some patches for that, so far I wasn't
> happy about the implementation but the actual change is desirable.
>
> Your thoughts on this?
>
> Jakub
>


-- 
BR,
Hongtao


Re: [r14-9173 Regression] FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr" on Linux/x86_64

2024-02-26 Thread Hongtao Liu
On Tue, Feb 27, 2024 at 3:44 PM Richard Biener  wrote:
>
> On Tue, 27 Feb 2024, haochen.jiang wrote:
>
> > On Linux/x86_64,
> >
> > af66ad89e8169f44db723813662917cf4cbb78fc is the first bad commit
> > commit af66ad89e8169f44db723813662917cf4cbb78fc
> > Author: Richard Biener 
> > Date:   Fri Feb 23 16:06:05 2024 +0100
> >
> > middle-end/114070 - folding breaking VEC_COND expansion
> >
> > caused
> >
> > FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"
>
> This shows that the x86 backend is missing vcond_mask_qiqi and friends
Interesting, so both operand and mask are vector boolean.
> (for AVX512 mask modes).  Either that or both expand_vec_cond_expr_p
> and all the machinery behind it (ISEL pass, lowering) should handle
> pure integer mode VEC_COND_EXPR via bit operations.  I think quite some
> targets now implement patterns for these variants, whatever their
> boolean vector modes are.
>
> One complication with the change, which was
>
>   (simplify
>(op @3 (vec_cond:s @0 @1 @2))
> -  (vec_cond @0 (op! @3 @1) (op! @3 @2
> +  (if (TREE_CODE_CLASS (op) != tcc_comparison
> +   || types_match (type, TREE_TYPE (@1))
> +   || expand_vec_cond_expr_p (type, TREE_TYPE (@0), ERROR_MARK))
> +   (vec_cond @0 (op! @3 @1) (op! @3 @2)
>
> is that expand_vec_cond_expr_p can also handle comparison defined
> masks, but whether or not we have this isn't visible here so we
> can only check whether vcond_mask expansion would work.
>
> We have optimize_vectors_before_lowering_p but we shouldn't even there
> turn supported into not supported ops and as said, what's supported or
> not cannot be finally decided (if it's only vcond and not vcond_mask
> that is supported).  Also optimize_vectors_before_lowering_p is set
> for a short time between vectorization and vector lowering and we
> definitely do not want to turn supported vectorizer emitted stmts
> into ones that we need to lower.  For GCC 15 we should see to move
> vector lowering before vectorization (before loop optimization I'd
> say) to close this particular hole (and also reliably ICE when the
> vectorizer creates unsupported IL).  We also definitely want to
> retire vcond expanders (no target I know of supports single-instruction
> compare-and-select).
>
> So short term we either live with this regression (the testcase
> verifies we perform constant folding to { 0, 0 }), implement
> the four missing patterns (qi, hi, si and di missing value mode
> vcond_mask patterns) or see to implement generic code for this.
>
> Given precedent I'd tend towards adding the x86 patterns.
>
> Hongtao, can you handle that?
Sure, I'll take a look.
>
> Thanks,
> Richard.



-- 
BR,
Hongtao
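
As an editorial aside on the thread above: for AVX-512 mask modes both the
values and the mask are bit-vectors held in a mask register, so the missing
vcond_mask-style operation reduces to plain bit operations.  A rough semantic
sketch (my own reading of the intent, not the .md pattern that would actually
be added):

/* Per-bit select on QImode mask "vectors": bit i of the result comes from
   op_true when bit i of mask is set, otherwise from op_false.  */
static inline unsigned char
vcond_mask_qi (unsigned char mask, unsigned char op_true, unsigned char op_false)
{
  return (unsigned char) ((op_true & mask) | (op_false & (unsigned char) ~mask));
}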


Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics

2024-02-26 Thread Hongtao Liu
On Mon, Feb 26, 2024 at 6:30 PM H.J. Lu  wrote:
>
> On Sun, Feb 25, 2024 at 8:25 PM H.J. Lu  wrote:
> >
> > On Sun, Feb 25, 2024 at 7:03 PM Hongtao Liu  wrote:
> > >
> > > On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu  wrote:
> > > >
> > > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu  wrote:
> > > > >
> > > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu  wrote:
> > > > > >
> > > > > > ldtilecfg and sttilecfg take a 512-byte memory block.  With
> > > > > > _tile_loadconfig implemented as
> > > > > >
> > > > > > extern __inline void
> > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > > _tile_loadconfig (const void *__config)
> > > > > > {
> > > > > >   __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void 
> > > > > > **)__config)));
> > > > > > }
> > > > > >
> > > > > > GCC sees:
> > > > > >
> > > > > > (parallel [
> > > > > >   (asm_operands/v ("ldtilecfg   %X0") ("") 0
> > > > > >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> > > > > >  (const_int -64 [0xffc0])) [1 
> > > > > > MEM[(const void * *)_data]+0 S8 A128])]
> > > > > >[(asm_input:DI ("m"))]
> > > > > >(clobber (reg:CC 17 flags))])
> > > > > >
> > > > > > and the memory operand size is 1 byte.  As the result, the rest of 
> > > > > > 511
> > > > > > bytes is ignored by GCC.  Implement ldtilecfg and sttilecfg 
> > > > > > intrinsics
> > > > > > with a pointer to BLKmode to honor the 512-byte memory block.
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR target/114098
> > > > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> > > > > > __builtin_ia32_ldtilecfg.
> > > > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> > > > > > * config/i386/i386-builtin.def (BDESC): Add
> > > > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> > > > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> > > > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> > > > > > * config/i386/i386.md (ldtilecfg): New pattern.
> > > > > > (sttilecfg): Likewise.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > PR target/114098
> > > > > > * gcc.target/i386/amxtile-4.c: New test.
> > > > > > ---
> > > > > >  gcc/config/i386/amxtileintrin.h   |  4 +-
> > > > > >  gcc/config/i386/i386-builtin.def  |  4 ++
> > > > > >  gcc/config/i386/i386-expand.cc| 19 
> > > > > >  gcc/config/i386/i386.md   | 24 ++
> > > > > >  gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 
> > > > > > +++
> > > > > >  5 files changed, 104 insertions(+), 2 deletions(-)
> > > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
> > > > > >
> > > > > > diff --git a/gcc/config/i386/amxtileintrin.h 
> > > > > > b/gcc/config/i386/amxtileintrin.h
> > > > > > index d1a26e0fea5..5081b326498 100644
> > > > > > --- a/gcc/config/i386/amxtileintrin.h
> > > > > > +++ b/gcc/config/i386/amxtileintrin.h
> > > > > > @@ -39,14 +39,14 @@ extern __inline void
> > > > > >  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > > > >  _tile_loadconfig (const void *__config)
> > > > > >  {
> > > > > > -  __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void 
> > > > > > **)__config)));
> > > > > > +  __builtin_ia32_ldtilecfg (__config);
> > > > > >  }
> > > > > >
> > > > > >  extern __inline void
> > > > > >  __attribute__((__gnu_i

Re: [PATCH v1] RTL: Bugfix ICE after allow vector type in DSE

2024-02-25 Thread Hongtao Liu
On Mon, Feb 26, 2024 at 11:42 AM Li, Pan2  wrote:
>
> > Be careful, it may regress some other backends.
>
> Thanks Hongtao, how about taking INNER_MODE here for regsize?  Currently it
> will be the whole vector register size when doing the comparison.
>
> poly_uint64 regsize = REGMODE_NATURAL_SIZE (imode);
>
> Pan
>
> -Original Message-
> From: Hongtao Liu 
> Sent: Monday, February 26, 2024 11:41 AM
> To: Li, Pan2 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com; 
> richard.guent...@gmail.com; Wang, Yanzhang ; 
> rdapp@gmail.com
> Subject: Re: [PATCH v1] RTL: Bugfix ICE after allow vector type in DSE
>
> On Mon, Feb 26, 2024 at 11:26 AM  wrote:
> >
> > From: Pan Li 
> >
> > We previously allowed vector types for get_stored_val when the read is
> > less than or equal to the store.  Unfortunately, we missed adjusting the
> > validate_subreg part accordingly.  For vector types, we don't need to
> > require that the mode size be greater than the vector register size.
> >
> > Thus, for example, gen_lowpart from E_V2SFmode to E_V4QImode returns
> > NULL_RTX (and of course ICEs after that) because the mode size is less
> > than the vector register size.  That also explains why gen_lowpart
> > from E_V8SFmode to E_V16QImode is valid here.
> >
> > This patch removes the restriction for vector modes, to get rid of the
> > ICE when gen_lowpart fails because validate_subreg fails.
> Be careful, it may regress some other backends.
The related thread.
https://gcc.gnu.org/pipermail/gcc-patches/2021-August/578466.html
> >
> > The below tests passed for this patch:
> >
> > * The X86 bootstrap test.
> > * The full riscv regression tests.
> >
> > gcc/ChangeLog:
> >
> > * emit-rtl.cc (validate_subreg): Bypass register size check
> > if the mode is vector.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/tree-ssa/ssa-fre-44.c: Add ftree-vectorize to trigger
> > the ICE.
> > * gcc.target/riscv/rvv/base/bug-6.c: New test.
> >
> > Signed-off-by: Pan Li 
> > ---
> >  gcc/emit-rtl.cc   |  3 ++-
> >  gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c|  2 +-
> >  .../gcc.target/riscv/rvv/base/bug-6.c | 22 +++
> >  3 files changed, 25 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
> >
> > diff --git a/gcc/emit-rtl.cc b/gcc/emit-rtl.cc
> > index 1856fa4884f..45c6301b487 100644
> > --- a/gcc/emit-rtl.cc
> > +++ b/gcc/emit-rtl.cc
> > @@ -934,7 +934,8 @@ validate_subreg (machine_mode omode, machine_mode imode,
> >  ;
> >/* ??? Similarly, e.g. with (subreg:DF (reg:TI)).  Though store_bit_field
> >   is the culprit here, and not the backends.  */
> > -  else if (known_ge (osize, regsize) && known_ge (isize, osize))
> > +  else if (known_ge (isize, osize) && (known_ge (osize, regsize)
> > +|| (VECTOR_MODE_P (imode) || VECTOR_MODE_P (omode
> >  ;
> >/* Allow component subregs of complex and vector.  Though given the below
> >   extraction rules, it's not always clear what that means.  */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> > index f79b4c142ae..624a00a4f32 100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O -fdump-tree-fre1" } */
> > +/* { dg-options "-O -fdump-tree-fre1 -O3 -ftree-vectorize" } */
> >
> >  struct A { float x, y; };
> >  struct B { struct A u; };
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
> > new file mode 100644
> > index 000..5bb00b8f587
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
> > @@ -0,0 +1,22 @@
> > +/* Test that we do not have ice when compile */
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
> > +
> > +struct A { float x, y; };
> > +struct B { struct A u; };
> > +
> > +extern void bar (struct A *);
> > +
> > +float
> > +f3 (struct B *x, int y)
> > +{
> > +  struct A p = {1.0f, 2.0f};
> > +  struct A *q = [y].u;
> > +
> > +  __builtin_memcpy (>x, , sizeof (float));
> > +  __builtin_memcpy (>y, , sizeof (float));
> > +
> > +  bar ();
> > +
> > +  return x[y].u.x + x[y].u.y;
> > +}
> > --
> > 2.34.1
> >
>
>
> --
> BR,
> Hongtao



--
BR,
Hongtao


Re: [PATCH v1] RTL: Bugfix ICE after allow vector type in DSE

2024-02-25 Thread Hongtao Liu
On Mon, Feb 26, 2024 at 11:26 AM  wrote:
>
> From: Pan Li 
>
> We previously allowed vector types for get_stored_val when the read is
> less than or equal to the store.  Unfortunately, we missed adjusting the
> validate_subreg part accordingly.  For vector types, we don't need to
> require that the mode size be greater than the vector register size.
>
> Thus, for example, gen_lowpart from E_V2SFmode to E_V4QImode returns
> NULL_RTX (and of course ICEs after that) because the mode size is less
> than the vector register size.  That also explains why gen_lowpart
> from E_V8SFmode to E_V16QImode is valid here.
>
> This patch removes the restriction for vector modes, to get rid of the
> ICE when gen_lowpart fails because validate_subreg fails.
Be careful, it may regress some other backends.
>
> The below tests passed for this patch:
>
> * The X86 bootstrap test.
> * The full riscv regression tests.
>
> gcc/ChangeLog:
>
> * emit-rtl.cc (validate_subreg): Bypass register size check
> if the mode is vector.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/ssa-fre-44.c: Add ftree-vectorize to trigger
> the ICE.
> * gcc.target/riscv/rvv/base/bug-6.c: New test.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/emit-rtl.cc   |  3 ++-
>  gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c|  2 +-
>  .../gcc.target/riscv/rvv/base/bug-6.c | 22 +++
>  3 files changed, 25 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
>
> diff --git a/gcc/emit-rtl.cc b/gcc/emit-rtl.cc
> index 1856fa4884f..45c6301b487 100644
> --- a/gcc/emit-rtl.cc
> +++ b/gcc/emit-rtl.cc
> @@ -934,7 +934,8 @@ validate_subreg (machine_mode omode, machine_mode imode,
>  ;
>/* ??? Similarly, e.g. with (subreg:DF (reg:TI)).  Though store_bit_field
>   is the culprit here, and not the backends.  */
> -  else if (known_ge (osize, regsize) && known_ge (isize, osize))
> +  else if (known_ge (isize, osize) && (known_ge (osize, regsize)
> +|| (VECTOR_MODE_P (imode) || VECTOR_MODE_P (omode
>  ;
>/* Allow component subregs of complex and vector.  Though given the below
>   extraction rules, it's not always clear what that means.  */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> index f79b4c142ae..624a00a4f32 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O -fdump-tree-fre1" } */
> +/* { dg-options "-O -fdump-tree-fre1 -O3 -ftree-vectorize" } */
>
>  struct A { float x, y; };
>  struct B { struct A u; };
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
> new file mode 100644
> index 000..5bb00b8f587
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
> @@ -0,0 +1,22 @@
> +/* Test that we do not have ice when compile */
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
> +
> +struct A { float x, y; };
> +struct B { struct A u; };
> +
> +extern void bar (struct A *);
> +
> +float
> +f3 (struct B *x, int y)
> +{
> +  struct A p = {1.0f, 2.0f};
> +  struct A *q = [y].u;
> +
> +  __builtin_memcpy (>x, , sizeof (float));
> +  __builtin_memcpy (>y, , sizeof (float));
> +
> +  bar ();
> +
> +  return x[y].u.x + x[y].u.y;
> +}
> --
> 2.34.1
>


-- 
BR,
Hongtao


Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics

2024-02-25 Thread Hongtao Liu
On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu  wrote:
>
> On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu  wrote:
> >
> > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu  wrote:
> > >
> > > ldtilecfg and sttilecfg take a 512-byte memory block.  With
> > > _tile_loadconfig implemented as
> > >
> > > extern __inline void
> > > __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > > _tile_loadconfig (const void *__config)
> > > {
> > >   __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> > > }
> > >
> > > GCC sees:
> > >
> > > (parallel [
> > >   (asm_operands/v ("ldtilecfg   %X0") ("") 0
> > >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
> > >  (const_int -64 [0xffc0])) [1 
> > > MEM[(const void * *)_data]+0 S8 A128])]
> > >[(asm_input:DI ("m"))]
> > >(clobber (reg:CC 17 flags))])
> > >
> > > and the memory operand size is 1 byte.  As the result, the rest of 511
> > > bytes is ignored by GCC.  Implement ldtilecfg and sttilecfg intrinsics
> > > with a pointer to BLKmode to honor the 512-byte memory block.
> > >
> > > gcc/ChangeLog:
> > >
> > > PR target/114098
> > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> > > __builtin_ia32_ldtilecfg.
> > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> > > * config/i386/i386-builtin.def (BDESC): Add
> > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> > > * config/i386/i386.md (ldtilecfg): New pattern.
> > > (sttilecfg): Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR target/114098
> > > * gcc.target/i386/amxtile-4.c: New test.
> > > ---
> > >  gcc/config/i386/amxtileintrin.h   |  4 +-
> > >  gcc/config/i386/i386-builtin.def  |  4 ++
> > >  gcc/config/i386/i386-expand.cc| 19 
> > >  gcc/config/i386/i386.md   | 24 ++
> > >  gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++
> > >  5 files changed, 104 insertions(+), 2 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
> > >
> > > diff --git a/gcc/config/i386/amxtileintrin.h 
> > > b/gcc/config/i386/amxtileintrin.h
> > > index d1a26e0fea5..5081b326498 100644
> > > --- a/gcc/config/i386/amxtileintrin.h
> > > +++ b/gcc/config/i386/amxtileintrin.h
> > > @@ -39,14 +39,14 @@ extern __inline void
> > >  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > >  _tile_loadconfig (const void *__config)
> > >  {
> > > -  __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void 
> > > **)__config)));
> > > +  __builtin_ia32_ldtilecfg (__config);
> > >  }
> > >
> > >  extern __inline void
> > >  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> > >  _tile_storeconfig (void *__config)
> > >  {
> > > -  __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> > > +  __builtin_ia32_sttilecfg (__config);
> > >  }
> > >
> > >  extern __inline void
> > > diff --git a/gcc/config/i386/i386-builtin.def 
> > > b/gcc/config/i386/i386-builtin.def
> > > index 729355230b8..88dd7f8857f 100644
> > > --- a/gcc/config/i386/i386-builtin.def
> > > +++ b/gcc/config/i386/i386-builtin.def
> > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | 
> > > OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b
> > >  BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, 
> > > CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, 
> > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > >  BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, 
> > > CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, 
> > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64)
> > >
> > > +/* LDFILECFG and STFILECFG.  */
> > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, 
> > > CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg&

Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics

2024-02-25 Thread Hongtao Liu
On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu  wrote:
>
> ldtilecfg and sttilecfg take a 512-byte memory block.  With
> _tile_loadconfig implemented as
>
> extern __inline void
> __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _tile_loadconfig (const void *__config)
> {
>   __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> }
>
> GCC sees:
>
> (parallel [
>   (asm_operands/v ("ldtilecfg   %X0") ("") 0
>[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars)
>  (const_int -64 [0xffc0])) [1 MEM[(const 
> void * *)_data]+0 S8 A128])]
>[(asm_input:DI ("m"))]
>(clobber (reg:CC 17 flags))])
>
> and the memory operand size is 1 byte.  As the result, the rest of 511
> bytes is ignored by GCC.  Implement ldtilecfg and sttilecfg intrinsics
> with a pointer to BLKmode to honor the 512-byte memory block.
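As an editorial aside, not part of the submitted patch: the same one-byte
operand problem can also be avoided at the inline-asm level by typing the
memory operand as the whole 512-byte block, the idiom GCC documents for asm
operands that span more than one object.  A hedged sketch:

extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_loadconfig_sketch (const void *__config)
{
  /* The array-pointer cast makes GCC treat all 512 bytes as read by the
     asm instead of a single pointer-sized load.  */
  __asm__ volatile ("ldtilecfg\t%X0"
		    :: "m" (*(const char (*)[512]) __config));
}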
>
> gcc/ChangeLog:
>
> PR target/114098
> * config/i386/amxtileintrin.h (_tile_loadconfig): Use
> __builtin_ia32_ldtilecfg.
> (_tile_storeconfig): Use __builtin_ia32_sttilecfg.
> * config/i386/i386-builtin.def (BDESC): Add
> __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg.
> * config/i386/i386-expand.cc (ix86_expand_builtin): Handle
> IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG.
> * config/i386/i386.md (ldtilecfg): New pattern.
> (sttilecfg): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR target/114098
> * gcc.target/i386/amxtile-4.c: New test.
> ---
>  gcc/config/i386/amxtileintrin.h   |  4 +-
>  gcc/config/i386/i386-builtin.def  |  4 ++
>  gcc/config/i386/i386-expand.cc| 19 
>  gcc/config/i386/i386.md   | 24 ++
>  gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++
>  5 files changed, 104 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c
>
> diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h
> index d1a26e0fea5..5081b326498 100644
> --- a/gcc/config/i386/amxtileintrin.h
> +++ b/gcc/config/i386/amxtileintrin.h
> @@ -39,14 +39,14 @@ extern __inline void
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _tile_loadconfig (const void *__config)
>  {
> -  __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
> +  __builtin_ia32_ldtilecfg (__config);
>  }
>
>  extern __inline void
>  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _tile_storeconfig (void *__config)
>  {
> -  __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
> +  __builtin_ia32_sttilecfg (__config);
>  }
>
>  extern __inline void
> diff --git a/gcc/config/i386/i386-builtin.def 
> b/gcc/config/i386/i386-builtin.def
> index 729355230b8..88dd7f8857f 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 
> 0, CODE_FOR_nothing, "__b
>  BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, 
> "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) 
> VOID_FTYPE_PVOID_INT64)
>  BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, 
> "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) 
> VOID_FTYPE_PVOID_INT64)
>
> +/* LDFILECFG and STFILECFG.  */
> +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, 
> "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) 
> VOID_FTYPE_PCVOID)
> +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, 
> "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) 
> VOID_FTYPE_PVOID)
This should be CODE_FOR_sttilecfg, not CODE_FOR_ldtilecfg.
> +
>  /* SSE */
>  BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, 
> "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) 
> VOID_FTYPE_PFLOAT_V4SF)
>  BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, 
> "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) 
> VOID_FTYPE_PFLOAT_V4SF)
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index a4d3369f01b..17993eb837f 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx 
> subtarget,
> emit_insn (pat);
>return 0;
>
> +case IX86_BUILTIN_LDTILECFG:
> +case IX86_BUILTIN_STTILECFG:
> +  arg0 = CALL_EXPR_ARG (exp, 0);
> +  op0 = expand_normal (arg0);
> +
> +  if (!address_operand (op0, VOIDmode))
> +   {
> + op0 = convert_memory_address (Pmode, op0);
> + op0 = copy_addr_to_reg (op0);
> +   }
> +  op0 = gen_rtx_MEM (BLKmode, op0);
Maybe we can just use XImode and adjust the patterns to use XI.
> +  if (fcode == IX86_BUILTIN_LDTILECFG)
> +   icode = CODE_FOR_ldtilecfg;
> +  else
> +

Re: PING: [PATCH] x86-64: Check R_X86_64_CODE_6_GOTTPOFF support

2024-02-22 Thread Hongtao Liu
On Thu, Feb 22, 2024 at 10:33 PM H.J. Lu  wrote:
>
> On Sun, Feb 18, 2024 at 8:02 AM H.J. Lu  wrote:
> >
> > If assembler and linker supports
> >
> > add %reg1, name@gottpoff(%rip), %reg2
> >
> > with R_X86_64_CODE_6_GOTTPOFF, we can generate it instead of
> >
> > mov name@gottpoff(%rip), %reg2
> > add %reg1, %reg2
The x86 part LGTM, but I'm not familiar with the changes in the config-related files.
> >
> > gcc/
> >
> > * configure.ac (HAVE_AS_R_X86_64_CODE_6_GOTTPOFF): Defined as 1
> > if R_X86_64_CODE_6_GOTTPOFF is supported.
> > * config.in: Regenerated.
> > * configure: Likewise.
> > * config/i386/predicates.md (apx_ndd_add_memory_operand): Allow
> > UNSPEC_GOTNTPOFF if R_X86_64_CODE_6_GOTTPOFF is supported.
> >
> > gcc/testsuite/
> >
> > * gcc.target/i386/apx-ndd-tls-1b.c: New test.
> > * lib/target-supports.exp
> > (check_effective_target_code_6_gottpoff_reloc): New.
> > ---
> >  gcc/config.in |  7 +++
> >  gcc/config/i386/predicates.md |  6 +-
> >  gcc/configure | 62 +++
> >  gcc/configure.ac  | 37 +++
> >  .../gcc.target/i386/apx-ndd-tls-1b.c  |  9 +++
> >  gcc/testsuite/lib/target-supports.exp | 48 ++
> >  6 files changed, 168 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-tls-1b.c
> >
> > diff --git a/gcc/config.in b/gcc/config.in
> > index ce1d073833f..f3de4ba6776 100644
> > --- a/gcc/config.in
> > +++ b/gcc/config.in
> > @@ -737,6 +737,13 @@
> >  #endif
> >
> >
> > +/* Define 0/1 if your assembler and linker support 
> > R_X86_64_CODE_6_GOTTPOFF.
> > +   */
> > +#ifndef USED_FOR_TARGET
> > +#undef HAVE_AS_R_X86_64_CODE_6_GOTTPOFF
> > +#endif
> > +
> > +
> >  /* Define if your assembler supports relocs needed by -fpic. */
> >  #ifndef USED_FOR_TARGET
> >  #undef HAVE_AS_SMALL_PIC_RELOCS
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index 4c1aedd7e70..391f108c360 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -2299,10 +2299,14 @@ (define_predicate "apx_ndd_memory_operand"
> >
> >  ;; Return true if OP is a memory operand which can be used in APX NDD
> >  ;; ADD with register source operand.  UNSPEC_GOTNTPOFF memory operand
> > -;; isn't allowed with APX NDD ADD.
> > +;; is allowed with APX NDD ADD only if R_X86_64_CODE_6_GOTTPOFF works.
> >  (define_predicate "apx_ndd_add_memory_operand"
> >(match_operand 0 "memory_operand")
> >  {
> > +  /* OK if "add %reg1, name@gottpoff(%rip), %reg2" is supported.  */
> > +  if (HAVE_AS_R_X86_64_CODE_6_GOTTPOFF)
> > +return true;
> > +
> >op = XEXP (op, 0);
> >
> >/* Disallow APX NDD ADD with UNSPEC_GOTNTPOFF.  */
> > diff --git a/gcc/configure b/gcc/configure
> > index 41b978b0380..c59c971862c 100755
> > --- a/gcc/configure
> > +++ b/gcc/configure
> > @@ -29834,6 +29834,68 @@ cat >>confdefs.h <<_ACEOF
> >  _ACEOF
> >
> >
> > +if echo "$ld_ver" | grep GNU > /dev/null; then
> > +  if $gcc_cv_ld -V 2>/dev/null | grep elf_x86_64_sol2 > /dev/null; then
> > +ld_ix86_gld_64_opt="-melf_x86_64_sol2"
> > +  else
> > +ld_ix86_gld_64_opt="-melf_x86_64"
> > +  fi
> > +fi
> > +conftest_s='
> > +   .text
> > +   .globl  _start
> > +   .type _start, @function
> > +_start:
> > +   addq%r23,foo@GOTTPOFF(%rip), %r15
> > +   .section .tdata,"awT",@progbits
> > +   .type foo, @object
> > +foo:
> > +   .quad 0'
> > +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler for 
> > R_X86_64_CODE_6_GOTTPOFF reloc" >&5
> > +$as_echo_n "checking assembler for R_X86_64_CODE_6_GOTTPOFF reloc... " 
> > >&6; }
> > +if ${gcc_cv_as_x86_64_code_6_gottpoff+:} false; then :
> > +  $as_echo_n "(cached) " >&6
> > +else
> > +  gcc_cv_as_x86_64_code_6_gottpoff=no
> > +  if test x$gcc_cv_as != x; then
> > +$as_echo "$conftest_s" > conftest.s
> > +if { ac_try='$gcc_cv_as $gcc_cv_as_flags  -o conftest.o conftest.s >&5'
> > +  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
> > +  (eval $ac_try) 2>&5
> > +  ac_status=$?
> > +  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
> > +  test $ac_status = 0; }; }
> > +then
> > +   if test x$gcc_cv_ld != x && test x$gcc_cv_objdump != x \
> > +   && test x$gcc_cv_readelf != x \
> > +   && $gcc_cv_readelf --relocs --wide conftest.o 2>&1 \
> > +  | grep R_X86_64_CODE_6_GOTTPOFF > /dev/null 2>&1 \
> > +   && $gcc_cv_ld $ld_ix86_gld_64_opt -o conftest conftest.o > 
> > /dev/null 2>&1; then
> > +  if $gcc_cv_objdump -dw conftest 2>&1 \
> > + | grep "add \+\$0xf\+8,%r23,%r15" > /dev/null 2>&1; then
> > +gcc_cv_as_x86_64_code_6_gottpoff=yes
> > +  else
> > +gcc_cv_as_x86_64_code_6_gottpoff=no

Re: [PATCH] x86-64: Generate push2/pop2 only if the incoming stack is 16-byte aligned

2024-02-17 Thread Hongtao Liu
On Wed, Feb 14, 2024 at 5:33 AM H.J. Lu  wrote:
>
> Since push2/pop2 requires 16-byte stack alignment, don't generate them
> if the incoming stack isn't 16-byte aligned.
Ok.
>
> gcc/
>
> PR target/113912
> * config/i386/i386.cc (ix86_can_use_push2pop2): New.
> (ix86_pro_and_epilogue_can_use_push2pop2): Use it.
> (ix86_emit_save_regs): Don't generate push2 if
> ix86_can_use_push2pop2 return false.
> (ix86_expand_epilogue): Don't generate pop2 if
> ix86_can_use_push2pop2 return false.
>
> gcc/testsuite/
>
> PR target/113912
> * gcc.target/i386/apx-push2pop2-2.c: New test.
> ---
>  gcc/config/i386/i386.cc   | 24 ++-
>  .../gcc.target/i386/apx-push2pop2-2.c | 24 +++
>  2 files changed, 42 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-push2pop2-2.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index a4e12602f70..46f238651a6 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -6802,16 +6802,24 @@ get_probe_interval (void)
>
>  #define SPLIT_STACK_AVAILABLE 256
>
> -/* Helper function to determine whether push2/pop2 can be used in prologue or
> -   epilogue for register save/restore.  */
> +/* Return true if push2/pop2 can be generated.  */
> +
>  static bool
> -ix86_pro_and_epilogue_can_use_push2pop2 (int nregs)
> +ix86_can_use_push2pop2 (void)
>  {
>/* Use push2/pop2 only if the incoming stack is 16-byte aligned.  */
>unsigned int incoming_stack_boundary
>  = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
> ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
> -  if (incoming_stack_boundary % 128 != 0)
> +  return incoming_stack_boundary % 128 == 0;
> +}
> +
> +/* Helper function to determine whether push2/pop2 can be used in prologue or
> +   epilogue for register save/restore.  */
> +static bool
> +ix86_pro_and_epilogue_can_use_push2pop2 (int nregs)
> +{
> +  if (!ix86_can_use_push2pop2 ())
>  return false;
>int aligned = cfun->machine->fs.sp_offset % 16 == 0;
>return TARGET_APX_PUSH2POP2
> @@ -7401,7 +7409,9 @@ ix86_emit_save_regs (void)
>int regno;
>rtx_insn *insn;
>
> -  if (!TARGET_APX_PUSH2POP2 || cfun->machine->func_type != TYPE_NORMAL)
> +  if (!TARGET_APX_PUSH2POP2
> +  || !ix86_can_use_push2pop2 ()
> +  || cfun->machine->func_type != TYPE_NORMAL)
>  {
>for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
> if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
> @@ -10039,7 +10049,9 @@ ix86_expand_epilogue (int style)
>  m->fs.cfa_reg == stack_pointer_rtx);
> }
>
> -  if (TARGET_APX_PUSH2POP2 && m->func_type == TYPE_NORMAL)
> +  if (TARGET_APX_PUSH2POP2
> + && ix86_can_use_push2pop2 ()
> + && m->func_type == TYPE_NORMAL)
> ix86_emit_restore_regs_using_pop2 ();
>else
> ix86_emit_restore_regs_using_pop (TARGET_APX_PPX);
> diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2-2.c 
> b/gcc/testsuite/gcc.target/i386/apx-push2pop2-2.c
> new file mode 100644
> index 000..975a6212b30
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2-2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mpreferred-stack-boundary=3 -mapx-features=push2pop2 
> -fomit-frame-pointer" } */
> +
> +extern int bar (int);
> +
> +void foo ()
> +{
> +  int a,b,c,d,e,f,i;
> +  a = bar (5);
> +  b = bar (a);
> +  c = bar (b);
> +  d = bar (c);
> +  e = bar (d);
> +  f = bar (e);
> +  for (i = 1; i < 10; i++)
> +  {
> +a += bar (a + i) + bar (b + i) +
> + bar (c + i) + bar (d + i) +
> + bar (e + i) + bar (f + i);
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "push2(|p)\[\\t \]*%r" } } */
> +/* { dg-final { scan-assembler-not "pop2(|p)\[\\t \]*%r" } } */
> --
> 2.43.0
>


-- 
BR,
Hongtao


Re: [PATCH] x86: Update constraints for APX NDD instructions

2024-02-07 Thread Hongtao Liu
On Tue, Feb 6, 2024 at 11:49 AM H.J. Lu  wrote:
>
> 1. The only supported TLS code sequence with ADD is
>
> addq foo@gottpoff(%rip),%reg
>
> Change je constraint to a memory operand in APX NDD ADD pattern with
> register source operand.
>
> 2. The instruction length of APX NDD instructions with immediate operand:
>
> op imm, mem, reg
>
> may exceed the size limit of 15 bytes when a non-default address space,
> segment register or address size prefix is used.
>
> Add jM constraint which is a memory operand valid for APX NDD instructions
> with immediate operand and add jO constraint which is an offsetable memory
> operand valid for APX NDD instructions with immediate operand.  Update
> APX NDD patterns with jM and jO constraints.
Ok.
>
> gcc/
>
> PR target/113711
> PR target/113733
> * config/i386/constraints.md: List all constraints with j prefix.
> (j>): Change auto-dec to auto-inc in documentation.
> (je): Changed to a memory constraint with APX NDD TLS operand
> check.
> (jM): New memory constraint for APX NDD instructions.
> (jO): Likewise.
> * config/i386/i386-protos.h (x86_poff_operand_p): Removed.
> * config/i386/i386.cc (x86_poff_operand_p): Likewise.
> * config/i386/i386.md (*add3_doubleword): Use rjO.
> (*add_1[SWI48]): Use je and jM.
> (addsi_1_zext): Use jM.
> (*addv4_doubleword_1[DWI]): Likewise.
> (*sub_1[SWI]): Use jM.
> (@add3_cc_overflow_1[SWI]): Likewise.
> (*add3_doubleword_cc_overflow_1): Use rjO.
> (*and3_doubleword): Likewise.
> (*anddi_1): Use jM.
> (*andsi_1_zext): Likewise.
> (*and_1[SWI24]): Likewise.
> (*3_doubleword[any_or]: Use rjO
> (*code_1[any_or SWI248]): Use jM.
> (*si_1_zext[zero_extend + any_or]): Likewise.
> * config/i386/predicates.md (apx_ndd_memory_operand): New.
> (apx_ndd_add_memory_operand): Likewise.
>
> gcc/testsuite/
>
> PR target/113711
> PR target/113733
> * gcc.target/i386/apx-ndd-2.c: New test.
> * gcc.target/i386/apx-ndd-base-index-1.c: Likewise.
> * gcc.target/i386/apx-ndd-no-seg-global-1.c: Likewise.
> * gcc.target/i386/apx-ndd-seg-1.c: Likewise.
> * gcc.target/i386/apx-ndd-seg-2.c: Likewise.
> * gcc.target/i386/apx-ndd-seg-3.c: Likewise.
> * gcc.target/i386/apx-ndd-seg-4.c: Likewise.
> * gcc.target/i386/apx-ndd-seg-5.c: Likewise.
> * gcc.target/i386/apx-ndd-tls-1a.c: Likewise.
> * gcc.target/i386/apx-ndd-tls-2.c: Likewise.
> * gcc.target/i386/apx-ndd-tls-3.c: Likewise.
> * gcc.target/i386/apx-ndd-tls-4.c: Likewise.
> * gcc.target/i386/apx-ndd-x32-1.c: Likewise.
> ---
>  gcc/config/i386/constraints.md|  36 -
>  gcc/config/i386/i386-protos.h |   1 -
>  gcc/config/i386/i386.cc   |  25 
>  gcc/config/i386/i386.md   | 129 +-
>  gcc/config/i386/predicates.md |  65 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-2.c |  17 +++
>  .../gcc.target/i386/apx-ndd-base-index-1.c|  50 +++
>  .../gcc.target/i386/apx-ndd-no-seg-global-1.c |  74 ++
>  gcc/testsuite/gcc.target/i386/apx-ndd-seg-1.c |  98 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-seg-2.c |  98 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-seg-3.c |  14 ++
>  gcc/testsuite/gcc.target/i386/apx-ndd-seg-4.c |   9 ++
>  gcc/testsuite/gcc.target/i386/apx-ndd-seg-5.c |  13 ++
>  .../gcc.target/i386/apx-ndd-tls-1a.c  |  41 ++
>  gcc/testsuite/gcc.target/i386/apx-ndd-tls-2.c |  38 ++
>  gcc/testsuite/gcc.target/i386/apx-ndd-tls-3.c |  16 +++
>  gcc/testsuite/gcc.target/i386/apx-ndd-tls-4.c |  31 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-x32-1.c |  49 +++
>  18 files changed, 712 insertions(+), 92 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-base-index-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-no-seg-global-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-seg-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-seg-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-seg-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-seg-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-seg-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-tls-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-tls-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-tls-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-tls-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-x32-1.c
>
> diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
> index 280e4c8e36c..64702d9c0a8 100644
> 

Re: [x86 PATCH] PR target/106060: Improved SSE vector constant materialization.

2024-01-25 Thread Hongtao Liu
On Fri, Jan 26, 2024 at 3:03 AM Roger Sayle  wrote:
>
>
> Hi Hongtao,
> Many thanks for the review.  Here's a revised version of my patch
> that addresses (most of) the issues you've raised.  Firstly the
> handling of zero and all_ones in this function is mostly for
> completeness/documentation, these standard_sse_constant_p
> values are (currently/normally) handled elsewhere.  But I have
> added an "n_var == 0" optimization to ix86_expand_vector_init.
>
> As you've suggested I've added explicit TARGET_SSE2 tests where
> required, and for consistency I've also added support for AVX512's
> V16SImode.
>
> As you've predicted, the eventual goal is to move this after combine
> (or reload) using define_insn_and_split, but that requires a significant
> restructuring that should be done in steps.  This also interacts with
> a similar planned reorganization of TImode constant handling.  If
> all 128-bit (vector) constants are acceptable before combine, then
> STV has the freedom to choose V1TImode (and this broadcast
> functionality) to implement TImode operations on immediate
> constants.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline (in stage 1)?
Ok, thanks for handling this.
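To make concrete the two-instruction synthesis described in the quoted
submission below, here is my own sketch of the PR 106060 example using AVX2
intrinsics (the patch of course emits the instructions directly rather than
going through intrinsics):

#include <immintrin.h>

/* _mm256_set1_epi8 (1) without a constant-pool load: an all-ones vector
   (vpcmpeqd) followed by a byte-wise absolute value (vpabsb), since each
   all-ones byte is -1 and |-1| == 1.  */
__m256i
set1_epi8_one_sketch (void)
{
  __m256i ones = _mm256_cmpeq_epi32 (_mm256_setzero_si256 (),
				     _mm256_setzero_si256 ());
  return _mm256_abs_epi8 (ones);
}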
>
>
> 2024-01-25  Roger Sayle  
> Hongtao Liu  
>
> gcc/ChangeLog
> PR target/106060
> * config/i386/i386-expand.cc (enum ix86_vec_bcast_alg): New.
> (struct ix86_vec_bcast_map_simode_t): New type for table below.
> (ix86_vec_bcast_map_simode): Table of SImode constants that may
> be efficiently synthesized by a ix86_vec_bcast_alg method.
> (ix86_vec_bcast_map_simode_cmp): New comparator for bsearch.
> (ix86_vector_duplicate_simode_const): Efficiently synthesize
> V4SImode and V8SImode constants that duplicate special constants.
> (ix86_vector_duplicate_value): Attempt to synthesize "special"
> vector constants using ix86_vector_duplicate_simode_const.
> * config/i386/i386.cc (ix86_rtx_costs) : ABS of a
> vector integer mode costs with a single SSE instruction.
>
> gcc/testsuite/ChangeLog
> PR target/106060
> * gcc.target/i386/auto-init-8.c: Update test case.
> * gcc.target/i386/avx512fp16-3.c: Likewise.
> * gcc.target/i386/pr100865-9a.c: Likewise.
> * gcc.target/i386/pr101796-1.c: Likewise.
> * gcc.target/i386/pr106060-1.c: New test case.
> * gcc.target/i386/pr106060-2.c: Likewise.
> * gcc.target/i386/pr106060-3.c: Likewise.
> * gcc.target/i386/pr70314.c: Update test case.
> * gcc.target/i386/vect-shiftv4qi.c: Likewise.
> * gcc.target/i386/vect-shiftv8qi.c: Likewise.
>
>
> Roger
> --
>
> > -Original Message-
> > From: Hongtao Liu 
> > Sent: 17 January 2024 03:13
> > To: Roger Sayle 
> > Cc: gcc-patches@gcc.gnu.org; Uros Bizjak 
> > Subject: Re: [x86 PATCH] PR target/106060: Improved SSE vector constant
> > materialization.
> >
> > On Wed, Jan 17, 2024 at 5:59 AM Roger Sayle 
> > wrote:
> > >
> > >
> > > I thought I'd just missed the bug fixing season of stage3, but there
> > > appears to a little latitude in early stage4 (for vector patches), so
> > > I'll post this now.
> > >
> > > This patch resolves PR target/106060 by providing efficient methods
> > > for materializing/synthesizing special "vector" constants on x86.
> > > Currently there are three methods of materializing a vector constant;
> > > the most general is to load a vector from the constant pool, secondly
> > "duplicated"
> > > constants can be synthesized by moving an integer between units and
> > > broadcasting (or shuffling it), and finally the special cases of the
> > > all-zeros vector and all-ones vectors can be loaded via a single SSE
> > > instruction.   This patch handles additional cases that can be synthesized
> > > in two instructions, loading an all-ones vector followed by another
> > > SSE instruction.  Following my recent patch for PR target/112992,
> > > there's conveniently a single place in i386-expand.cc where these
> > > special cases can be handled.
> > >
> > > Two examples are given in the original bugzilla PR for 106060.
> > >
> > > __m256i
> > > should_be_cmpeq_abs ()
> > > {
> > >   return _mm256_set1_epi8 (1);
> > > }
> > >
> > > is now generated (with -O3 -march=x

Re: [PATCH v3 0/2] x86: Don't save callee-saved registers if not needed

2024-01-24 Thread Hongtao Liu
On Tue, Jan 23, 2024 at 11:00 PM H.J. Lu  wrote:
>
> Changes in v3:
>
> 1. Rebase against commit 02e68389494
> 2. Don't add call_no_callee_saved_registers to machine_function since
> all callee-saved registers are properly clobbered by callee with
> no_callee_saved_registers attribute.
>
The patch LGTM; it should be low risk since there's already a
no_caller_saved_registers attribute, and the patch just extends the same
approach to no_callee_saved_registers.
So if there are no objections (or any concerns) in the next couple of days,
I'm OK with the patch going into GCC 14 and being backported.

> Changes in v2:
>
> 1. Rebase against commit f9df00340e3
> 2. Don't add redundant clobbered_registers check in ix86_expand_call.
>
> In some cases, there is no need to save callee-saved registers:
>
> 1. If a noreturn function doesn't throw nor support exceptions, it can
> skip saving callee-saved registers.
>
> 2. When an interrupt handler is implemented by an assembly stub which does:
>
>   1. Save all registers.
>   2. Call a C function.
>   3. Restore all registers.
>   4. Return from interrupt.
>
> it is completely unnecessary to save and restore any registers in the C
> function called by the assembly stub, even if they would normally be
> callee-saved.
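A minimal usage sketch for the interrupt-stub case (editorial addition; the
attribute name matches the patch, but the handler below is hypothetical):

/* C body called from an assembly interrupt stub that already saves and
   restores every register itself.  With the attribute, GCC emits no
   push/pop of callee-saved registers in this function.  */
extern volatile int pending_work;

__attribute__ ((no_callee_saved_registers))
void
interrupt_body (void)
{
  pending_work = 0;
}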
>
> This patch set adds no_callee_saved_registers function attribute, which
> is complementary to no_caller_saved_registers function attribute, to
> classify x86 backend call-saved register handling type with
>
>   1. Default call-saved registers.
>   2. No caller-saved registers with no_caller_saved_registers attribute.
>   3. No callee-saved registers with no_callee_saved_registers attribute.
>
> Functions of no callee-saved registers won't save callee-saved registers.
> If a noreturn function doesn't throw nor support exceptions, it is
> classified as the no callee-saved registers type.
>
> With these changes, __libc_start_main in glibc 2.39, which is a noreturn
> function, is changed from
>
> __libc_start_main:
> endbr64
> push   %r15
> push   %r14
> mov%rcx,%r14
> push   %r13
> push   %r12
> push   %rbp
> mov%esi,%ebp
> push   %rbx
> mov%rdx,%rbx
> sub$0x28,%rsp
> mov%rdi,(%rsp)
> mov%fs:0x28,%rax
> mov%rax,0x18(%rsp)
> xor%eax,%eax
> test   %r9,%r9
>
> to
>
> __libc_start_main:
> endbr64
> sub$0x28,%rsp
> mov%esi,%ebp
> mov%rdx,%rbx
> mov%rcx,%r14
> mov%rdi,(%rsp)
> mov%fs:0x28,%rax
> mov%rax,0x18(%rsp)
> xor%eax,%eax
> test   %r9,%r9
>
> In Linux kernel 6.7.0 on x86-64, do_exit is changed from
>
> do_exit:
> endbr64
> call   
> push   %r15
> push   %r14
> push   %r13
> push   %r12
> mov%rdi,%r12
> push   %rbp
> push   %rbx
> mov%gs:0x0,%rbx
> sub$0x28,%rsp
> mov%gs:0x28,%rax
> mov%rax,0x20(%rsp)
> xor%eax,%eax
> call   *0x0(%rip)# 
> test   $0x2,%ah
> je 
>
> to
>
> do_exit:
> endbr64
> call   
> sub$0x28,%rsp
> mov%rdi,%r12
> mov%gs:0x28,%rax
> mov%rax,0x20(%rsp)
> xor%eax,%eax
> mov%gs:0x0,%rbx
> call   *0x0(%rip)# 
> test   $0x2,%ah
> je 
>
> I compared GCC master branch bootstrap and test times on a slow machine
> with 6.6 Linux kernels compiled with the original GCC 13 and the GCC 13
> with the backported patch.  The performance data isn't precise since the
> measurements were done on different days with different GCC sources under
> different 6.6 kernel versions.
>
> GCC master branch build time in seconds:
>
> beforeafter  improvement
> 30043.75user  30013.16user   0%
> 1274.85system 1243.72system  2.4%
>
> GCC master branch test time in seconds (new tests added):
>
> beforeafter  improvement
> 216035.90user 216547.51user  0
> 27365.51system26658.54system 2.6%
>
> Backported to GCC 13 to rebuild system glibc and kernel on Fedora 39.
> Systems perform normally.
>
>
> H.J. Lu (2):
>   x86: Add no_callee_saved_registers function attribute
>   x86: Don't save callee-saved registers in noreturn functions
>
>  gcc/config/i386/i386-expand.cc| 52 +---
>  gcc/config/i386/i386-options.cc   | 61 +++
>  gcc/config/i386/i386.cc   | 57 +
>  gcc/config/i386/i386.h| 16 -
>  gcc/doc/extend.texi   |  8 +++
>  .../gcc.dg/torture/no-callee-saved-run-1a.c   | 23 +++
>  .../gcc.dg/torture/no-callee-saved-run-1b.c   | 59 ++
>  

Re: [PATCH] i386: Modify testcases failed under -DDEBUG

2024-01-24 Thread Hongtao Liu
On Mon, Jan 22, 2024 at 10:31 AM Haochen Jiang  wrote:
>
> Hi all,
>
> Recently, I happened to run i386.exp under -DDEBUG and found some failures.
>
> This patch aims to fix that. Ok for trunk?
OK.
>
> Thx,
> Haochen
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/adx-check.h: Include stdio.h when DEBUG
> is defined.
> * gcc.target/i386/avx512fp16-vscalefph-1b.c: Do not define
> DEBUG.
> * gcc.target/i386/avx512fp16vl-vaddph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vcmpph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vdivph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vfpclassph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vgetexpph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vgetmantph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vmaxph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vminph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vmulph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vrcpph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vreduceph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vrndscaleph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vrsqrtph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vscalefph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vsqrtph-1b.c: Ditto.
> * gcc.target/i386/avx512fp16vl-vsubph-1b.c: Ditto.
> * gcc.target/i386/readeflags-1.c: Include stdio.h when DEBUG
> is defined.
> * gcc.target/i386/rtm-check.h: Ditto.
> * gcc.target/i386/sha-check.h: Ditto.
> * gcc.target/i386/writeeflags-1.c: Ditto.
> ---
>  gcc/testsuite/gcc.target/i386/adx-check.h   | 3 +++
>  gcc/testsuite/gcc.target/i386/avx512fp16-vscalefph-1b.c | 3 ---
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vaddph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vcmpph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vdivph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vfpclassph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vgetexpph-1b.c   | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vgetmantph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vmaxph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vminph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vmulph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vrcpph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vreduceph-1b.c   | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vrndscaleph-1b.c | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vrsqrtph-1b.c| 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vscalefph-1b.c   | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vsqrtph-1b.c | 1 -
>  gcc/testsuite/gcc.target/i386/avx512fp16vl-vsubph-1b.c  | 1 -
>  gcc/testsuite/gcc.target/i386/readeflags-1.c| 3 +++
>  gcc/testsuite/gcc.target/i386/rtm-check.h   | 3 +++
>  gcc/testsuite/gcc.target/i386/sha-check.h   | 3 +++
>  gcc/testsuite/gcc.target/i386/writeeflags-1.c   | 3 +++
>  22 files changed, 15 insertions(+), 19 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/i386/adx-check.h 
> b/gcc/testsuite/gcc.target/i386/adx-check.h
> index cfed1a38483..45435b91d0e 100644
> --- a/gcc/testsuite/gcc.target/i386/adx-check.h
> +++ b/gcc/testsuite/gcc.target/i386/adx-check.h
> @@ -1,5 +1,8 @@
>  #include 
>  #include "cpuid.h"
> +#ifdef DEBUG
> +#include 
> +#endif
>
>  static void adx_test (void);
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vscalefph-1b.c 
> b/gcc/testsuite/gcc.target/i386/avx512fp16-vscalefph-1b.c
> index 7c7288d6eb3..0ba9ec57f37 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16-vscalefph-1b.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vscalefph-1b.c
> @@ -1,9 +1,6 @@
>  /* { dg-do run { target avx512fp16 } } */
>  /* { dg-options "-O2 -mavx512fp16 -mavx512dq" } */
>
> -
> -#define DEBUG
> -
>  #define AVX512FP16
>  #include "avx512fp16-helper.h"
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-vaddph-1b.c 
> b/gcc/testsuite/gcc.target/i386/avx512fp16vl-vaddph-1b.c
> index fcf6a9058f5..1db7c565262 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16vl-vaddph-1b.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-vaddph-1b.c
> @@ -1,7 +1,6 @@
>  /* { dg-do run { target avx512fp16 } } */
>  /* { dg-options "-O2 -mavx512fp16 -mavx512vl -mavx512dq" } */
>
> -#define DEBUG
>  #define AVX512VL
>  #define AVX512F_LEN 256
>  #define AVX512F_LEN_HALF 128
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-vcmpph-1b.c 
> b/gcc/testsuite/gcc.target/i386/avx512fp16vl-vcmpph-1b.c
> index c201a9258bf..bbd366a5d29 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512fp16vl-vcmpph-1b.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-vcmpph-1b.c
> @@ -1,7 +1,6 @@
>  /* { dg-do run { target 

Re: [PATCH 1/2] x86: Add no_callee_saved_registers function attribute

2024-01-21 Thread Hongtao Liu
On Sat, Jan 20, 2024 at 10:30 PM H.J. Lu  wrote:
>
> When an interrupt handler is implemented by an assembly stub which does:
>
> 1. Save all registers.
> 2. Call a C function.
> 3. Restore all registers.
> 4. Return from interrupt.
>
> it is completely unnecessary to save and restore any registers in the C
> function called by the assembly stub, even if they would normally be
> callee-saved.
>
> Add no_callee_saved_registers function attribute, which is complementary
> to no_caller_saved_registers function attribute, to mark a function which
> doesn't have any callee-saved registers.  Such a function won't save and
> restore any registers.  Classify function call-saved register handling
> type with:
>
> 1. Default call-saved registers.
> 2. No caller-saved registers with no_caller_saved_registers attribute.
> 3. No callee-saved registers with no_callee_saved_registers attribute.
>
> Disallow sibcall if callee is a no_callee_saved_registers function
> and caller isn't a no_callee_saved_registers function.  Otherwise,
> callee-saved registers won't be preserved.
>
> After a no_callee_saved_registers function is called, all registers may
> be clobbered.  If the calling function isn't a no_callee_saved_registers
> function, we need to preserve all registers which aren't used by function
> calls.
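(Just to make the intended usage concrete -- an untested sketch with a
made-up handler name, assuming the assembly stub saves and restores every
register around the call:

__attribute__((no_callee_saved_registers))
void isr_worker (void)
{
  /* No callee-saved register save/restore is emitted for this function;
     the assembly stub that calls it is responsible for preserving
     everything.  */
}
)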
>
> gcc/
>
> PR target/103503
> PR target/113312
> * config/i386/i386-expand.cc (ix86_expand_call): Set
> call_no_callee_saved_registers to true when calling function
> with no_callee_saved_registers attribute.  Replace
> no_caller_saved_registers check with call_saved_registers check.
> * config/i386/i386-options.cc (ix86_set_func_type): Set
> call_saved_registers to TYPE_NO_CALLEE_SAVED_REGISTERS for
> noreturn function.  Disallow no_callee_saved_registers with
> interrupt or no_caller_saved_registers attributes together.
> (ix86_set_current_function): Replace no_caller_saved_registers
> check with call_saved_registers check.
> (ix86_handle_no_caller_saved_registers_attribute): Renamed to ...
> (ix86_handle_call_saved_registers_attribute): This.
> (ix86_gnu_attributes): Add
> ix86_handle_call_saved_registers_attribute.
> * config/i386/i386.cc (ix86_conditional_register_usage): Replace
> no_caller_saved_registers check with call_saved_registers check.
> (ix86_function_ok_for_sibcall): Don't allow callee with
> no_callee_saved_registers attribute when the calling function
> has callee-saved registers.
> (ix86_comp_type_attributes): Also check
> no_callee_saved_registers.
> (ix86_epilogue_uses): Replace no_caller_saved_registers check
> with call_saved_registers check.
> (ix86_hard_regno_scratch_ok): Likewise.
> (ix86_save_reg): Replace no_caller_saved_registers check with
> call_saved_registers check.  Don't save any registers for
> TYPE_NO_CALLEE_SAVED_REGISTERS.  Save all registers with
> TYPE_DEFAULT_CALL_SAVED_REGISTERS if function with
> no_callee_saved_registers attribute is called.
> (find_drap_reg): Replace no_caller_saved_registers check with
> call_saved_registers check.
> * config/i386/i386.h (call_saved_registers_type): New enum.
> (machine_function): Replace no_caller_saved_registers with
> call_saved_registers.  Add call_no_callee_saved_registers.
> * doc/extend.texi: Document no_callee_saved_registers attribute.
>
> gcc/testsuite/
>
> PR target/103503
> PR target/113312
> * gcc.dg/torture/no-callee-saved-run-1a.c: New file.
> * gcc.dg/torture/no-callee-saved-run-1b.c: Likewise.
> * gcc.target/i386/no-callee-saved-1.c: Likewise.
> * gcc.target/i386/no-callee-saved-2.c: Likewise.
> * gcc.target/i386/no-callee-saved-3.c: Likewise.
> * gcc.target/i386/no-callee-saved-4.c: Likewise.
> * gcc.target/i386/no-callee-saved-5.c: Likewise.
> * gcc.target/i386/no-callee-saved-6.c: Likewise.
> * gcc.target/i386/no-callee-saved-7.c: Likewise.
> * gcc.target/i386/no-callee-saved-8.c: Likewise.
> * gcc.target/i386/no-callee-saved-9.c: Likewise.
> * gcc.target/i386/no-callee-saved-10.c: Likewise.
> * gcc.target/i386/no-callee-saved-11.c: Likewise.
> * gcc.target/i386/no-callee-saved-12.c: Likewise.
> * gcc.target/i386/no-callee-saved-13.c: Likewise.
> * gcc.target/i386/no-callee-saved-14.c: Likewise.
> * gcc.target/i386/no-callee-saved-15.c: Likewise.
> * gcc.target/i386/no-callee-saved-16.c: Likewise.
> * gcc.target/i386/no-callee-saved-17.c: Likewise.
> * gcc.target/i386/no-callee-saved-18.c: Likewise.
> ---
>  gcc/config/i386/i386-expand.cc| 72 ---
>  gcc/config/i386/i386-options.cc   | 49 +
>  

Re: [PATCH] hwasan: Check if Intel LAM_U57 is enabled

2024-01-17 Thread Hongtao Liu
On Wed, Jan 10, 2024 at 12:47 AM H.J. Lu  wrote:
>
> When -fsanitize=hwaddress is used, libhwasan will try to enable LAM_U57
> in the startup code.  Update the target check to enable hwaddress tests
> if LAM_U57 is enabled.  Also compile hwaddress tests with -mlam=u57 on
> x86-64 since hwasan requires LAM_U57 on x86-64.
I've tested it on a LAM-enabled SRF, and it passed all hwasan testcases
except the following:

FAIL: c-c++-common/hwasan/alloca-outside-caught.c   -O0  output pattern test
FAIL: c-c++-common/hwasan/hwasan-poison-optimisation.c   -O1  scan-assembler-times bl\s*__hwasan_tag_mismatch4 1
FAIL: c-c++-common/hwasan/hwasan-poison-optimisation.c   -O2  scan-assembler-times bl\s*__hwasan_tag_mismatch4 1
FAIL: c-c++-common/hwasan/hwasan-poison-optimisation.c   -O3 -g  scan-assembler-times bl\s*__hwasan_tag_mismatch4 1
FAIL: c-c++-common/hwasan/hwasan-poison-optimisation.c   -Os  scan-assembler-times bl\s*__hwasan_tag_mismatch4 1
FAIL: c-c++-common/hwasan/hwasan-poison-optimisation.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none   scan-assembler-times bl\s*__hwasan_tag_mismatch4 1
FAIL: c-c++-common/hwasan/hwasan-poison-optimisation.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects   scan-assembler-times bl\s*__hwasan_tag_mismatch4 1
FAIL: c-c++-common/hwasan/vararray-outside-caught.c   -O0  output pattern test

Basically these are testcase issues; the testcases need to be adjusted
for x86 (e.g. letting the scan also accept the x86 call mnemonic, see the
sketch below).  I'll commit a separate patch for those after this commit
is upstream.
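Something along these lines (untested, just to illustrate the direction)
in hwasan-poison-optimisation.c would let the scan match the x86 call as
well:

/* { dg-final { scan-assembler-times {(bl|call)\s*__hwasan_tag_mismatch4} 1 } } */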
I've also tested the patch on platforms without LAM support; there all
hwasan testcases show up as unsupported.
So the patch LGTM.

>
> * lib/hwasan-dg.exp (check_effective_target_hwaddress_exec):
> Return 1 if Intel LAM_U57 is enabled.
> (hwasan_init): Add -mlam=u57 on x86-64.
> ---
>  gcc/testsuite/lib/hwasan-dg.exp | 25 ++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/testsuite/lib/hwasan-dg.exp b/gcc/testsuite/lib/hwasan-dg.exp
> index e9c5ef6524d..76057502ee6 100644
> --- a/gcc/testsuite/lib/hwasan-dg.exp
> +++ b/gcc/testsuite/lib/hwasan-dg.exp
> @@ -44,11 +44,25 @@ proc check_effective_target_hwaddress_exec {} {
> #ifdef __cplusplus
> extern "C" {
> #endif
> +   extern int arch_prctl (int, unsigned long int *);
> extern int prctl(int, unsigned long, unsigned long, unsigned long, 
> unsigned long);
> #ifdef __cplusplus
> }
> #endif
> int main (void) {
> +   #ifdef __x86_64__
> +   # ifdef __LP64__
> +   #  define ARCH_GET_UNTAG_MASK 0x4001
> +   #  define LAM_U57_MASK (0x3fULL << 57)
> + unsigned long mask = 0;
> > + if (arch_prctl(ARCH_GET_UNTAG_MASK, &mask) != 0)
> +   return 1;
> + if (mask != ~LAM_U57_MASK)
> +   return 1;
> + return 0;
> +   # endif
> + return 1;
> +   #else
> #define PR_SET_TAGGED_ADDR_CTRL 55
> #define PR_GET_TAGGED_ADDR_CTRL 56
> #define PR_TAGGED_ADDR_ENABLE (1UL << 0)
> @@ -58,6 +72,7 @@ proc check_effective_target_hwaddress_exec {} {
>   || !prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0))
> return 1;
>   return 0;
> +   #endif
> }
>  }] {
> return 0;
> @@ -102,6 +117,10 @@ proc hwasan_init { args } {
>
>  setenv HWASAN_OPTIONS "random_tags=0"
>
> +if [istarget x86_64-*-*] {
> +  set target_hwasan_flags "-mlam=u57"
> +}
> +
>  set link_flags ""
>  if ![is_remote host] {
> if [info exists TOOL_OPTIONS] {
> @@ -119,12 +138,12 @@ proc hwasan_init { args } {
>  if [info exists ALWAYS_CXXFLAGS] {
> set hwasan_saved_ALWAYS_CXXFLAGS $ALWAYS_CXXFLAGS
> set ALWAYS_CXXFLAGS [concat "{ldflags=$link_flags}" $ALWAYS_CXXFLAGS]
> -   set ALWAYS_CXXFLAGS [concat "{additional_flags=-fsanitize=hwaddress 
> --param hwasan-random-frame-tag=0 -g $include_flags}" $ALWAYS_CXXFLAGS]
> +   set ALWAYS_CXXFLAGS [concat "{additional_flags=-fsanitize=hwaddress 
> $target_hwasan_flags --param hwasan-random-frame-tag=0 -g $include_flags}" 
> $ALWAYS_CXXFLAGS]
>  } else {
> if [info exists TEST_ALWAYS_FLAGS] {
> -   set TEST_ALWAYS_FLAGS "$link_flags -fsanitize=hwaddress --param 
> hwasan-random-frame-tag=0 -g $include_flags $TEST_ALWAYS_FLAGS"
> +   set TEST_ALWAYS_FLAGS "$link_flags -fsanitize=hwaddress 
> $target_hwasan_flags --param hwasan-random-frame-tag=0 -g $include_flags 
> $TEST_ALWAYS_FLAGS"
> } else {
> -   set TEST_ALWAYS_FLAGS "$link_flags -fsanitize=hwaddress --param 
> hwasan-random-frame-tag=0 -g $include_flags"
> +   set TEST_ALWAYS_FLAGS "$link_flags -fsanitize=hwaddress 
> $target_hwasan_flags --param hwasan-random-frame-tag=0 -g $include_flags"
> }
>  }
>  }
> --
> 2.43.0
>


-- 
BR,
Hongtao


Re: [x86 PATCH] PR target/106060: Improved SSE vector constant materialization.

2024-01-16 Thread Hongtao Liu
On Wed, Jan 17, 2024 at 5:59 AM Roger Sayle  wrote:
>
>
> I thought I'd just missed the bug fixing season of stage3, but there
> appears to a little latitude in early stage4 (for vector patches), so
> I'll post this now.
>
> This patch resolves PR target/106060 by providing efficient methods for
> materializing/synthesizing special "vector" constants on x86.  Currently
> there are three methods of materializing a vector constant; the most
> general is to load a vector from the constant pool, secondly "duplicated"
> constants can be synthesized by moving an integer between units and
> broadcasting (or shuffling it), and finally the special cases of the
> all-zeros vector and all-ones vectors can be loaded via a single SSE
> instruction.   This patch handles additional cases that can be synthesized
> in two instructions, loading an all-ones vector followed by another SSE
> instruction.  Following my recent patch for PR target/112992, there's
> conveniently a single place in i386-expand.cc where these special cases
> can be handled.
>
> Two examples are given in the original bugzilla PR for 106060.
>
> __m256i
> should_be_cmpeq_abs ()
> {
>   return _mm256_set1_epi8 (1);
> }
>
> is now generated (with -O3 -march=x86-64-v3) as:
>
> vpcmpeqd%ymm0, %ymm0, %ymm0
> vpabsb  %ymm0, %ymm0
> ret
>
> and
>
> __m256i
> should_be_cmpeq_add ()
> {
>   return _mm256_set1_epi8 (-2);
> }
>
> is now generated as:
>
> vpcmpeqd%ymm0, %ymm0, %ymm0
> vpaddb  %ymm0, %ymm0, %ymm0
> ret
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2024-01-16  Roger Sayle  
>
> gcc/ChangeLog
> PR target/106060
> * config/i386/i386-expand.cc (enum ix86_vec_bcast_alg): New.
> (struct ix86_vec_bcast_map_simode_t): New type for table below.
> (ix86_vec_bcast_map_simode): Table of SImode constants that may
> be efficiently synthesized by a ix86_vec_bcast_alg method.
> (ix86_vec_bcast_map_simode_cmp): New comparator for bsearch.
> (ix86_vector_duplicate_simode_const): Efficiently synthesize
> V4SImode and V8SImode constants that duplicate special constants.
> (ix86_vector_duplicate_value): Attempt to synthesize "special"
> vector constants using ix86_vector_duplicate_simode_const.
> * config/i386/i386.cc (ix86_rtx_costs) : ABS of a
> vector integer mode costs with a single SSE instruction.
>

+  switch (entry->alg)
+{
+case VEC_BCAST_PXOR:
+  if (mode == V8SImode && !TARGET_AVX2)
+ return false;
+  emit_move_insn (target, CONST0_RTX (mode));
+  return true;
+case VEC_BCAST_PCMPEQ:
+  if ((mode == V4SImode && !TARGET_SSE2)
+  || (mode == V8SImode && !TARGET_AVX2))
+ return false;
+  emit_move_insn (target, CONSTM1_RTX (mode));
+  return true;

I think we need to prevent those standard_sse_constant_p cases from
getting into ix86_expand_vector_init_duplicate via the code below.

  /* If all values are identical, broadcast the value.  */
  if (all_same
  && (nvars != 0 || !standard_sse_constant_p (gen_rtx_CONST_VECTOR
(mode, XVEC (vals, 0)), mode))
  && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
XVECEXP (vals, 0, 0)))
return;

+case VEC_BCAST_PABSB:
+  if (mode == V4SImode)
+ {
+  tmp1 = gen_reg_rtx (V16QImode);
+  emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
+  tmp2 = gen_reg_rtx (V16QImode);
+  emit_insn (gen_absv16qi2 (tmp2, tmp1));
Shouldn't it rely on TARGET_SSE2?

+case VEC_BCAST_PADDB:
+  if (mode == V4SImode)
+ {
+  tmp1 = gen_reg_rtx (V16QImode);
+  emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
+  tmp2 = gen_reg_rtx (V16QImode);
+  emit_insn (gen_addv16qi3 (tmp2, tmp1, tmp1));
Ditto here and for all logic shift cases.
+ }

+
+  if ((mode == V4SImode || mode == V8SImode)
+  && CONST_INT_P (val)
+  && ix86_vector_duplicate_simode_const (mode, target, INTVAL (val)))
+return true;
+
An alternative way is to add a pre-reload define_insn_and_split to match
the specific const_vector and split it into the new instructions (see the
sketch below).  In theory, the constant info can then be retained until
combine, which should enable more simplification.
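A minimal, untested sketch of what that could look like (the predicate
name const_vector_set1_epi8_operand is made up for illustration; a real
patch would add a predicate that accepts exactly the broadcast constants
being handled):

(define_insn_and_split "*ssse3_set1_epi8_1_via_abs"
  [(set (match_operand:V16QI 0 "register_operand" "=x")
        (match_operand:V16QI 1 "const_vector_set1_epi8_operand"))]
  "TARGET_SSSE3 && ix86_pre_reload_split ()"
  "#"
  "&& 1"
  [(set (match_dup 0) (match_dup 2))
   (set (match_dup 0) (abs:V16QI (match_dup 0)))]
{
  /* Emit all-ones first, then the byte-wise absolute value, which
     produces the set1_epi8 (1) broadcast without a constant-pool load.  */
  operands[2] = CONSTM1_RTX (V16QImode);
})

That way the const_vector survives as-is until the split, so combine still
sees the original constant.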

Also the patch can be extended to V16SImode, but that can be a separate patch.

> gcc/testsuite/ChangeLog
> PR target/106060
> * gcc.target/i386/auto-init-8.c: Update test case.
> * gcc.target/i386/avx512fp16-3.c: Likewise.
> * gcc.target/i386/pr100865-9a.c: Likewise.
> * gcc.target/i386/pr106060-1.c: New test case.
> * gcc.target/i386/pr106060-2.c: Likewise.
> * gcc.target/i386/pr106060-3.c: Likewise.
> * gcc.target/i386/pr70314-3.c: Update test case.
> * gcc.target/i386/vect-shiftv4qi.c: Likewise.
> * gcc.target/i386/vect-shiftv8qi.c: Likewise.
>
>
> Thanks in advance,
> Roger
> 

Re: [PATCH] Update documents for fcf-protection=

2024-01-11 Thread Hongtao Liu
On Thu, Jan 11, 2024 at 12:06 AM H.J. Lu  wrote:
>
> On Tue, Jan 9, 2024 at 6:02 PM liuhongt  wrote:
> >
> > After r14-2692-g1c6231c05bdcca, the option is defined as EnumSet and
> > -fcf-protection=branch won't unset any others bits since they're in
> > different groups. So to override -fcf-protection, an explicit
> > -fcf-protection=none needs to be added and then with
> > -fcf-protection=XXX
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
>
> We should mention:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113039
Changed, and committed.
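(For reference, a hypothetical command line showing the documented
overriding pattern -- here -fcf-protection=full stands in for a value
inherited from earlier flags:

gcc -O2 -fcf-protection=full -fcf-protection=none -fcf-protection=branch foo.c

The explicit =none resets the setting, so only the later =branch takes
effect.)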
>
> > * doc/invoke.texi (fcf-protection=): Update documents.
> > ---
> >  gcc/doc/invoke.texi | 3 +++
> >  1 file changed, 3 insertions(+)
> >
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 68d1f364ac0..d1e6fafb98c 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -17734,6 +17734,9 @@ function.  The value @code{full} is an alias for 
> > specifying both
> >  @code{branch} and @code{return}. The value @code{none} turns off
> >  instrumentation.
> >
> > +To override @option{-fcf-protection}, @option{-fcf-protection=none}
> > +needs to be explicitly added and then with @option{-fcf-protection=xxx}.
> > +
> >  The value @code{check} is used for the final link with link-time
> >  optimization (LTO).  An error is issued if LTO object files are
> >  compiled with different @option{-fcf-protection} values.  The
> > --
> > 2.31.1
> >
>
>
> --
> H.J.



-- 
BR,
Hongtao


Re: [PATCH] i386: Add AVX10.1 related macros

2024-01-11 Thread Hongtao Liu
On Fri, Jan 12, 2024 at 10:55 AM Jiang, Haochen  wrote:
>
> > -Original Message-
> > From: Richard Biener 
> > Sent: Thursday, January 11, 2024 4:19 PM
> > To: Liu, Hongtao 
> > Cc: Jiang, Haochen ; gcc-patches@gcc.gnu.org;
> > ubiz...@gmail.com; bur...@net-b.de; san...@codesourcery.com
> > Subject: Re: [PATCH] i386: Add AVX10.1 related macros
> >
> > On Thu, Jan 11, 2024 at 2:16 AM Liu, Hongtao 
> > wrote:
> > >
> > >
> > >
> > > > -Original Message-
> > > > From: Richard Biener 
> > > > Sent: Wednesday, January 10, 2024 5:44 PM
> > > > To: Liu, Hongtao 
> > > > Cc: Jiang, Haochen ;
> > > > gcc-patches@gcc.gnu.org; ubiz...@gmail.com; bur...@net-b.de;
> > > > san...@codesourcery.com
> > > > Subject: Re: [PATCH] i386: Add AVX10.1 related macros
> > > >
> > > > On Wed, Jan 10, 2024 at 9:01 AM Liu, Hongtao 
> > > > wrote:
> > > > >
> > > > >
> > > > >
> > > > > > -Original Message-
> > > > > > From: Jiang, Haochen 
> > > > > > Sent: Wednesday, January 10, 2024 3:35 PM
> > > > > > To: gcc-patches@gcc.gnu.org
> > > > > > Cc: Liu, Hongtao ; ubiz...@gmail.com;
> > > > > > burnus@net- b.de; san...@codesourcery.com
> > > > > > Subject: [PATCH] i386: Add AVX10.1 related macros
> > > > > >
> > > > > > Hi all,
> > > > > >
> > > > > > This patch aims to add AVX10.1 related macros for libgomp's request.
> > > > > > The request comes following:
> > > > > >
> > > > > > https://gcc.gnu.org/pipermail/gcc-patches/2024-January/642025.ht
> > > > > > ml
> > > > > >
> > > > > > Ok for trunk?
> > > > > >
> > > > > > Thx,
> > > > > > Haochen
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > >   PR target/113288
> > > > > >   * config/i386/i386-c.cc (ix86_target_macros_internal):
> > > > > >   Add __AVX10_1__, __AVX10_1_256__ and __AVX10_1_512__.
> > > > > > ---
> > > > > >  gcc/config/i386/i386-c.cc | 7 +++
> > > > > >  1 file changed, 7 insertions(+)
> > > > > >
> > > > > > diff --git a/gcc/config/i386/i386-c.cc
> > > > > > b/gcc/config/i386/i386-c.cc index c3ae984670b..366b560158a
> > > > > > 100644
> > > > > > --- a/gcc/config/i386/i386-c.cc
> > > > > > +++ b/gcc/config/i386/i386-c.cc
> > > > > > @@ -735,6 +735,13 @@ ix86_target_macros_internal
> > (HOST_WIDE_INT
> > > > > > isa_flag,
> > > > > >  def_or_undef (parse_in, "__EVEX512__");
> > > > > >if (isa_flag2 & OPTION_MASK_ISA2_USER_MSR)
> > > > > >  def_or_undef (parse_in, "__USER_MSR__");
> > > > > > +  if (isa_flag2 & OPTION_MASK_ISA2_AVX10_1_256)
> > > > > > +{
> > > > > > +  def_or_undef (parse_in, "__AVX10_1_256__");
> > > > > > +  def_or_undef (parse_in, "__AVX10_1__");
> > > > > I think this is not needed, others LGTM.
> > > >
> > > > So __AVX10_1_256__ and __AVX10_1_512__ are redundant with
> > > > __AVX10_1__ and __EVEX512__, right?
> > > No, I mean __AVX10_1__ is redundant of __AVX10_1_256__ since -
> > mavx10.1 is just alias of -mavx10.1-256.
> > > We want explicit __AVX10_1_256__ and __AVX10_1_512__ and don't want
> > mix __EVEX512__ with AVX10(They are related in their internal
> > implementation, but we don't want the user to control the vector length of
> > avx10 with -mno-evex512, -mno-evex512 is supposed for the existing
> > AVX512).
>
> Let's keep both of them if we prefer __AVX10_1_256__ since I just found
> that LLVM got macro __AVX10_1__.
>
> https://github.com/llvm/llvm-project/pull/67278/files#diff-7435d50346a810555df89deb1f879b767ee985ace43fb3990de17fb23a47f004
>
> in file clang/lib/Basic/Targets/X86.cpp L774-777.
Ok.
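(So user code can then key 256-bit-only paths off the new macros; an
untested illustration:

#if defined (__AVX10_1_256__) && !defined (__AVX10_1_512__)
/* AVX10.1 with a maximum vector width of 256 bits.  */
#endif
)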
>
> Thx,
> Haochen
>
> >
> > Ah, that makes sense.
> >
> > > > > > +}
> > > > > > +  if (isa_flag2 & OPTION_MASK_ISA2_AVX10_1_512)
> > > > > > +def_or_undef (parse_in, "__AVX10_1_512__");
> > > > > >if (TARGET_IAMCU)
> > > > > >  {
> > > > > >def_or_undef (parse_in, "__iamcu");
> > > > > > --
> > > > > > 2.31.1
> > > > >



-- 
BR,
Hongtao


Re: [PATCH] i386: [APX] Document inline asm behavior and new switch for APX

2024-01-10 Thread Hongtao Liu
On Thu, Jan 11, 2024 at 7:06 AM Andi Kleen  wrote:
>
> Hongtao Liu  writes:
> >>
> >> +@opindex mapx-inline-asm-use-gpr32
> >> +@item -mapx-inline-asm-use-gpr32
> >> +When APX_F enabled, EGPR usage was by default disabled to prevent
> >> +unexpected EGPR generation in instructions that does not support it.
> >> +To invoke EGPR usage in inline asm, use this switch to allow EGPR in
> >> +inline asm, while user should ensure the asm actually supports EGPR.
> > Please align with
> > https://gcc.gnu.org/pipermail/gcc-patches/2024-January/642228.html.
> > Ok after changing that.
>
> BTW I think we would need a way to specify this individually per inline
> asm statement too.
>
> Otherwise a library which wants to use APX inline asm in the header
> never can do so until all its users set the option, which will be
> awkward to deploy.
>
> Perhaps it could be a magic clobber string.
We do have new constraint strings for gpr32 or gpr16 for registers, but
not for memory, due to a restriction of the GCC RA infrastructure, which
assumes a universal BASE_REG_CLASS/INDEX_REG_CLASS for all inline asm.
>
> -andi



-- 
BR,
Hongtao


Re: [PATCH] i386: [APX] Document inline asm behavior and new switch for APX

2024-01-10 Thread Hongtao Liu
On Tue, Jan 9, 2024 at 3:09 PM Hongyu Wang  wrote:
>
> Hi,
>
> For APX, the inline asm behavior was not mentioned in any document
> before. Add description for it.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
> * config/i386/i386.opt: Adjust document.
> * doc/invoke.texi: Add description for
> -mapx-inline-asm-use-gpr32.
> ---
>  gcc/config/i386/i386.opt | 3 +--
>  gcc/doc/invoke.texi  | 7 +++
>  2 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index a38e92baf92..5b4f1bff25f 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1357,8 +1357,7 @@ Enum(apx_features) String(all) Value(apx_all) Set(1)
>
>  mapx-inline-asm-use-gpr32
>  Target Var(ix86_apx_inline_asm_use_gpr32) Init(0)
> -Enable GPR32 in inline asm when APX_EGPR enabled, do not
> -hook reg or mem constraint in inline asm to GPR16.
> +Enable GPR32 in inline asm when APX_F enabled.
>
>  mevex512
>  Target Mask(ISA2_EVEX512) Var(ix86_isa_flags2) Save
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 68d1f364ac0..47fd96648d8 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -35272,6 +35272,13 @@ r8-r15 registers so that the call and jmp 
> instruction length is 6 bytes
>  to allow them to be replaced with @samp{lfence; call *%r8-r15} or
>  @samp{lfence; jmp *%r8-r15} at run-time.
>
> +@opindex mapx-inline-asm-use-gpr32
> +@item -mapx-inline-asm-use-gpr32
> +When APX_F enabled, EGPR usage was by default disabled to prevent
> +unexpected EGPR generation in instructions that does not support it.
> +To invoke EGPR usage in inline asm, use this switch to allow EGPR in
> +inline asm, while user should ensure the asm actually supports EGPR.
Please align with
https://gcc.gnu.org/pipermail/gcc-patches/2024-January/642228.html.
Ok after changing that.
> +
>  @end table
>
>  These @samp{-m} switches are supported in addition to the above
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] i386: [APX] Add missing document for APX

2024-01-07 Thread Hongtao Liu
On Mon, Jan 8, 2024 at 11:09 AM Hongyu Wang  wrote:
>
> Hi,
>
> The supported sub-features for APX was missing in option document and
> target attribute section. Add those missing ones.
>
> Ok for trunk?
Ok.
>
> gcc/ChangeLog:
>
> * config/i386/i386.opt: Add supported sub-features.
> * doc/extend.texi: Add description for target attribute.
> ---
>  gcc/config/i386/i386.opt | 3 ++-
>  gcc/doc/extend.texi  | 6 ++
>  2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index 1bfff1e0d82..a38e92baf92 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1328,7 +1328,8 @@ Enable vectorization for scatter instruction.
>
>  mapxf
>  Target Mask(ISA2_APX_F) Var(ix86_isa_flags2) Save
> -Support APX code generation.
> +Support code generation for APX features, including EGPR, PUSH2POP2,
> +NDD and PPX.
>
>  mapx-features=
>  Target Undocumented Joined Enum(apx_features) EnumSet Var(ix86_apx_features) 
> Init(apx_none) Save
> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> index 9e61ba9507d..84eef411e2d 100644
> --- a/gcc/doc/extend.texi
> +++ b/gcc/doc/extend.texi
> @@ -7344,6 +7344,12 @@ Enable/disable the generation of the SM4 instructions.
>  @itemx no-usermsr
>  Enable/disable the generation of the USER_MSR instructions.
>
> +@cindex @code{target("apxf")} function attribute, x86
> +@item apxf
> +@itemx no-apxf
> +Enable/disable the generation of the APX features, including
> +EGPR, PUSH2POP2, NDD and PPX.
> +
>  @cindex @code{target("avx10.1")} function attribute, x86
>  @item avx10.1
>  @itemx no-avx10.1
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: Disable FMADD in chains for Zen4 and generic

2024-01-07 Thread Hongtao Liu
On Thu, Dec 14, 2023 at 12:03 AM Jan Hubicka  wrote:
>
> > > The diffrerence is that Cores understand the fact that fmadd does not need
> > > all three parameters to start computation, while Zen cores doesn't.
> > >
> > > Since this seems noticeable win on zen and not loss on Core it seems like 
> > > good
> > > default for generic.
> > >
> > > I plan to commit the patch next week if there are no compplains.
> > The generic part LGTM.(It's exactly what we proposed in [1])
> >
> > [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637721.html
>
> Thanks.  I wonder if can think of other generic changes that would make
> sense to do?
> Concerning zen4 and FMA, it is not really win with AVX512 enabled
> (which is what I was benchmarking for znver4 tuning), but indeed it is
> win with AVX256 where the extra latency is not hidden by the parallelism
> exposed by doing evertyhing twice.
>
> I re-benmchmarked zen4 and it behaves similarly to zen3 with avx256, so
> for x86-64-v3 this makes sense.
>
> Honza
> > >
> > > Honza
> > >
> > > #include <stdio.h>
> > > #include <time.h>
> > >
> > > #define SIZE 1000
> > >
> > > float a[SIZE][SIZE];
> > > float b[SIZE][SIZE];
> > > float c[SIZE][SIZE];
> > >
> > > void init(void)
> > > {
> > >int i, j, k;
> > >    for(i=0; i<SIZE; i++)
> > >    {
> > >       for(j=0; j<SIZE; j++)
> > >       {
> > >  a[i][j] = (float)i + j;
> > >  b[i][j] = (float)i - j;
> > >  c[i][j] = 0.0f;
> > >   }
> > >}
> > > }
> > >
> > > void mult(void)
> > > {
> > >int i, j, k;
> > >
> > >    for(i=0; i<SIZE; i++)
> > >    {
> > >       for(j=0; j<SIZE; j++)
> > >       {
> > >          for(k=0; k<SIZE; k++)
> > >          {
> > > c[i][j] += a[i][k] * b[k][j];
> > >  }
> > >   }
> > >}
> > > }
> > >
> > > int main(void)
> > > {
> > >clock_t s, e;
> > >
> > >init();
> > >s=clock();
> > >mult();
> > >e=clock();
> > >printf("mult took %10d clocks\n", (int)(e-s));
> > >
> > >return 0;
> > >
> > > }
> > >
> > > * confg/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS, 
> > > X86_TUNE_AVOID_256FMA_CHAINS)
> > > Enable for znver4 and Core.
> > >
> > > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> > > index 43fa9e8fd6d..74b03cbcc60 100644
> > > --- a/gcc/config/i386/x86-tune.def
> > > +++ b/gcc/config/i386/x86-tune.def
> > > @@ -515,13 +515,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
> > > "use_scatter_8parts",
> > >
> > >  /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit 
> > > or
> > > smaller FMA chain.  */
> > > -DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
> > > m_ZNVER2 | m_ZNVER3
> > > -  | m_YONGFENG)
> > > +DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
> > > m_ZNVER2 | m_ZNVER3 | m_ZNVER4
> > > +  | m_YONGFENG | m_GENERIC)
> > >
> > >  /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit 
> > > or
> > > smaller FMA chain.  */
> > > -DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 
> > > | m_ZNVER3
> > > - | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM)
> > > +DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 
> > > | m_ZNVER3 | m_ZNVER4
> > > + | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
Can we backport the patch (at least the generic part) to the GCC 11,
GCC 12 and GCC 13 release branches?
> > >
> > >  /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit 
> > > or
> > > smaller FMA chain.  */
> >
> >
> >
> > --
> > BR,
> > Hongtao



-- 
BR,
Hongtao


Re: [x86_64 PATCH] PR target/112992: Optimize mode for broadcast of constants.

2024-01-07 Thread Hongtao Liu
On Sun, Jan 7, 2024 at 6:53 AM Roger Sayle  wrote:
>
> Hi Hongtao,
>
> Many thanks for the review.  This revised patch implements several
> of your suggestions, specifically to use pshufd for V4SImode and
> punpcklqdq for V2DImode.  These changes are demonstrated by the
> examples below:
>
> typedef unsigned int v4si __attribute((vector_size(16)));
> typedef unsigned long long v2di __attribute((vector_size(16)));
>
> v4si foo() { return (v4si){1,1,1,1}; }
> v2di bar() { return (v2di){1,1}; }
>
> The previous version of my patch generated:
>
> foo:movdqa  .LC0(%rip), %xmm0
> ret
> bar:movdqa  .LC1(%rip), %xmm0
> ret
>
> with this revised version, -O2 generates:
>
> foo:movl$1, %eax
> movd%eax, %xmm0
> pshufd  $0, %xmm0, %xmm0
> ret
> bar:movl$1, %eax
> movq%rax, %xmm0
> punpcklqdq  %xmm0, %xmm0
> ret
>
> However, if it's OK with you, I'd prefer to allow this function to
> return false, safely falling back to emitting a vector load from
> the constant bool rather than ICEing from a gcc_assert.  For one
Sure, that makes sense.
> thing this isn't a unrecoverable correctness issue, but at worst
> a missed optimization.  The deeper reason is that this usefully
> provides a handle for tuning on different microarchitectures.
> On some (AMD?) machines, where !TARGET_INTER_UNIT_MOVES_TO_VEC,
> the first form above may be preferable to the second.  Currently
> the start of ix86_convert_const_wide_int_to_broadcast disables
> broadcasts for !TARGET_INTER_UNIT_MOVES_TO_VEC even when an
> implementation doesn't reuire an inter unit move, such as a
> broadcast from memory.  I plan follow-up patches that benefit
> from this flexibility.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
Ok.
>
> gcc/ChangeLog
> PR target/112992
> * config/i386/i386-expand.cc
> (ix86_convert_const_wide_int_to_broadcast): Allow call to
> ix86_expand_vector_init_duplicate to fail, and return NULL_RTX.
> (ix86_broadcast_from_constant): Revert recent change; Return a
> suitable MEMREF independently of mode/target combinations.
> (ix86_expand_vector_move): Allow ix86_expand_vector_init_duplicate
> to decide whether expansion is possible/preferrable.  Only try
> forcing DImode constants to memory (and trying again) if calling
> ix86_expand_vector_init_duplicate fails with an DImode immediate
> constant.
> (ix86_expand_vector_init_duplicate) : Try using
> V4SImode for suitable immediate constants.
> : Try using V8SImode for suitable constants.
> : Fail for CONST_INT_P, i.e. use constant pool.
> : Likewise.
> : For CONST_INT_P try using V4SImode via widen.
> : For CONT_INT_P try using V8HImode via widen.
> : Handle CONT_INTs via simplify_binary_operation.
> Allow recursive calls to ix86_expand_vector_init_duplicate to fail.
> : For CONST_INT_P try V8SImode via widen.
> : For CONST_INT_P try V16HImode via widen.
> (ix86_expand_vector_init): Move try using a broadcast for all_same
> with ix86_expand_vector_init_duplicate before using constant pool.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/auto-init-8.c: Update test case.
> * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Likewise.
> * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Likewise.
> * gcc.target/i386/avx512fp16-13.c: Likewise.
> * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Likewise.
> * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Likewise.
> * gcc.target/i386/pr100865-1.c: Likewise.
> * gcc.target/i386/pr100865-10a.c: Likewise.
> * gcc.target/i386/pr100865-10b.c: Likewise.
> * gcc.target/i386/pr100865-2.c: Likewise.
> * gcc.target/i386/pr100865-3.c: Likewise.
> * gcc.target/i386/pr100865-4a.c: Likewise.
> * gcc.target/i386/pr100865-4b.c: Likewise.
> * gcc.target/i386/pr100865-5a.c: Likewise.
> * gcc.target/i386/pr100865-5b.c: Likewise.
>     * gcc.target/i386/pr100865-9a.c: Likewise.
> * gcc.target/i386/pr100865-9b.c: Likewise.
> * gcc.target/i386/pr102021.c: Likewise.
> * gcc.target/i386/pr90773-17.c: Likewise.
>
> Thanks in advance.
> Roger
> --
>
> > -Original Message-
> > From: Hongtao Liu 
> > Sent: 02 January 2024 05:40
> > To: Roger Sayle 
> > Cc: gcc-patches@gcc.gnu.org

Re: [x86_64 PATCH] PR target/112992: Optimize mode for broadcast of constants.

2024-01-01 Thread Hongtao Liu
On Fri, Dec 22, 2023 at 6:25 PM Roger Sayle  wrote:
>
>
> This patch resolves the second part of PR target/112992, building upon
> Hongtao Liu's solution to the first part.
>
> The issue addressed by this patch is that when initializing vectors by
> broadcasting integer constants, the compiler has the flexibility to
> select the most appropriate vector mode to perform the broadcast, as
> long as the resulting vector has an identical bit pattern.  For
> example, the following constants are all equivalent:
> V4SImode {0x01010101, 0x01010101, 0x01010101, 0x01010101 }
> V8HImode {0x0101, 0x0101, 0x0101, 0x0101, 0x0101, 0x0101, 0x0101, 0x0101 }
> V16QImode {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, ... 0x01 }
> So instruction sequences that construct any of these can be used to
> construct the others (with a suitable cast/SUBREG).
>
> On x86_64, it turns out that broadcasts of SImode constants are preferred,
> as DImode constants often require a longer movabs instruction, and
> HImode and QImode broadcasts require multiple uops on some architectures.
> Hence, SImode is always the equal shortest/fastest implementation.
>
> Examples of this improvement, can be seen in the testsuite.
>
> gcc.target/i386/pr102021.c
> Before:
>0:   48 b8 0c 00 0c 00 0cmovabs $0xc000c000c000c,%rax
>7:   00 0c 00
>a:   62 f2 fd 28 7c c0   vpbroadcastq %rax,%ymm0
>   10:   c3  retq
>
> After:
>0:   b8 0c 00 0c 00  mov$0xc000c,%eax
>5:   62 f2 7d 28 7c c0   vpbroadcastd %eax,%ymm0
>b:   c3  retq
>
> and
> gcc.target/i386/pr90773-17.c:
> Before:
>0:   48 8b 15 00 00 00 00mov0x0(%rip),%rdx# 7 
>7:   b8 0c 00 00 00  mov$0xc,%eax
>c:   62 f2 7d 08 7a c0   vpbroadcastb %eax,%xmm0
>   12:   62 f1 7f 08 7f 02   vmovdqu8 %xmm0,(%rdx)
>   18:   c7 42 0f 0c 0c 0c 0cmovl   $0xc0c0c0c,0xf(%rdx)
>   1f:   c3  retq
>
> After:
>0:   48 8b 15 00 00 00 00mov0x0(%rip),%rdx# 7 
>7:   b8 0c 0c 0c 0c  mov$0xc0c0c0c,%eax
>c:   62 f2 7d 08 7c c0   vpbroadcastd %eax,%xmm0
>   12:   62 f1 7f 08 7f 02   vmovdqu8 %xmm0,(%rdx)
>   18:   c7 42 0f 0c 0c 0c 0cmovl   $0xc0c0c0c,0xf(%rdx)
>   1f:   c3  retq
>
> where according to Agner Fog's instruction tables broadcastd is slightly
> faster on some microarchitectures, for example Knight's Landing.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2023-12-21  Roger Sayle  
>
> gcc/ChangeLog
> PR target/112992
> * config/i386/i386-expand.cc
> (ix86_convert_const_wide_int_to_broadcast): Allow call to
> ix86_expand_vector_init_duplicate to fail, and return NULL_RTX.
> (ix86_broadcast_from_constant): Revert recent change; Return a
> suitable MEMREF independently of mode/target combinations.
> (ix86_expand_vector_move): Allow ix86_expand_vector_init_duplicate
> to decide whether expansion is possible/preferrable.  Only try
> forcing DImode constants to memory (and trying again) if calling
> ix86_expand_vector_init_duplicate fails with an DImode immediate
> constant.
> (ix86_expand_vector_init_duplicate) : Try using
> V4SImode for suitable immediate constants.
> : Try using V8SImode for suitable constants.
> : Use constant pool for AVX without AVX2.
> : Fail for CONST_INT_P, i.e. use constant pool.
> : Likewise.
> : For CONST_INT_P try using V4SImode via widen.
> : For CONT_INT_P try using V8HImode via widen.
> : Handle CONT_INTs via simplify_binary_operation.
> Allow recursive calls to ix86_expand_vector_init_duplicate to fail.
> : For CONST_INT_P try V8SImode via widen.
> : For CONST_INT_P try V16HImode via widen.
> (ix86_expand_vector_init): Move try using a broadcast for all_same
> with ix86_expand_vector_init_duplicate before using constant pool.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Update test case.
> * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Likewise.
> * gcc.target/i386/avx512fp16-13.c: Likewise.
> * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Likewise.
> * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Likewise.
> * gcc.target/i386/pr100865-10a.c: Likewise.
> * gcc.target/i386/pr100865-10b.c: Likewise.
> * gcc.target/i386/pr100865-11c.c: Likewise.
> * gcc.target/i386/pr100865-12c.c: Likewise.
> * gcc.target/i386/pr100865-2.c: Likewise.
> * gcc.target/i386/pr100865-3.c: Likewise.
> * gcc.target/i386/pr100865-4a.c: Likewise.
> * gcc.target/i386/pr100865-4b.c: Likewise.
> * 

Re: [PATCH] i386: Allow 64 bit mask register for -mno-evex512

2023-12-19 Thread Hongtao Liu
On Fri, Dec 15, 2023 at 10:34 AM Haochen Jiang  wrote:
>
> Hi all,
>
> There is a recent change in AVX10 documentation which allows 64 bit mask
> register instructions in AVX10-256, the documentation comes following:
>
> Intel Advanced Vector Extensions 10 (Intel AVX10) Architecture Specification
> https://cdrdv2.intel.com/v1/dl/getContent/784267
> The Converged Vector ISA: Intel Advanced Vector Extensions 10 Technical Paper
> https://cdrdv2.intel.com/v1/dl/getContent/784343
>
> As a result, we will need to allow 64 bit mask register for -mno-evex512. The
> patch aims to add them.
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk?
Ok.
>
> Thx,
> Haochen
>
> gcc/ChangeLog:
>
> * config/i386/avx512bwintrin.h: Allow 64 bit mask intrin usage
> for -mno-evex512.
> * config/i386/i386-builtin.def: Remove OPTION_MASK_ISA2_EVEX512
> for 64 bit mask builtins.
> * config/i386/i386.cc (ix86_hard_regno_mode_ok): Allow 64 bit
> mask register for -mno-evex512.
> * config/i386/i386.md (SWI1248_AVX512BWDQ_64): Remove
> TARGET_EVEX512.
> (*zero_extendsidi2): Change isa attribute to avx512bw.
> (kmov_isa): Ditto.
> (*anddi_1): Ditto.
> (*andn_1): Remove TARGET_EVEX512.
> (*one_cmplsi2_1_zext): Change isa attribute to avx512bw.
> (*ashl3_1): Ditto.
> (*lshr3_1): Ditto.
> * config/i386/sse.md (SWI1248_AVX512BWDQ): Remove TARGET_EVEX512.
> (SWI1248_AVX512BW): Ditto.
> (SWI1248_AVX512BWDQ2): Ditto.
> (*knotsi_1_zext): Ditto.
> (kunpckdi): Ditto.
> (SWI24_MASK): Removed.
> (vec_pack_trunc_): Change iterator from SWI24_MASK to SWI24.
> (vec_unpacks_lo_di): Remove TARGET_EVEX512.
> (SWI48x_MASK): Removed.
> (vec_unpacks_hi_): Change iterator from SWI48x_MASK to SWI48x.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx10_1-6.c: Remove check for errors.
> * gcc.target/i386/noevex512-2.c: Diito.
> ---
>  gcc/config/i386/avx512bwintrin.h| 42 ++---
>  gcc/config/i386/i386-builtin.def| 28 +++---
>  gcc/config/i386/i386.cc |  3 +-
>  gcc/config/i386/i386.md | 20 +-
>  gcc/config/i386/sse.md  | 30 ++-
>  gcc/testsuite/gcc.target/i386/avx10_1-6.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/noevex512-2.c |  2 +-
>  7 files changed, 59 insertions(+), 68 deletions(-)
>
> diff --git a/gcc/config/i386/avx512bwintrin.h 
> b/gcc/config/i386/avx512bwintrin.h
> index d5ce79fd073..37fd7c68976 100644
> --- a/gcc/config/i386/avx512bwintrin.h
> +++ b/gcc/config/i386/avx512bwintrin.h
> @@ -34,6 +34,8 @@
>  #define __DISABLE_AVX512BW__
>  #endif /* __AVX512BW__ */
>
> +typedef unsigned long long __mmask64;
> +
>  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
> __artificial__))
>  _mm_avx512_set_epi32 (int __q3, int __q2, int __q1, int __q0)
>  {
> @@ -223,27 +225,6 @@ _kshiftri_mask32 (__mmask32 __A, unsigned int __B)
>
>  #endif
>
> -#ifdef __DISABLE_AVX512BW__
> -#undef __DISABLE_AVX512BW__
> -#pragma GCC pop_options
> -#endif /* __DISABLE_AVX512BW__ */
> -
> -#if !defined (__AVX512BW__) || !defined (__EVEX512__)
> -#pragma GCC push_options
> -#pragma GCC target("avx512bw,evex512")
> -#define __DISABLE_AVX512BW_512__
> -#endif /* __AVX512BW_512__ */
> -
> -/* Internal data types for implementing the intrinsics.  */
> -typedef short __v32hi __attribute__ ((__vector_size__ (64)));
> -typedef short __v32hi_u __attribute__ ((__vector_size__ (64),  \
> -   __may_alias__, __aligned__ (1)));
> -typedef char __v64qi __attribute__ ((__vector_size__ (64)));
> -typedef char __v64qi_u __attribute__ ((__vector_size__ (64),   \
> -  __may_alias__, __aligned__ (1)));
> -
> -typedef unsigned long long __mmask64;
> -
>  extern __inline unsigned char
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _ktest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
> @@ -365,6 +346,25 @@ _kandn_mask64 (__mmask64 __A, __mmask64 __B)
>return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) 
> __B);
>  }
>
> +#ifdef __DISABLE_AVX512BW__
> +#undef __DISABLE_AVX512BW__
> +#pragma GCC pop_options
> +#endif /* __DISABLE_AVX512BW__ */
> +
> +#if !defined (__AVX512BW__) || !defined (__EVEX512__)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw,evex512")
> +#define __DISABLE_AVX512BW_512__
> +#endif /* __AVX512BW_512__ */
> +
> +/* Internal data types for implementing the intrinsics.  */
> +typedef short __v32hi __attribute__ ((__vector_size__ (64)));
> +typedef short __v32hi_u __attribute__ ((__vector_size__ (64),  \
> +   __may_alias__, __aligned__ (1)));
> +typedef char __v64qi __attribute__ ((__vector_size__ (64)));
> +typedef char __v64qi_u 

Re: [PATCH] i386: Sync move_max/store_max with prefer-vector-width [PR112824]

2023-12-14 Thread Hongtao Liu
On Thu, Dec 14, 2023 at 3:54 PM Hongyu Wang  wrote:
>
> Hi,
>
> Currently move_max follows the tuning feature first, but ideally it
> should sync with prefer-vector-width when it is explicitly set to keep
> vector move and operation with same vector size.
>
> Bootstrapped/regtested on x86-64-pc-linux-gnu{-m32,}
>
> OK for trunk?
>
> gcc/ChangeLog:
>
> PR target/112824
> * config/i386/i386-options.cc (ix86_option_override_internal):
> Sync ix86_move_max/ix86_store_max with prefer_vector_width when
> it is explicitly set.
>
> gcc/testsuite/ChangeLog:
>
> PR target/112824
> * gcc.target/i386/pieces-memset-45.c: Remove
> -mprefer-vector-width=256.
> * g++.target/i386/pr112824-1.C: New test.
> ---
>  gcc/config/i386/i386-options.cc   |   8 +-
>  gcc/testsuite/g++.target/i386/pr112824-1.C| 113 ++
>  .../gcc.target/i386/pieces-memset-45.c|   2 +-
>  3 files changed, 120 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr112824-1.C
>
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index 588a0878c0d..440ef59 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -3012,7 +3012,9 @@ ix86_option_override_internal (bool main_args_p,
>  {
>/* Set the maximum number of bits can be moved from memory to
>  memory efficiently.  */
> -  if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
> +  if (opts_set->x_prefer_vector_width_type != PVW_NONE)
> +   opts->x_ix86_move_max = opts->x_prefer_vector_width_type;
> +  else if (ix86_tune_features[X86_TUNE_AVX512_MOVE_BY_PIECES])
> opts->x_ix86_move_max = PVW_AVX512;
>else if (ix86_tune_features[X86_TUNE_AVX256_MOVE_BY_PIECES])
> opts->x_ix86_move_max = PVW_AVX256;
> @@ -3034,7 +3036,9 @@ ix86_option_override_internal (bool main_args_p,
>  {
>/* Set the maximum number of bits can be stored to memory
>  efficiently.  */
> -  if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
> +  if (opts_set->x_prefer_vector_width_type != PVW_NONE)
> +   opts->x_ix86_store_max = opts->x_prefer_vector_width_type;
> +  else if (ix86_tune_features[X86_TUNE_AVX512_STORE_BY_PIECES])
> opts->x_ix86_store_max = PVW_AVX512;
>else if (ix86_tune_features[X86_TUNE_AVX256_STORE_BY_PIECES])
> opts->x_ix86_store_max = PVW_AVX256;
> diff --git a/gcc/testsuite/g++.target/i386/pr112824-1.C 
> b/gcc/testsuite/g++.target/i386/pr112824-1.C
> new file mode 100644
> index 000..fccaf23c530
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr112824-1.C
> @@ -0,0 +1,113 @@
> +/* PR target/112824 */
> +/* { dg-do compile } */
> +/* { dg-options "-std=c++23 -O3 -march=skylake-avx512 
> -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler-not "vmov(?:dqu|apd)\[ \\t\]+\[^\n\]*%ymm" } 
> } */
> +
> +
Please remove the empty line.
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +template 
> +using Vec [[gnu::vector_size(W * sizeof(T))]] = T;
> +
> +// Omitted: 16 without AVX, 32 without AVX512F,
> +// or for forward compatibility some AVX10 may also mean 32-only
> +static constexpr ptrdiff_t VectorBytes = 64;
> +template
> +static constexpr ptrdiff_t VecWidth = 64 <= sizeof(T) ? 1 : 64/sizeof(T);
> +
> +template  struct Vector{
> +static constexpr ptrdiff_t L = N;
> +T data[L];
> +static constexpr auto size()->ptrdiff_t{return N;}
> +};
> +template  struct Vector{
> +static constexpr ptrdiff_t W = N >= VecWidth ? VecWidth : 
> ptrdiff_t(std::bit_ceil(size_t(N)));
> +static constexpr ptrdiff_t L = (N/W) + ((N%W)!=0);
> +using V = Vec;
> +V data[L];
> +static constexpr auto size()->ptrdiff_t{return N;}
> +};
> +/// should be trivially copyable
> +/// codegen is worse when passing by value, even though it seems like it 
> should make
> +/// aliasing simpler to analyze?
> +template
> +[[gnu::always_inline]] constexpr auto operator+(Vector x, Vector 
> y) -> Vector {
> +Vector z;
> +for (ptrdiff_t n = 0; n < Vector::L; ++n) z.data[n] = x.data[n] + 
> y.data[n];
> +return z;
> +}
> +template
> +[[gnu::always_inline]] constexpr auto operator*(Vector x, Vector 
> y) -> Vector {
> +Vector z;
> +for (ptrdiff_t n = 0; n < Vector::L; ++n) z.data[n] = x.data[n] * 
> y.data[n];
> +return z;
> +}
> +template
> +[[gnu::always_inline]] constexpr auto operator+(T x, Vector y) -> 
> Vector {
> +Vector z;
> +for (ptrdiff_t n = 0; n < Vector::L; ++n) z.data[n] = x + y.data[n];
> +return z;
> +}
> +template
> +[[gnu::always_inline]] constexpr auto operator*(T x, Vector y) -> 
> Vector {
> +Vector z;
> +for (ptrdiff_t n = 0; n < Vector::L; ++n) z.data[n] = x * y.data[n];
> +return z;
> +}
> +
> +
> +
Ditto.
> +template  struct Dual {
> +  T value;
> +  Vector partials;
> +};
> +// Here we have a specialization 

Re: [PATCH] i386: Remove RAO-INT from Grand Ridge

2023-12-14 Thread Hongtao Liu
On Thu, Dec 14, 2023 at 10:55 AM Haochen Jiang  wrote:
>
> Hi all,
>
> According to ISE050 published at the end of September, RAO-INT will not
> be in Grand Ridge anymore. This patch aims to remove it.
>
> The documentation comes following:
>
> https://cdrdv2.intel.com/v1/dl/getContent/671368
>
> Regtested on x86_64-pc-linux-gnu. Ok for trunk and backport to GCC13?
Ok.
>
> Thx,
> Haochen
>
> gcc/ChangeLog:
>
> * config/i386/driver-i386.cc (host_detect_local_cpu): Do not
> set Grand Ridge depending on RAO-INT.
> * config/i386/i386.h: Remove PTA_RAOINT from PTA_GRANDRIDGE.
> * doc/invoke.texi: Adjust documentation.
> ---
>  gcc/config/i386/driver-i386.cc | 3 ---
>  gcc/config/i386/i386.h | 2 +-
>  gcc/doc/invoke.texi| 4 ++--
>  3 files changed, 3 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
> index 0cfb2884d65..3342e550f2a 100644
> --- a/gcc/config/i386/driver-i386.cc
> +++ b/gcc/config/i386/driver-i386.cc
> @@ -665,9 +665,6 @@ const char *host_detect_local_cpu (int argc, const char 
> **argv)
>   /* Assume Arrow Lake S.  */
>   else if (has_feature (FEATURE_SM3))
> cpu = "arrowlake-s";
> - /* Assume Grand Ridge.  */
> - else if (has_feature (FEATURE_RAOINT))
> -   cpu = "grandridge";
>   /* Assume Sierra Forest.  */
>   else if (has_feature (FEATURE_AVXVNNIINT8))
> cpu = "sierraforest";
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 47340c6a4ad..303baf8c921 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2416,7 +2416,7 @@ constexpr wide_int_bitmask PTA_GRANITERAPIDS = 
> PTA_SAPPHIRERAPIDS | PTA_AMX_FP16
>| PTA_PREFETCHI;
>  constexpr wide_int_bitmask PTA_GRANITERAPIDS_D = PTA_GRANITERAPIDS
>| PTA_AMX_COMPLEX;
> -constexpr wide_int_bitmask PTA_GRANDRIDGE = PTA_SIERRAFOREST | PTA_RAOINT;
> +constexpr wide_int_bitmask PTA_GRANDRIDGE = PTA_SIERRAFOREST;
>  constexpr wide_int_bitmask PTA_ARROWLAKE = PTA_ALDERLAKE | PTA_AVXIFMA
>| PTA_AVXVNNIINT8 | PTA_AVXNECONVERT | PTA_CMPCCXADD | PTA_UINTR;
>  constexpr wide_int_bitmask PTA_ARROWLAKE_S = PTA_ARROWLAKE | PTA_AVXVNNIINT16
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 1f26f80d26c..82dd9cdf907 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -33451,8 +33451,8 @@ SSSE3, SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, 
> PCLMUL, RDRND, XSAVE, XSAVEC,
>  XSAVES, XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI,
>  MOVDIR64B, CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT,
>  PCONFIG, PKU, VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL, AVX-VNNI,
> -AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, ENQCMD, UINTR and RAOINT
> -instruction set support.
> +AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, ENQCMD and UINTR instruction 
> set
> +support.
>
>  @item clearwaterforest
>  Intel Clearwater Forest CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2,
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] [ICE] Support vpcmov for V4HF/V4BF/V2HF/V2BF under TARGET_XOP.

2023-12-13 Thread Hongtao Liu
On Wed, Dec 13, 2023 at 7:59 PM Jakub Jelinek  wrote:
>
> On Fri, Dec 08, 2023 at 03:12:00PM +0800, liuhongt wrote:
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ready push to trunk.
> >
> > gcc/ChangeLog:
> >
> >   PR target/112904
> >   * config/i386/mmx.md (*xop_pcmov_): New define_insn.
> >
> > gcc/testsuite/ChangeLog:
> >
> >   * g++.target/i386/pr112904.C: New test.
>
> The new test FAILs on i686-linux and even on x86_64-linux I think
> it doesn't actually test what was reported, unless one performs testing
> with -march= for some XOP enabled CPU or -mxop.
>
> The following patch fixes that, tested on x86_64-linux with
> make check-g++ 
> RUNTESTFLAGS='--target_board=unix\{-m32,-m32/-mno-sse/-mno-mmx,-m64\} 
> i386.exp=pr112904.C'
> Ok for trunk?

Ok.
Sorry for the inconvenience, I must have missed something in my tester.

>
> 2023-12-13  Jakub Jelinek  
>
> * g++.target/i386/pr112904.C: Add dg-do compile, dg-options -mxop
> and for ia32 also dg-additional-options -mmmx.
>
> --- gcc/testsuite/g++.target/i386/pr112904.C.jj 2023-12-11 08:31:59.001938798 
> +0100
> +++ gcc/testsuite/g++.target/i386/pr112904.C2023-12-13 12:54:50.318521637 
> +0100
> @@ -1,3 +1,8 @@
> +// PR target/112904
> +// { dg-do compile }
> +// { dg-options "-mxop" }
> +// { dg-additional-options "-mmmx" { target ia32 } }
> +
>  typedef _Float16 v4hf __attribute__((vector_size(8)));
>  typedef short v4hi __attribute__((vector_size(8)));
>  typedef _Float16 v2hf __attribute__((vector_size(4)));
>
>
> Jakub
>


-- 
BR,
Hongtao


Re: [PATCH] i386: Fix ICE on __builtin_ia32_pabsd128 without lhs [PR112962]

2023-12-13 Thread Hongtao Liu
On Wed, Dec 13, 2023 at 4:44 PM Jakub Jelinek  wrote:
>
> Hi!
>
> The following patch fixes ICE on the testcase in similar way to how
> other folded builtins are handled in ix86_gimple_fold_builtin when
> they don't have a lhs; these builtins are const or pure, so normally
> DCE would remove them later, but with -O0 that isn't guaranteed to
> happen, and during expansion if they are marked TREE_SIDE_EFFECTS
> it might still be attempted to be expanded.
> This removes them right away during the folding.
>
> Initially I wanted to also change all gsi_replace last args in that function
> to true, but Andrew pointed to PR107209, so I've kept them as is.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Ok.
>
> 2023-12-13  Jakub Jelinek  
>
> PR target/112962
> * config/i386/i386.cc (ix86_gimple_fold_builtin): For shifts
> and abs without lhs replace with nop.
>
> * gcc.target/i386/pr112962.c: New test.
>
> --- gcc/config/i386/i386.cc.jj  2023-12-12 13:06:05.864509295 +0100
> +++ gcc/config/i386/i386.cc 2023-12-13 00:02:28.543600557 +0100
> @@ -19377,7 +19377,10 @@ ix86_gimple_fold_builtin (gimple_stmt_it
>  do_shift:
>gcc_assert (n_args >= 2);
>if (!gimple_call_lhs (stmt))
> -   break;
> +   {
> + gsi_replace (gsi, gimple_build_nop (), false);
> + return true;
> +   }
>arg0 = gimple_call_arg (stmt, 0);
>arg1 = gimple_call_arg (stmt, 1);
>elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> @@ -19523,7 +19526,10 @@ ix86_gimple_fold_builtin (gimple_stmt_it
>  case IX86_BUILTIN_PABSD256_MASK:
>gcc_assert (n_args >= 1);
>if (!gimple_call_lhs (stmt))
> -   break;
> +   {
> + gsi_replace (gsi, gimple_build_nop (), false);
> + return true;
> +   }
>arg0 = gimple_call_arg (stmt, 0);
>elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
>/* For masked ABS, only optimize if the mask is all ones.  */
> --- gcc/testsuite/gcc.target/i386/pr112962.c.jj 2023-12-12 11:56:56.735917531 
> +0100
> +++ gcc/testsuite/gcc.target/i386/pr112962.c2023-12-12 11:56:39.406157222 
> +0100
> @@ -0,0 +1,11 @@
> +/* PR target/112962 */
> +/* { dg-do compile } */
> +/* { dg-options "-fexceptions -mssse3" } */
> +
> +typedef int __attribute__((__vector_size__ (16))) V;
> +
> +void
> +foo (void)
> +{
> +  __builtin_ia32_pabsd128 ((V) {});
> +}
>
> Jakub
>


-- 
BR,
Hongtao


Re: Disable FMADD in chains for Zen4 and generic

2023-12-12 Thread Hongtao Liu
On Tue, Dec 12, 2023 at 10:38 PM Jan Hubicka  wrote:
>
> Hi,
> this patch disables use of FMA in matrix multiplication loop for generic (for
> x86-64-v3) and zen4.  I tested this on zen4 and Xenon Gold Gold 6212U.
>
> For Intel this is neutral both on the matrix multiplication microbenchmark
> (attached) and spec2k17 where the difference was within noise for Core.
>
> On core the micro-benchmark runs as follows:
>
> With FMA:
>
>578,500,241  cycles:u #3.645 GHz   
>   ( +-  0.12% )
>753,318,477  instructions:u   #1.30  insn per 
> cycle  ( +-  0.00% )
>125,417,701  branches:u   #  790.227 M/sec 
>   ( +-  0.00% )
>   0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )
>
>
> No FMA:
>
>577,573,960  cycles:u #3.514 GHz   
>   ( +-  0.15% )
>878,318,479  instructions:u   #1.52  insn per 
> cycle  ( +-  0.00% )
>125,417,702  branches:u   #  763.035 M/sec 
>   ( +-  0.00% )
>   0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )
>
> So the cycle count is unchanged and discrete multiply+add takes same time as 
> FMA.
>
> While on zen:
>
>
> With FMA:
>  484875179  cycles:u #3.599 GHz   
>( +-  0.05% )  (82.11%)
>  752031517  instructions:u   #1.55  insn per 
> cycle
>  125106525  branches:u   #  928.712 M/sec 
>( +-  0.03% )  (85.09%)
> 128356  branch-misses:u  #0.10% of all 
> branches  ( +-  0.06% )  (83.58%)
>
> No FMA:
>  375875209  cycles:u #3.592 GHz   
>( +-  0.08% )  (80.74%)
>  875725341  instructions:u   #2.33  insn per 
> cycle
>  124903825  branches:u   #1.194 G/sec 
>( +-  0.04% )  (84.59%)
>   0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )
>
> The diffrerence is that Cores understand the fact that fmadd does not need
> all three parameters to start computation, while Zen cores doesn't.
>
> Since this seems noticeable win on zen and not loss on Core it seems like good
> default for generic.
>
> I plan to commit the patch next week if there are no compplains.
The generic part LGTM. (It's exactly what we proposed in [1].)

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637721.html
>
> Honza
>
> #include <stdio.h>
> #include <time.h>
>
> #define SIZE 1000
>
> float a[SIZE][SIZE];
> float b[SIZE][SIZE];
> float c[SIZE][SIZE];
>
> void init(void)
> {
>int i, j, k;
>    for(i=0; i<SIZE; i++)
>    {
>       for(j=0; j<SIZE; j++)
>       {
>  a[i][j] = (float)i + j;
>  b[i][j] = (float)i - j;
>  c[i][j] = 0.0f;
>   }
>}
> }
>
> void mult(void)
> {
>int i, j, k;
>
>    for(i=0; i<SIZE; i++)
>    {
>       for(j=0; j<SIZE; j++)
>       {
>          for(k=0; k<SIZE; k++)
>          {
> c[i][j] += a[i][k] * b[k][j];
>  }
>   }
>}
> }
>
> int main(void)
> {
>clock_t s, e;
>
>init();
>s=clock();
>mult();
>e=clock();
>printf("mult took %10d clocks\n", (int)(e-s));
>
>return 0;
>
> }
>
> * confg/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS, 
> X86_TUNE_AVOID_256FMA_CHAINS)
> Enable for znver4 and Core.
>
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 43fa9e8fd6d..74b03cbcc60 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -515,13 +515,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
> "use_scatter_8parts",
>
>  /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
> smaller FMA chain.  */
> -DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
> m_ZNVER2 | m_ZNVER3
> -  | m_YONGFENG)
> +DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
> m_ZNVER2 | m_ZNVER3 | m_ZNVER4
> +  | m_YONGFENG | m_GENERIC)
>
>  /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
> smaller FMA chain.  */
> -DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
> m_ZNVER3
> - | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM)
> +DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
> m_ZNVER3 | m_ZNVER4
> + | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
>
>  /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
> smaller FMA chain.  */



-- 
BR,
Hongtao


Re: [r14-6420 Regression] FAIL: gcc.target/i386/pr110790-2.c scan-assembler-times shrq 2 on Linux/x86_64

2023-12-11 Thread Hongtao Liu
On Tue, Dec 12, 2023 at 1:47 PM Jiang, Haochen via Gcc-regression
 wrote:
>
> > -Original Message-
> > From: Jiang, Haochen
> > Sent: Tuesday, December 12, 2023 9:11 AM
> > To: Andrew Pinski (QUIC) ; haochen.jiang
> > ; gcc-regress...@gcc.gnu.org; gcc-
> > patc...@gcc.gnu.org
> > Subject: RE: [r14-6420 Regression] FAIL: gcc.target/i386/pr110790-2.c scan-
> > assembler-times shrq 2 on Linux/x86_64
> >
> > > -Original Message-
> > > From: Andrew Pinski (QUIC) 
> > > Sent: Tuesday, December 12, 2023 9:01 AM
> > > To: haochen.jiang ; Andrew Pinski (QUIC)
> > > ; gcc-regress...@gcc.gnu.org; gcc-
> > > patc...@gcc.gnu.org; Jiang, Haochen 
> > > Subject: RE: [r14-6420 Regression] FAIL: gcc.target/i386/pr110790-2.c
> > scan-
> > > assembler-times shrq 2 on Linux/x86_64
> > >
> > > > -Original Message-
> > > > From: haochen.jiang 
> > > > Sent: Monday, December 11, 2023 4:54 PM
> > > > To: Andrew Pinski (QUIC) ; gcc-
> > > > regress...@gcc.gnu.org; gcc-patches@gcc.gnu.org;
> > haochen.ji...@intel.com
> > > > Subject: [r14-6420 Regression] FAIL: gcc.target/i386/pr110790-2.c scan-
> > > > assembler-times shrq 2 on Linux/x86_64
> > > >
> > > > On Linux/x86_64,
> > > >
> > > > 85c5efcffed19ca6160eeecc2d4faebd9fee63aa is the first bad commit
> > commit
> > > > 85c5efcffed19ca6160eeecc2d4faebd9fee63aa
> > > > Author: Andrew Pinski 
> > > > Date:   Sat Nov 11 15:54:10 2023 -0800
> > > >
> > > > MATCH: (convert)(zero_one !=/== 0/1) for outer type and zero_one 
> > > > type
> > are
> > > > the same
> > > >
> > > > caused
> > > >
> > > > FAIL: gcc.target/i386/pr110790-2.c scan-assembler-times shrq 2
> > >
> > >
> > > So I think this is a testsuite issue, in that shrx instruction is being 
> > > used here
> > > instead of just ` shrq` due to that instruction being enabled with `-
> > > march=cascadelake` .
> > > Can someone confirm that and submit a testcase change?
> >
> > I will do that today.
>
> I suppose we might just need to change the scan-asm from shrq to shr to cover
> shrx.
Please use shr\[qx\], not shr.
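
For instance, the directive in pr110790-2.c could be relaxed along these
lines (a sketch only; the expected count of 2 is the one from the original
test):

/* { dg-final { scan-assembler-times "shr\[qx\]" 2 } } */
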
>
> Is that ok? If it is, I will commit a patch to change that.
>
> Thx,
> Haochen
>
> >
> > Thx,
> > Haochen
> >
> > >
> > > Thanks,
> > > Andrew
> > >
> > > >
> > > > with GCC configured with
> > > >
> > > > ../../gcc/configure --prefix=/export/users/haochenj/src/gcc-
> > > > bisect/master/master/r14-6420/usr --enable-clocale=gnu --with-system-
> > zlib -
> > > > -with-demangler-in-ld --with-fpmath=sse --enable-
> > languages=c,c++,fortran --
> > > > enable-cet --without-isl --enable-libmpx x86_64-linux 
> > > > --disable-bootstrap
> > > >
> > > > To reproduce:
> > > >
> > > > $ cd {build_dir}/gcc && make check
> > > > RUNTESTFLAGS="i386.exp=gcc.target/i386/pr110790-2.c --
> > > > target_board='unix{-m64\ -march=cascadelake}'"
> > > >
> > > > (Please do not reply to this email, for question about this report, 
> > > > contact
> > me at
> > > > haochen dot jiang at intel.com.) (If you met problems with cascadelake
> > > > related, disabling AVX512F in command line might save that.) (However,
> > > > please make sure that there is no potential problems with AVX512.)



-- 
BR,
Hongtao


Re: [PATCH] Don't assume it's AVX_U128_CLEAN after call_insn whose abi.mode_clobber(V4DImode) doesn't contain all SSE_REGS.

2023-12-11 Thread Hongtao Liu
On Fri, Dec 8, 2023 at 10:17 AM liuhongt  wrote:
>
> If the function doesn't clobber any SSE registers, or only clobbers the
> 128-bit parts, then vzeroupper isn't issued before the function exit,
> so the status is not CLEAN but ANY after the function.
>
> Also, for sibling_call it's safe to issue a vzeroupper, and there
> could be a missing vzeroupper since there's no mode_exit for
> sibling_call_p.
>
> Compared to the patch in the PR, this patch add sibling_call part.
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk and backport?
Part of this has been approved in the PR, and for the sibling_call
part, I think it should be reasonable.
So I'm going to commit the patch.
>
> gcc/ChangeLog:
>
> PR target/112891
> * config/i386/i386.cc (ix86_avx_u128_mode_after): Return
> AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
> align with ix86_avx_u128_mode_needed.
> (ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
> sibling_call.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr112891.c: New test.
> * gcc.target/i386/pr112891-2.c: New test.
> ---
>  gcc/config/i386/i386.cc| 22 +---
>  gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++
>  gcc/testsuite/gcc.target/i386/pr112891.c   | 29 +
>  3 files changed, 78 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 7c5cab4e2c6..fe259cdb789 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -15038,8 +15038,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
>  vzeroupper if all SSE registers are clobbered.  */
>      const function_abi &abi = insn_callee_abi (insn);
>if (vzeroupper_pattern (PATTERN (insn), VOIDmode)
> - || !hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
> -abi.mode_clobbers (V4DImode)))
> + /* Should be safe to issue an vzeroupper before sibling_call_p.
> +Also there not mode_exit for sibling_call, so there could be
> +missing vzeroupper for that.  */
> + || !(SIBLING_CALL_P (insn)
> +  || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
> +abi.mode_clobbers (V4DImode
> return AVX_U128_ANY;
>
>return AVX_U128_CLEAN;
> @@ -15177,7 +15181,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
>bool avx_upper_reg_found = false;
>      note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
>
> -  return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
> +  if (avx_upper_reg_found)
> +   return AVX_U128_DIRTY;
> +
> +  /* If the function desn't clobber any sse registers or only clobber
> +128-bit part, Then vzeroupper isn't issued before the function exit.
> +the status not CLEAN but ANY after the function.  */
> +  const function_abi &abi = insn_callee_abi (insn);
> +  if (!(SIBLING_CALL_P (insn)
> +   || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
> + abi.mode_clobbers (V4DImode
> +   return AVX_U128_ANY;
> +
> +  return  AVX_U128_CLEAN;
>  }
>
>/* Otherwise, return current mode.  Remember that if insn
> diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c 
> b/gcc/testsuite/gcc.target/i386/pr112891-2.c
> new file mode 100644
> index 000..164c3985d50
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -O3" } */
> +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> +
> +void
> +__attribute__((noinline))
> +bar (double* a)
> +{
> +  a[0] = 1.0;
> +  a[1] = 2.0;
> +}
> +
> +double
> +__attribute__((noinline))
> +foo (double* __restrict a, double* b)
> +{
> +  a[0] += b[0];
> +  a[1] += b[1];
> +  a[2] += b[2];
> +  a[3] += b[3];
> +  bar (b);
> +  return a[5] + b[5];
> +}
> +
> +double
> +foo1 (double* __restrict a, double* b)
> +{
> +  double c = foo (a, b);
> +  return __builtin_exp (c);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c 
> b/gcc/testsuite/gcc.target/i386/pr112891.c
> new file mode 100644
> index 000..dbf6c67948a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr112891.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -O3" } */
> +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
> +
> +void
> +__attribute__((noinline))
> +bar (double* a)
> +{
> +  a[0] = 1.0;
> +  a[1] = 2.0;
> +}
> +
> +void
> +__attribute__((noinline))
> +foo (double* __restrict a, double* b)
> +{
> +  a[0] += b[0];
> +  a[1] += b[1];
> +  a[2] += b[2];
> +  a[3] += b[3];
> +  bar (b);
> +}
> +
> +double
> +foo1 (double* 

Re: [PATCH] i386: Fix missed APX_NDD check for shift/rotate expanders [PR 112943]

2023-12-11 Thread Hongtao Liu
On Mon, Dec 11, 2023 at 8:39 PM Hongyu Wang  wrote:
>
> > > +__int128 u128_2 = (9223372036854775808 << 4) * foo0_u8_0; /* { 
> > > dg-warning "integer constant is so large that it is unsigned" "so large" 
> > > } */
> >
> > Just you can use (9223372036854775807LL + (__int128) 1) instead of 
> > 9223372036854775808
> > to avoid the warning.
> > The testcase will ICE without the patch even with that.
>
> Thanks for the hint! Will adjust when pushing the patch.
Ok.



-- 
BR,
Hongtao


Re: [v3 PATCH] Simplify vector ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d to just (VCE ((a cmp b) ? (VCE c) : (VCE d))).

2023-12-11 Thread Hongtao Liu
On Mon, Dec 11, 2023 at 4:14 PM Richard Biener
 wrote:
>
> On Mon, Dec 11, 2023 at 7:51 AM liuhongt  wrote:
> >
> > > since you are looking at TYPE_PRECISION below you want
> > > VECTOR_INTIEGER_TYPE_P here as well?  The alternative
> > > would be to compare TYPE_SIZE.
> > >
> > > Some of the checks feel redundant but are probably good for
> > > documentation purposes.
> > >
> > > OK with using VECTOR_INTIEGER_TYPE_P
> > Actually, the data type doesn't need to be an integer, i.e. x86 supports vblendvps,
> > so I'm using TYPE_SIZE here; the code is adjusted to
> >
> > && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (type)))
> > && (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type)))
> ><= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6
> >
> > Here's the updated patch.
> > Ok for trunk?
> >
> > While working on PR112443, I noticed some misoptimizations:
> > after we fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend
> > fails to combine it back to v{,p}blendv{b,ps,pd} since the pattern is
> > too complicated, so I think maybe we should handle it at the gimple
> > level.
> >
> > The dump is like
> >
> >   _1 = c_3(D) >= { 0, 0, 0, 0 };
> >   _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
> >   _7 = VIEW_CONVERT_EXPR(_2);
> >   _8 = VIEW_CONVERT_EXPR(b_6(D));
> >   _9 = VIEW_CONVERT_EXPR(a_5(D));
> >   _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
> >   _11 = VEC_COND_EXPR <_10, _8, _9>;
> >
> > It can be optimized to
> >
> >   _1 = c_2(D) >= { 0, 0, 0, 0 };
> >   _6 = VEC_COND_EXPR <_1, b_5(D), a_4(D)>;
> >
> > since _7 is either -1 or 0, the selection of _7 < 0 ? _8 : _9 should
> > be equal to _1 ? b : a as long as TYPE_PRECISION of the component type
> > of the second VEC_COND_EXPR is less than or equal to that of the first one.
> > The patch add a gimple pattern to handle that.
> >
> > gcc/ChangeLog:
> >
> > * match.pd (VCE (a cmp b ? -1 : 0) < 0) ? c : d ---> (VCE ((a
> > cmp b) ? (VCE:c) : (VCE:d))): New gimple simplication.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/avx512vl-blendv-3.c: New test.
> > * gcc.target/i386/blendv-3.c: New test.
> > ---
> >  gcc/match.pd  | 23 ++
> >  .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
> >  gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
> >  3 files changed, 75 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 4d554ba4721..359c7b07dc3 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -5190,6 +5190,29 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
> >(vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
> >
> > +/*  ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d is just
> > +(VCE ((a cmp b) ? (VCE c) : (VCE d))) when TYPE_PRECISION of the
> > +component type of the outer vec_cond is greater equal the inner one.  
> > */
> > +(for cmp (simple_comparison)
> > + (simplify
> > +  (vec_cond
> > +(lt (view_convert@5 (vec_cond@6 (cmp@4 @0 @1)
> > +   integer_all_onesp
> > +   integer_zerop))
> > + integer_zerop) @2 @3)
> > +  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
> > +   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
> > +   && !TYPE_UNSIGNED (TREE_TYPE (@5))
> > +   && VECTOR_TYPE_P (TREE_TYPE (@6))
> > +   && VECTOR_TYPE_P (type)
> > +   && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (type)))
> > +   && (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (type)))
> > + <= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6
>
> sorry for nitpicking, but can you please use
>
> && tree_int_cst_le (TYPE_SIZE (TREE_TYPE (type)),
>  TREE_TYPE (TREE_TYPE (@6)))
>
> thus not use precision on one and size on the other type?
>
> OK with that change.
Thanks, committed.
>
> Richard.
>
> > +   && TYPE_SIZE (type) == TYPE_SIZE (TREE_TYPE (@6)))
> > +   (with { tree vtype = TREE_TYPE (@6);}
> > + (view_convert:type
> > +   (vec_cond @4 (view_convert:vtype @2) (view_convert:vtype @3)))
> > +
> >  /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
> >  (simplify
> >   (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c 
> > b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> > new file mode 100644
> > index 000..2777e72ab5f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512vl -mavx512bw -O2" } */
> > +/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
> > +/* { dg-final { scan-assembler-not {vpcmp} } } */
> > +
> > +#include "blendv-3.c"

Re: [PATCH] i386: Mark Xeon Phi ISAs as deprecated

2023-12-07 Thread Hongtao Liu
On Wed, Dec 6, 2023 at 3:52 PM Richard Biener
 wrote:
>
> On Wed, Dec 6, 2023 at 3:33 AM Jiang, Haochen  wrote:
> >
> > > -Original Message-
> > > From: Jiang, Haochen
> > > Sent: Friday, December 1, 2023 4:51 PM
> > > To: Richard Biener 
> > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > > ubiz...@gmail.com
> > > Subject: RE: [PATCH] i386: Mark Xeon Phi ISAs as deprecated
> > >
> > > > -Original Message-
> > > > From: Richard Biener 
> > > > Sent: Friday, December 1, 2023 4:37 PM
> > > > To: Jiang, Haochen 
> > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > > > ubiz...@gmail.com
> > > > Subject: Re: [PATCH] i386: Mark Xeon Phi ISAs as deprecated
> > > >
> > > > On Fri, Dec 1, 2023 at 8:34 AM Jiang, Haochen 
> > > > wrote:
> > > > >
> > > > > > -Original Message-
> > > > > > From: Richard Biener 
> > > > > > Sent: Friday, December 1, 2023 3:04 PM
> > > > > > To: Jiang, Haochen 
> > > > > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > > > > > ubiz...@gmail.com
> > > > > > Subject: Re: [PATCH] i386: Mark Xeon Phi ISAs as deprecated
> > > > > >
> > > > > > On Fri, Dec 1, 2023 at 3:22 AM Haochen Jiang
> > > 
> > > > > > wrote:
> > > > > > >
> > > > > > > Since Knight Landing and Knight Mill microarchitectures are EOL, 
> > > > > > > we
> > > > > > > would like to remove its support in GCC 15. In GCC 14, we will 
> > > > > > > first
> > > > > > > emit a warning for the usage.
> > > > > >
> > > > > > I think it's better to keep supporting -mtune/arch=knl without 
> > > > > > diagnostics
> > > > >
> > > > > I see, it could be a choice and might be better. But if we take this, 
> > > > > how
> > > should
> > > > > we define -mtune=knl remains a question.
> > > >
> > > > I'd say mapping it to a "close" micro-architecture makes most sense, but
> > > > we could also simply keep the tuning entry for knl?
> > >
> > > Actually I have written a removal test patch, one of the issue might be 
> > > there is
> > > something specific about knl in tuning for VZEROUPPER, which is also 
> > > reflected
> > > in
> > > PR82990.
> > >
> > > /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction
> > > insertion
> > >before a transfer of control flow out of the function.  */
> > > DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
> > >
> > > If we chose to keep them, this behavior will be changed.
> >
> > Hi Richard,
> >
> > After double thinking, I suppose we still should remove the arch/tune 
> > options
> > here to avoid misleading behavior since there will always something be 
> > changed.
> >
> > What is your concern about removing? Do you have anything that relies on the
> > tune and arch?
>
> We usually promise backwards compatibility with respect to accepted options
> which is why we have things like
>
> ftree-vect-loop-version
> Common Ignore
> Does nothing. Preserved for backward compatibility.
>
> the backend errors on unknown march/tune and that would be a regression
> for build systems using that (even if that's indeed very unlikely).  That's 
> why
> I suggested to make it still do something (doing "nothing", aka keeping 
> generic
> is probably worse than dropping).  I guess having -march=knl behave 
> differently
> is also bad so I guess there's not a good solution for that.
To avoid confusion, I prefer to remove all of them.
>
> So - just to have made the above point, I'm fine with what x86 maintainers
> decide here.
>
> Richard.
>
> > Thx,
> > Haochen
> >
> > >
> > > >
> > > > > > but simply not enable the ISAs we don't support.  The better 
> > > > > > question is
> > > > > > what to do about KNL specific intrinsics headers / intrinsics?  
> > > > > > Will we
> > > > > > simply remove those?
> > > > >
> > > > > If there is no objection, The intrinsics are planned to be removed in 
> > > > > GCC 15.
> > > > > As far as concerned, almost nobody are using them with the latest GCC.
> > > And
> > > > > there is no complaint when removing them in ICC/ICX.
> > > >
> > > > I see.  Replacing the header contents with #error "XYZ is no longer
> > > supported"
> > > > might be nicer.  OTOH x86intrin.h should simply no longer include them.
> > >
> > > That is nicer. I will take that in GCC 15 patch.
> > >
> > > Thx,
> > > Haochen
> > >
> > > >
> > > > Richard.
> > > >
> > > > > Thx,
> > > > > Haochen
> > > > >
> > > > > >
> > > > > > Richard.
> > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > > * config/i386/driver-i386.cc (host_detect_local_cpu):
> > > > > > > Do not append "-mno-" for Xeon Phi ISAs.
> > > > > > > * config/i386/i386-options.cc 
> > > > > > > (ix86_option_override_internal):
> > > > > > > Emit a warning for KNL/KNM targets.
> > > > > > > * config/i386/i386.opt: Emit a warning for Xeon Phi ISAs.
> > > > > > >
> > > > > > > gcc/testsuite/ChangeLog:
> > > > > > >
> > > > > > > * g++.dg/other/i386-2.C: Adjust testcases.
> > > > > > > * g++.dg/other/i386-3.C: Ditto.
> > > > > > >   

Re: [V2 PATCH] Simplify vector ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d to just (VCE ((a cmp b) ? (VCE c) : (VCE d))).

2023-12-07 Thread Hongtao Liu
ping.

On Thu, Nov 16, 2023 at 6:49 PM liuhongt  wrote:
>
> Update in V2:
> 1) Add some comments before the pattern.
> 2) Remove ? from view_convert.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> While working on PR112443, I noticed some misoptimizations:
> after we fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend
> fails to combine it back to v{,p}blendv{b,ps,pd} since the pattern is
> too complicated, so I think maybe we should handle it at the gimple
> level.
>
> The dump is like
>
>   _1 = c_3(D) >= { 0, 0, 0, 0 };
>   _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
>   _7 = VIEW_CONVERT_EXPR(_2);
>   _8 = VIEW_CONVERT_EXPR(b_6(D));
>   _9 = VIEW_CONVERT_EXPR(a_5(D));
>   _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
>   _11 = VEC_COND_EXPR <_10, _8, _9>;
>
> It can be optimized to
>
>   _1 = c_2(D) >= { 0, 0, 0, 0 };
>   _6 = VEC_COND_EXPR <_1, b_5(D), a_4(D)>;
>
> since _7 is either -1 or 0, the selection of _7 < 0 ? _8 : _9 should
> be equal to _1 ? b : a as long as TYPE_PRECISION of the component type
> of the second VEC_COND_EXPR is less than or equal to that of the first one.
> The patch add a gimple pattern to handle that.
>
> gcc/ChangeLog:
>
> * match.pd (VCE (a cmp b ? -1 : 0) < 0) ? c : d ---> (VCE ((a
> cmp b) ? (VCE:c) : (VCE:d))): New gimple simplication.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx512vl-blendv-3.c: New test.
> * gcc.target/i386/blendv-3.c: New test.
> ---
>  gcc/match.pd  | 22 +
>  .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
>  gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
>  3 files changed, 74 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index dbc811b2b38..2a69622a300 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5170,6 +5170,28 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
>(vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
>
> +/*  ((VCE (a cmp b ? -1 : 0)) < 0) ? c : d is just
> +(VCE ((a cmp b) ? (VCE c) : (VCE d))) when TYPE_PRECISION of the
> +component type of the outer vec_cond is greater equal the inner one.  */
> +(for cmp (simple_comparison)
> + (simplify
> +  (vec_cond
> +(lt (view_convert@5 (vec_cond@6 (cmp@4 @0 @1)
> +   integer_all_onesp
> +   integer_zerop))
> + integer_zerop) @2 @3)
> +  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
> +   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
> +   && !TYPE_UNSIGNED (TREE_TYPE (@5))
> +   && VECTOR_TYPE_P (TREE_TYPE (@6))
> +   && VECTOR_TYPE_P (type)
> +   && (TYPE_PRECISION (TREE_TYPE (type))
> + <= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6
> +   && TYPE_SIZE (type) == TYPE_SIZE (TREE_TYPE (@6)))
> +   (with { tree vtype = TREE_TYPE (@6);}
> + (view_convert:type
> +   (vec_cond @4 (view_convert:vtype @2) (view_convert:vtype @3)))
> +
>  /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
>  (simplify
>   (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c 
> b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> new file mode 100644
> index 000..2777e72ab5f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512vl -mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
> +/* { dg-final { scan-assembler-not {vpcmp} } } */
> +
> +#include "blendv-3.c"
> diff --git a/gcc/testsuite/gcc.target/i386/blendv-3.c 
> b/gcc/testsuite/gcc.target/i386/blendv-3.c
> new file mode 100644
> index 000..fa0fb067a73
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/blendv-3.c
> @@ -0,0 +1,46 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -O2" } */
> +/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
> +/* { dg-final { scan-assembler-not {vpcmp} } } */
> +
> +#include <immintrin.h>
> +
> +__m256i
> +foo (__m256i a, __m256i b, __m256i c)
> +{
> +  return _mm256_blendv_epi8 (a, b, ~c < 0);
> +}
> +
> +__m256d
> +foo1 (__m256d a, __m256d b, __m256i c)
> +{
> +  __m256i d = ~c < 0;
> +  return _mm256_blendv_pd (a, b, (__m256d)d);
> +}
> +
> +__m256
> +foo2 (__m256 a, __m256 b, __m256i c)
> +{
> +  __m256i d = ~c < 0;
> +  return _mm256_blendv_ps (a, b, (__m256)d);
> +}
> +
> +__m128i
> +foo4 (__m128i a, __m128i b, __m128i c)
> +{
> +  return _mm_blendv_epi8 (a, b, ~c < 0);
> +}
> +
> +__m128d
> +foo5 (__m128d a, __m128d b, __m128i c)
> +{
> +  __m128i d = ~c < 0;
> +  return _mm_blendv_pd (a, b, (__m128d)d);
> +}
> +
> +__m128
> +foo6 

Re: [PATCH v3 00/16] Support Intel APX NDD

2023-12-06 Thread Hongtao Liu
On Wed, Dec 6, 2023 at 8:11 PM Uros Bizjak  wrote:
>
> On Wed, Dec 6, 2023 at 9:08 AM Hongyu Wang  wrote:
> >
> > Hi,
> >
> > Following up the discussion of V2 patches in
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-December/639368.html,
> > this patch series add early clobber for all TImode NDD alternatives
> > to avoid any potential overlapping between dest register and src
> > register/memory. Also use get_attr_isa (insn) == ISA_APX_NDD instead of
> > checking alternative at asm output stage.
> >
> > Bootstrapped & regtested on x86_64-pc-linux-gnu{-m32,} and sde.
> >
> > Ok for master?
>
> LGTM, but Hongtao should have the final approval here.
Ok, thanks.
>
> Thanks,
> Uros.
>
> >
> > Hongyu Wang (7):
> >   [APX NDD] Disable seg_prefixed memory usage for NDD add
> >   [APX NDD] Support APX NDD for left shift insns
> >   [APX NDD] Support APX NDD for right shift insns
> >   [APX NDD] Support APX NDD for rotate insns
> >   [APX NDD] Support APX NDD for shld/shrd insns
> >   [APX NDD] Support APX NDD for cmove insns
> >   [APX NDD] Support TImode shift for NDD
> >
> > Kong Lingling (9):
> >   [APX NDD] Support Intel APX NDD for legacy add insn
> >   [APX NDD] Support APX NDD for optimization patterns of add
> >   [APX NDD] Support APX NDD for adc insns
> >   [APX NDD] Support APX NDD for sub insns
> >   [APX NDD] Support APX NDD for sbb insn
> >   [APX NDD] Support APX NDD for neg insn
> >   [APX NDD] Support APX NDD for not insn
> >   [APX NDD] Support APX NDD for and insn
> >   [APX NDD] Support APX NDD for or/xor insn
> >
> >  gcc/config/i386/constraints.md|5 +
> >  gcc/config/i386/i386-expand.cc|  164 +-
> >  gcc/config/i386/i386-options.cc   |2 +
> >  gcc/config/i386/i386-protos.h |   16 +-
> >  gcc/config/i386/i386.cc   |   30 +-
> >  gcc/config/i386/i386.md   | 2325 +++--
> >  gcc/testsuite/gcc.target/i386/apx-ndd-adc.c   |   15 +
> >  gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c  |   16 +
> >  gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c   |6 +
> >  .../gcc.target/i386/apx-ndd-shld-shrd.c   |   24 +
> >  .../gcc.target/i386/apx-ndd-ti-shift.c|   91 +
> >  gcc/testsuite/gcc.target/i386/apx-ndd.c   |  202 ++
> >  12 files changed, 2141 insertions(+), 755 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-adc.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-shld-shrd.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd.c
> >
> > --
> > 2.31.1
> >



-- 
BR,
Hongtao


Re: [PATCH] Don't vectorize when vector stmts are only vec_construct and stores

2023-12-05 Thread Hongtao Liu
On Mon, Dec 4, 2023 at 10:10 PM Richard Biener
 wrote:
>
> On Mon, Dec 4, 2023 at 6:32 AM liuhongt  wrote:
> >
> > i.e. for the below cases:
> >a[0] = b1;
> >a[1] = b2;
> >..
> >a[n] = bn;
> >
> > There're extra dependences when constructing the vector, but not for
> > the scalar stores. According to experiments, it's generally worse.
> >
> > The patch adds a cut-off heuristic for when the vector stmts are just
> > vec_construct and vector store. It improves SPEC2017 a little bit.
> >
> > BenchMarks  Ratio
> > 500.perlbench_r 2.60%
> > 502.gcc_r   0.30%
> > 505.mcf_r   0.40%
> > 520.omnetpp_r   -1.00%
> > 523.xalancbmk_r 0.90%
> > 525.x264_r  0.00%
> > 531.deepsjeng_r 0.30%
> > 541.leela_r 0.90%
> > 548.exchange2_r 3.20%
> > 557.xz_r1.40%
> > 503.bwaves_r0.00%
> > 507.cactuBSSN_r 0.00%
> > 508.namd_r  0.30%
> > 510.parest_r0.00%
> > 511.povray_r0.20%
> > 519.lbm_r   SAME BIN
> > 521.wrf_r   -0.30%
> > 526.blender_r   -1.20%
> > 527.cam4_r  -0.20%
> > 538.imagick_r   4.00%
> > 544.nab_r   0.40%
> > 549.fotonik3d_r 0.00%
> > 554.roms_r  0.00%
> > Geomean-int 0.90%
> > Geomean-fp  0.30%
> > Geomean-all 0.50%
> >
> > And
> > Regressed testcases:
> >
> > gcc.target/i386/part-vect-absneghf.c
> > gcc.target/i386/part-vect-copysignhf.c
> > gcc.target/i386/part-vect-xorsignhf.c
> >
> > Regressed under -m32 since it generates 2 vector
> > .ABS/NEG/XORSIGN/COPYSIGN vs original 1 64-bit vec_construct. The
> > original testcases are used to test vectorization capability for
> > .ABS/NEG/XORSIGN/COPYSIGN, so just restrict the testcases to TARGET_64BIT.
> >
> > gcc.target/i386/pr111023-2.c
> > gcc.target/i386/pr111023.c
> > Regressed under -m32
> >
> > testcase as below
> >
> > void
> > v8hi_v8qi (v8hi *dst, v16qi src)
> > {
> >   short tem[8];
> >   tem[0] = src[0];
> >   tem[1] = src[1];
> >   tem[2] = src[2];
> >   tem[3] = src[3];
> >   tem[4] = src[4];
> >   tem[5] = src[5];
> >   tem[6] = src[6];
> >   tem[7] = src[7];
> >   dst[0] = *(v8hi *) tem;
> > }
> >
> > Under a 64-bit target, the vectorizer realizes it's just a permutation of
> > the original src vector, but under -m32 the vectorizer relies on
> > vec_construct for vectorization. I think the optimization for this case
> > under a 32-bit target may not matter much, so just add
> > -fno-vect-cost-model.
> >
> > gcc.target/i386/pr91446.c: This testcase is guard for cost model of
> > vector store, not vectorization capability, so just adjust testcase.
> >
> > gcc.target/i386/pr108938-3.c: This testcase relies on vec_construct to
> > optimize for bswap; as with other optimizations, the vectorizer can't
> > realize the optimization after it. So the current solution is to add
> > -fno-vect-cost-model to the testcase.
> >
> > costmodel-pr104582-1.c
> > costmodel-pr104582-2.c
> > costmodel-pr104582-4.c
> >
> > Failed since it's now not vectorized, looked at the PR, it's exactly
> > what's wanted, so adjust testcase to scan-tree-dump-not.
> >
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> So the original motivation to not more aggressively prune
> store-from-CTOR vectorization in the vectorizer itself is that
> the vector store is possibly better for STLF (larger stores are
> good, larger loads eventually problematic).

That's exactly what I worried about, and I didn't observe any STLF
stall in SPEC2017; I'll try with more benchmarks.
But on the other hand, the cost model is not suitable for solving this
problem; at best it only circumvents part of it.
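
A minimal concrete instance of the shape the cut-off targets (the function
name and the V4SI choice here are just for illustration):

void
store4 (int *a, int b0, int b1, int b2, int b3)
{
  /* Scalar code: four independent stores.  Vectorized, this becomes a
     vec_construct of {b0, b1, b2, b3}, which adds dependences between
     the four values, followed by a single V4SI store.  */
  a[0] = b0;
  a[1] = b1;
  a[2] = b2;
  a[3] = b3;
}
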

>
> I'd also expect the costs to play out to not make those profitable.
>
> OTOH, if you have a series of 'double' stores you can convert to
> a series of V2DF stores you _may_ be faster if this reduces
> pressure on the store unit.  Esp. V2DF is cheap to construct
> with one movhpd.

>
> So I don't think we want to try to pattern match it this way?
>
> In fact the SLP vectorization cases could all arrive with an
> SLP node specified (vectorizable_store would have to be
> changed here), which means you could check for an
> vect_external_def child instead?
>
> But as said, I would hope that we can arrive at a better way
> assessing the CONSTRUCTOR cost.  IMHO one big issue
> is that load and store cost are comparatively high compared
> to simple stmt ops so it's very hard to offset saving many
> stores with "ops".  That's because we generally think of
> 'cost' to model latency but as you say stores don't really
> have latency - we only have store bandwidth of the store

Yes.

> unit and of course issue width (but that's true for other ops
> as well).  I wonder what happens if we set both scalar and
> vector store cost to zero?  Or maybe one (to count one
> issue slot)?

I tried to reduce the cost of the scalar store, but it regressed in 

Re: [PATCH] i386: Move vzeroupper pass from after reload pass to after postreload_cse [PR112760]

2023-12-05 Thread Hongtao Liu
On Wed, Dec 6, 2023 at 6:23 AM Jakub Jelinek  wrote:
>
> Hi!
>
> Regardless of the outcome of the REG_UNUSED discussions, I think
> it is a good idea to move the vzeroupper pass one pass later.
> As can be seen in the multiple PRs and as postreload.cc documents,
> reload/LRA is known to create dead statements quite often, which
> is the reason why we have postreload_cse pass at all.
> Doing vzeroupper pass before such cleanup means the pass including
> df_analyze for it needs to process more instructions than needed
> and because mode switching adds note problem, also higher chance of
> having stale REG_UNUSED notes.
> And, I really don't see why vzeroupper can't wait until those cleanups
> are done.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
LGTM.
>
> 2023-12-05  Jakub Jelinek  
>
> PR rtl-optimization/112760
> * config/i386/i386-passes.def (pass_insert_vzeroupper): Insert
> after pass_postreload_cse rather than pass_reload.
> * config/i386/i386-features.cc (rest_of_handle_insert_vzeroupper):
> Adjust comment for it.
>
> * gcc.dg/pr112760.c: New test.
>
> --- gcc/config/i386/i386-passes.def.jj  2023-01-16 11:52:15.960735877 +0100
> +++ gcc/config/i386/i386-passes.def 2023-12-05 19:15:01.748279329 +0100
> @@ -24,7 +24,7 @@ along with GCC; see the file COPYING3.
> REPLACE_PASS (PASS, INSTANCE, TGT_PASS)
>   */
>
> -  INSERT_PASS_AFTER (pass_reload, 1, pass_insert_vzeroupper);
> +  INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper);
>INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */);
>/* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
>   CONSTM1_RTX generated by the STV pass can be CSEed.  */
> --- gcc/config/i386/i386-features.cc.jj 2023-11-02 07:49:15.029894060 +0100
> +++ gcc/config/i386/i386-features.cc2023-12-05 19:15:48.658620698 +0100
> @@ -2627,10 +2627,11 @@ convert_scalars_to_vector (bool timode_p
>  static unsigned int
>  rest_of_handle_insert_vzeroupper (void)
>  {
> -  /* vzeroupper instructions are inserted immediately after reload to
> - account for possible spills from 256bit or 512bit registers.  The pass
> - reuses mode switching infrastructure by re-running mode insertion
> - pass, so disable entities that have already been processed.  */
> +  /* vzeroupper instructions are inserted immediately after reload and
> + postreload_cse to clean up after it a little bit to account for possible
> + spills from 256bit or 512bit registers.  The pass reuses mode switching
> + infrastructure by re-running mode insertion pass, so disable entities
> + that have already been processed.  */
>for (int i = 0; i < MAX_386_ENTITIES; i++)
>  ix86_optimize_mode_switching[i] = 0;
>
> --- gcc/testsuite/gcc.dg/pr112760.c.jj  2023-12-01 13:46:57.444746529 +0100
> +++ gcc/testsuite/gcc.dg/pr112760.c 2023-12-01 13:46:36.729036971 +0100
> @@ -0,0 +1,22 @@
> +/* PR rtl-optimization/112760 */
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fno-dce -fno-guess-branch-probability 
> --param=max-cse-insns=0" } */
> +/* { dg-additional-options "-m8bit-idiv -mavx" { target i?86-*-* x86_64-*-* 
> } } */
> +
> +unsigned g;
> +
> +__attribute__((__noipa__)) unsigned short
> +foo (unsigned short a, unsigned short b)
> +{
> +  unsigned short x = __builtin_add_overflow_p (a, g, (unsigned short) 0);
> +  g -= g / b;
> +  return x;
> +}
> +
> +int
> +main ()
> +{
> +  unsigned short x = foo (40, 6);
> +  if (x != 0)
> +__builtin_abort ();
> +}
>
> Jakub
>


-- 
BR,
Hongtao


Re: [PATCH] Take register pressure into account for vec_construct/scalar_to_vec when the components are not loaded from memory.

2023-12-04 Thread Hongtao Liu
On Mon, Dec 4, 2023 at 3:51 PM Uros Bizjak  wrote:
>
> On Mon, Dec 4, 2023 at 8:11 AM Hongtao Liu  wrote:
> >
> > On Fri, Dec 1, 2023 at 10:26 PM Richard Biener
> >  wrote:
> > >
> > > On Fri, Dec 1, 2023 at 3:39 AM liuhongt  wrote:
> > > >
> > > > > Hmm, I would suggest you put reg_needed into the class and accumulate
> > > > > over all vec_construct, with your patch you pessimize a single v32qi
> > > > > over two separate v16qi for example.  Also currently the whole block 
> > > > > is
> > > > > gated with INTEGRAL_TYPE_P but register pressure would be also
> > > > > a concern for floating point vectors.  finish_cost would then apply an
> > > > > adjustment.
> > > >
> > > > Changed.
> > > >
> > > > > 'target_avail_regs' is for GENERAL_REGS, does that include APX regs?
> > > > > I don't see anything similar for FP regs, but I guess the target 
> > > > > should know
> > > > > or maybe there's a #regs in regclass query already.
> > > > Haven't see any, use below setting.
> > > >
> > > > unsigned target_avail_sse = TARGET_64BIT ? (TARGET_AVX512F ? 32 : 16) : 
> > > > 8;
> > > >
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > No big impact on SPEC2017.
> > > > Observe 1 big improvement from other benchmark by avoiding 
> > > > vectorization with
> > > > vec_construct v32qi which caused lots of spills.
> > > >
> > > > Ok for trunk?
> > >
> > > LGTM, let's see what x86 maintainers think.
> > +Honza and Uros.
> > Any comments?
>
> I have no comment on vector stuff, I think you are the most
> experienced developer in this area.
Thanks, committed.
>
> Uros.
>
> > >
> > > Richard.
> > >
> > > > For vec_contruct, the components must be live at the same time if
> > > > they're not loaded from memory, when the number of those components
> > > > exceeds available registers, spill happens. Try to account that with a
> > > > rough estimation.
> > > > ??? Ideally, we should have an overall estimation of register pressure
> > > > if we know the live range of all variables.
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
> > > > Count sse_reg/gpr_regs for components not loaded from memory.
> > > > (ix86_vector_costs:ix86_vector_costs): New constructor.
> > > > (ix86_vector_costs::m_num_gpr_needed[3]): New private memeber.
> > > > (ix86_vector_costs::m_num_sse_needed[3]): Ditto.
> > > > (ix86_vector_costs::finish_cost): Estimate overall register
> > > > pressure cost.
> > > > (ix86_vector_costs::ix86_vect_estimate_reg_pressure): New
> > > > function.
> > > > ---
> > > >  gcc/config/i386/i386.cc | 54 ++---
> > > >  1 file changed, 50 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > > > index 9390f525b99..dcaea6c2096 100644
> > > > --- a/gcc/config/i386/i386.cc
> > > > +++ b/gcc/config/i386/i386.cc
> > > > @@ -24562,15 +24562,34 @@ ix86_noce_conversion_profitable_p (rtx_insn 
> > > > *seq, struct noce_if_info *if_info)
> > > >  /* x86-specific vector costs.  */
> > > >  class ix86_vector_costs : public vector_costs
> > > >  {
> > > > -  using vector_costs::vector_costs;
> > > > +public:
> > > > +  ix86_vector_costs (vec_info *, bool);
> > > >
> > > >unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
> > > >   stmt_vec_info stmt_info, slp_tree node,
> > > >   tree vectype, int misalign,
> > > >   vect_cost_model_location where) override;
> > > >void finish_cost (const vector_costs *) override;
> > > > +
> > > > +private:
> > > > +
> > > > +  /* Estimate register pressure of the vectorized code.  */
> > > > +  void ix86_vect_estimate_reg_pressure ();
> > > > +  /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used 
> > > > for
> > >

Re: [PATCH v2 00/17] Support Intel APX NDD

2023-12-04 Thread Hongtao Liu
On Tue, Dec 5, 2023 at 10:32 AM Hongyu Wang  wrote:
>
> Hi,
>
> APX NDD patches have been posted at
> https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636604.html
>
> Thanks to Hongtao's review, the V2 patch adds support of zext sematic with
> memory input as NDD by default clear upper bits of dest for any operand size.
>
> Also we support TImode shift with new split helper functions, which allows NDD
> form split but still restric the memory src usage as in post-reload splitter
> the register number is restricted, and no new register can be used for
> shld/shrd.
>
> Also fixed several typo/formatting/redundant code.
Patches LGTM, Please wait a few more days before committing incase
other folks have comments.
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,} and sde.
>
> OK for trunk?
>
> Hongyu Wang (8):
>   [APX NDD] Restrict TImode register usage when NDD enabled
>   [APX NDD] Disable seg_prefixed memory usage for NDD add
>   [APX NDD] Support APX NDD for left shift insns
>   [APX NDD] Support APX NDD for right shift insns
>   [APX NDD] Support APX NDD for rotate insns
>   [APX NDD] Support APX NDD for shld/shrd insns
>   [APX NDD] Support APX NDD for cmove insns
>   [APX NDD] Support TImode shift for NDD
>
> Kong Lingling (9):
>   [APX NDD] Support Intel APX NDD for legacy add insn
>   [APX NDD] Support APX NDD for optimization patterns of add
>   [APX NDD] Support APX NDD for adc insns
>   [APX NDD] Support APX NDD for sub insns
>   [APX NDD] Support APX NDD for sbb insn
>   [APX NDD] Support APX NDD for neg insn
>   [APX NDD] Support APX NDD for not insn
>   [APX NDD] Support APX NDD for and insn
>   [APX NDD] Support APX NDD for or/xor insn
>
>  gcc/config/i386/constraints.md|5 +
>  gcc/config/i386/i386-expand.cc|  164 +-
>  gcc/config/i386/i386-options.cc   |2 +
>  gcc/config/i386/i386-protos.h |   16 +-
>  gcc/config/i386/i386.cc   |   40 +-
>  gcc/config/i386/i386.md   | 2323 +++--
>  gcc/testsuite/gcc.target/i386/apx-ndd-adc.c   |   15 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c  |   16 +
>  gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c   |6 +
>  .../gcc.target/i386/apx-ndd-shld-shrd.c   |   24 +
>  .../gcc.target/i386/apx-ndd-ti-shift.c|   91 +
>  gcc/testsuite/gcc.target/i386/apx-ndd.c   |  202 ++
>  12 files changed, 2149 insertions(+), 755 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-adc.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-cmov.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-sbb.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-shld-shrd.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd.c
>
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] Take register pressure into account for vec_construct/scalar_to_vec when the components are not loaded from memory.

2023-12-03 Thread Hongtao Liu
On Fri, Dec 1, 2023 at 10:26 PM Richard Biener
 wrote:
>
> On Fri, Dec 1, 2023 at 3:39 AM liuhongt  wrote:
> >
> > > Hmm, I would suggest you put reg_needed into the class and accumulate
> > > over all vec_construct, with your patch you pessimize a single v32qi
> > > over two separate v16qi for example.  Also currently the whole block is
> > > gated with INTEGRAL_TYPE_P but register pressure would be also
> > > a concern for floating point vectors.  finish_cost would then apply an
> > > adjustment.
> >
> > Changed.
> >
> > > 'target_avail_regs' is for GENERAL_REGS, does that include APX regs?
> > > I don't see anything similar for FP regs, but I guess the target should 
> > > know
> > > or maybe there's a #regs in regclass query already.
> > Haven't see any, use below setting.
> >
> > unsigned target_avail_sse = TARGET_64BIT ? (TARGET_AVX512F ? 32 : 16) : 8;
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > No big impact on SPEC2017.
> > Observe 1 big improvement from other benchmark by avoiding vectorization 
> > with
> > vec_construct v32qi which caused lots of spills.
> >
> > Ok for trunk?
>
> LGTM, let's see what x86 maintainers think.
+Honza and Uros.
Any comments?
>
> Richard.
>
> > For vec_construct, the components must be live at the same time if
> > they're not loaded from memory; when the number of those components
> > exceeds the available registers, spills happen. Try to account for that with a
> > rough estimation.
> > ??? Ideally, we should have an overall estimation of register pressure
> > if we know the live range of all variables.
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
> > Count sse_reg/gpr_regs for components not loaded from memory.
> > (ix86_vector_costs:ix86_vector_costs): New constructor.
> > (ix86_vector_costs::m_num_gpr_needed[3]): New private memeber.
> > (ix86_vector_costs::m_num_sse_needed[3]): Ditto.
> > (ix86_vector_costs::finish_cost): Estimate overall register
> > pressure cost.
> > (ix86_vector_costs::ix86_vect_estimate_reg_pressure): New
> > function.
> > ---
> >  gcc/config/i386/i386.cc | 54 ++---
> >  1 file changed, 50 insertions(+), 4 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 9390f525b99..dcaea6c2096 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -24562,15 +24562,34 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, 
> > struct noce_if_info *if_info)
> >  /* x86-specific vector costs.  */
> >  class ix86_vector_costs : public vector_costs
> >  {
> > -  using vector_costs::vector_costs;
> > +public:
> > +  ix86_vector_costs (vec_info *, bool);
> >
> >unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
> >   stmt_vec_info stmt_info, slp_tree node,
> >   tree vectype, int misalign,
> >   vect_cost_model_location where) override;
> >void finish_cost (const vector_costs *) override;
> > +
> > +private:
> > +
> > +  /* Estimate register pressure of the vectorized code.  */
> > +  void ix86_vect_estimate_reg_pressure ();
> > +  /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
> > + estimation of register pressure.
> > + ??? Currently it's only used by vec_construct/scalar_to_vec
> > + where we know it's not loaded from memory.  */
> > +  unsigned m_num_gpr_needed[3];
> > +  unsigned m_num_sse_needed[3];
> >  };
> >
> > +ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool 
> > costing_for_scalar)
> > +  : vector_costs (vinfo, costing_for_scalar),
> > +m_num_gpr_needed (),
> > +m_num_sse_needed ()
> > +{
> > +}
> > +
> >  /* Implement targetm.vectorize.create_costs.  */
> >
> >  static vector_costs *
> > @@ -24748,8 +24767,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
> > vect_cost_for_stmt kind,
> >  }
> >else if ((kind == vec_construct || kind == scalar_to_vec)
> >&& node
> > -  && SLP_TREE_DEF_TYPE (node) == vect_external_def
> > -  && INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
> > +  && SLP_TREE_DEF_TYPE (node) == vect_external_def)
> >  {
> >stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, 
> > misalign);
> >unsigned i;
> > @@ -24785,7 +24803,15 @@ ix86_vector_costs::add_stmt_cost (int count, 
> > vect_cost_for_stmt kind,
> >   && (gimple_assign_rhs_code (def) != BIT_FIELD_REF
> >   || !VECTOR_TYPE_P (TREE_TYPE
> > (TREE_OPERAND (gimple_assign_rhs1 (def), 
> > 0))
> > -   stmt_cost += ix86_cost->sse_to_integer;
> > +   {
> > + if (fp)
> > +   m_num_sse_needed[where]++;
> > + else
> > +   {
> > + m_num_gpr_needed[where]++;
> > + 

Re: [PATCH] Set AVOID_256FMA_CHAINS to m_GENERIC as it's generally good for new platforms

2023-11-30 Thread Hongtao Liu
Any comments?

On Wed, Nov 22, 2023 at 12:17 PM liuhongt  wrote:
>
> From: "Zhang, Annita" 
>
> Avoid_fma_chain was enabled in m_SAPPHIRERAPIDS, m_ALDERLAKE and
> m_CORE_HYBRID. It can also be enabled in m_GENERIC to improve the
> performance of -march=x86-64-v3/v4 with -mtune=generic set by
> default. One SPEC2017 benchmark 510.parest_r can improve greatly due
> to it. From the experiments, the single thread with -O2
> -march=x86-64-v3 can improve 26% on SPR, and 15% on Zen3. Meanwhile,
> it didn't cause notable regression in previous platforms including
> Cascade Lake and Ice Lake Server.
>
> On znver4, it looks like fadd (3 cycles) is still faster than fma (4
> cycles). So in theory, avoid_fma_chain should also be better for
> znver4. And according to [1], enabling fma_chain is not a generic win on
> znver4?
>
> cut from [1]---
> I also added X86_TUNE_AVOID_256FMA_CHAINS. Since fma has improved in
> zen4 this flag may not be a win except for very specific benchmarks. I
> am still doing some more detailed testing here.
> -cut end--
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607962.html
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog
>
> * config/i386/x86-tune.def (AVOID_256FMA_CHAINS): Add
> m_GENERIC.
> ---
>  gcc/config/i386/x86-tune.def | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 43fa9e8fd6d..a2e57e01550 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -521,7 +521,7 @@ DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, 
> "avoid_fma_chains", m_ZNVER1 | m_ZNVER2
>  /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
> smaller FMA chain.  */
>  DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
> m_ZNVER3
> - | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM)
> + | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
>
>  /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
> smaller FMA chain.  */
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] Take register pressure into account for vec_construct when the components are not loaded from memory.

2023-11-29 Thread Hongtao Liu
On Wed, Nov 29, 2023 at 3:47 PM Richard Biener
 wrote:
>
> On Tue, Nov 28, 2023 at 8:54 AM liuhongt  wrote:
> >
> > For vec_construct, the components must be live at the same time if
> > they're not loaded from memory; when the number of those components
> > exceeds the available registers, spills happen. Try to account for that with a
> > rough estimation.
> > ??? Ideally, we should have an overall estimation of register pressure
> > if we know the live range of all variables.
> >
> > The patch can avoid regressions due to, e.g., vec_construct with 32 chars.
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> >
> > Ok for trunk?
>
> Hmm, I would suggest you put reg_needed into the class and accumulate
> over all vec_construct, with your patch you pessimize a single v32qi
> over two separate v16qi for example.  Also currently the whole block is
2 separate v16qi (or 4 v8qi) may have different live ranges, but yes,
vec_construct is probably an entry point for vectorized code, so counting
them all also makes sense.

> gated with INTEGRAL_TYPE_P but register pressure would be also
> a concern for floating point vectors.  finish_cost would then apply an
> adjustment.
>
> 'target_avail_regs' is for GENERAL_REGS, does that include APX regs?
Yes, I saw it's used by ivopt which mostly cares about integers.
> I don't see anything similar for FP regs, but I guess the target should know
> or maybe there's a #regs in regclass query already.
For x86, scalar float shares SSE regs with vector registers; for that case,
we may need to count all vector temporary variables to get an
estimation for sse_regs.
But still we don't know the live ranges, so probably let's just start
with vec_construct.
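
As a rough worked example of what the heuristic ends up charging
(illustrative numbers only; target_avail_regs is the GENERAL_REGS count
the middle end already computes):

/* A v32qi built from 32 live char values, none reloaded from memory:
     reg_needed         = 32
     target_avail_regs ~= 16   (x86-64 general registers)
     extra cost         = (32 - 16) * scalar_store cost
   i.e. roughly 16 extra store costs, which is what keeps the vectorizer
   away from the 32-char vec_construct case mentioned above.  */
typedef char v32qi __attribute__ ((vector_size (32)));

v32qi
build (char c0, char c1, char c2, char c3)  /* sketch: the real case has 32 */
{
  return (v32qi) { c0, c1, c2, c3 };        /* unlisted lanes become zero */
}
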
>
> That said, this kind of adjustment looks somewhat appealing.
>
> Richard.
>
> > gcc/ChangeLog:
> >
> > * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Take
> > register pressure into account for vec_construct when the
> > components are not loaded from memory.
> > ---
> >  gcc/config/i386/i386.cc | 22 +-
> >  1 file changed, 21 insertions(+), 1 deletion(-)
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 683ac643bc8..f8417555930 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -24706,6 +24706,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
> > vect_cost_for_stmt kind,
> >stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, 
> > misalign);
> >unsigned i;
> >tree op;
> > +  unsigned reg_needed = 0;
> >FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
> > if (TREE_CODE (op) == SSA_NAME)
> >   TREE_VISITED (op) = 0;
> > @@ -24737,11 +24738,30 @@ ix86_vector_costs::add_stmt_cost (int count, 
> > vect_cost_for_stmt kind,
> >   && (gimple_assign_rhs_code (def) != BIT_FIELD_REF
> >   || !VECTOR_TYPE_P (TREE_TYPE
> > (TREE_OPERAND (gimple_assign_rhs1 (def), 
> > 0))
> > -   stmt_cost += ix86_cost->sse_to_integer;
> > +   {
> > + stmt_cost += ix86_cost->sse_to_integer;
> > + reg_needed++;
> > +   }
> > }
> >FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
> > if (TREE_CODE (op) == SSA_NAME)
> >   TREE_VISITED (op) = 0;
> > +
> > +  /* For vec_contruct, the components must be live at the same time if
> > +they're not loaded from memory, when the number of those components
> > +exceeds available registers, spill happens. Try to account that 
> > with a
> > +rough estimation. Currently only handle integral modes since 
> > scalar fp
> > +shares sse_regs with vectors.
> > +??? Ideally, we should have an overall estimation of register 
> > pressure
> > +if we know the live range of all variables.  */
> > +  if (!fp && kind == vec_construct
> > + && reg_needed > target_avail_regs)
> > +   {
> > + unsigned spill_cost = ix86_builtin_vectorization_cost 
> > (scalar_store,
> > +vectype,
> > +misalign);
> > + stmt_cost += spill_cost * (reg_needed - target_avail_regs);
> > +   }
> >  }
> >if (stmt_cost == -1)
> >  stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
> > --
> > 2.31.1
> >



-- 
BR,
Hongtao


Re: [PATCH] i386: Fix CPUID of USER_MSR.

2023-11-28 Thread Hongtao Liu
On Wed, Nov 29, 2023 at 9:23 AM Hu, Lin1  wrote:
>
> Hi, all
>
> This patch aims to fix the wrong CPUID of USER_MSR: its correct CPUID is
> (0x7, 0x1).EDX[15], but I set it as (0x7, 0x0).EDX[15]. The patch also modifies the
> testcase to give the user a better example.
>
> It has been bootstrapped and regtested on x86-64-pc-linux-gnu, OK for trunk?
Ok.
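
For reference, the corrected location can be probed like this (a minimal
sketch, not from the patch; it only assumes the __get_cpuid_count helper
from GCC's <cpuid.h>):

#include <cpuid.h>

/* USER_MSR lives in CPUID.(EAX=7,ECX=1):EDX[15], the sub-leaf the patch
   moves the check to; it was wrongly read from sub-leaf 0 before.  */
static int
has_user_msr (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count (7, 1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (edx >> 15) & 1;
}
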
>
> BR,
> Lin
>
> gcc/ChangeLog:
>
> * common/config/i386/cpuinfo.h (get_available_features): Move USER_MSR
> to the correct location.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/user_msr-1.c: Correct the MSR index for give the 
> user
> an proper example.
> ---
>  gcc/common/config/i386/cpuinfo.h   | 4 ++--
>  gcc/testsuite/gcc.target/i386/user_msr-1.c | 9 +
>  2 files changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/common/config/i386/cpuinfo.h 
> b/gcc/common/config/i386/cpuinfo.h
> index f90fb4d56a2..a1eb285daed 100644
> --- a/gcc/common/config/i386/cpuinfo.h
> +++ b/gcc/common/config/i386/cpuinfo.h
> @@ -861,8 +861,6 @@ get_available_features (struct __processor_model 
> *cpu_model,
> set_feature (FEATURE_IBT);
>if (edx & bit_UINTR)
> set_feature (FEATURE_UINTR);
> -  if (edx & bit_USER_MSR)
> -   set_feature (FEATURE_USER_MSR);
>if (amx_usable)
> {
>   if (edx & bit_AMX_TILE)
> @@ -921,6 +919,8 @@ get_available_features (struct __processor_model 
> *cpu_model,
> set_feature (FEATURE_PREFETCHI);
>   if (eax & bit_RAOINT)
> set_feature (FEATURE_RAOINT);
> + if (edx & bit_USER_MSR)
> +   set_feature (FEATURE_USER_MSR);
>   if (avx_usable)
> {
>   if (eax & bit_AVXVNNI)
> diff --git a/gcc/testsuite/gcc.target/i386/user_msr-1.c 
> b/gcc/testsuite/gcc.target/i386/user_msr-1.c
> index 447852306df..f315016d088 100644
> --- a/gcc/testsuite/gcc.target/i386/user_msr-1.c
> +++ b/gcc/testsuite/gcc.target/i386/user_msr-1.c
> @@ -1,9 +1,9 @@
>  /* { dg-do compile { target { ! ia32  }  }  } */
>  /* { dg-options "-musermsr -O2"  } */
>  /* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\%r\[a-z\]x, 
> \\%r\[a-z\]x" 1  }  } */
> -/* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\\$121" 1  }  } */
> +/* { dg-final { scan-assembler-times "urdmsr\[ \\t\]\\\$6912" 1  }  } */
>  /* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, 
> \\%r\[a-z\]x" 1  }  } */
> -/* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, \\\$121" 1 
>  }  } */
> +/* { dg-final { scan-assembler-times "uwrmsr\[ \\t\]\\%r\[a-z\]x, \\\$6912" 
> 1  }  } */
>
>  #include <x86gprintrin.h>
>
> @@ -13,8 +13,9 @@ volatile unsigned long long y;
>  void extern
>  user_msr_test (void)
>  {
> +  y = 6913;
>x = _urdmsr(y);
> -  x = _urdmsr(121);
> +  x = _urdmsr(6912);
>_uwrmsr(y, x);
> -  _uwrmsr(121, x);
> +  _uwrmsr(6912, x);
>  }
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] [i386] Fix push2pop2 test fail on non-linux target [PR112729]

2023-11-28 Thread Hongtao Liu
On Tue, Nov 28, 2023 at 9:51 PM Hongyu Wang  wrote:
>
> Hi,
>
> On Linux x86-64, -fomit-frame-pointer is enabled by default, so the
> push2pop2 tests' CFI scans are based on it. On other targets with
> -fno-omit-frame-pointer the CFI scan will be wrong, as the frame pointer
> is pushed first. Add -fomit-frame-pointer to the tests that rely on
> CFI scans.
>
> OK for master?
Ok.
>
> gcc/testsuite/ChangeLog:
>
> PR target/112729
> * gcc.target/i386/apx-interrupt-1.c: Add -fomit-frame-pointer.
> * gcc.target/i386/apx-push2pop2-1.c: Likewise.
> * gcc.target/i386/apx-push2pop2_force_drap-1.c: Likewise.
> ---
>  gcc/testsuite/gcc.target/i386/apx-interrupt-1.c| 2 +-
>  gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c| 2 +-
>  gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c | 2 +-
>  3 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c 
> b/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c
> index ffcb8fce71c..6844e574d00 100644
> --- a/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c
> +++ b/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { ! ia32 } } } */
> -/* { dg-options "-mapx-features=egpr -m64 -O2 -mgeneral-regs-only -mno-cld 
> -mno-push-args -maccumulate-outgoing-args" } */
> +/* { dg-options "-mapx-features=egpr -m64 -O2 -mgeneral-regs-only -mno-cld 
> -mno-push-args -maccumulate-outgoing-args -fomit-frame-pointer" } */
>  /* { dg-skip-if "does not emit .cfi_xxx" "*-*-darwin*" } */
>
>  extern void foo (void *) __attribute__ ((interrupt));
> diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c 
> b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> index d78c96d36a3..5f43b42e33f 100644
> --- a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { ! ia32 } } } */
> -/* { dg-options "-O2 -mapx-features=push2pop2" } */
> +/* { dg-options "-O2 -mapx-features=push2pop2 -fomit-frame-pointer" } */
>  /* { dg-skip-if "does not emit .cfi_xxx" "*-*-darwin*" } */
>
>  extern int bar (int);
> diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c 
> b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
> index 3cac7b10769..4e2259f0c99 100644
> --- a/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
> +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { ! ia32 } } } */
> -/* { dg-options "-O2 -mapx-features=push2pop2 -mforce-drap" } */
> +/* { dg-options "-O2 -mapx-features=push2pop2 -fomit-frame-pointer 
> -mforce-drap" } */
>  /* { dg-skip-if "does not emit .cfi_xxx" "*-*-darwin*" } */
>
>  #include "apx-push2pop2-1.c"
> --
> 2.31.1
>


-- 
BR,
Hongtao


Re: [PATCH] i386: Fix AVX512 and AVX10 option issues

2023-11-23 Thread Hongtao Liu
On Thu, Nov 23, 2023 at 2:10 PM Haochen Jiang  wrote:
>
> Hi all,
>
> This patch should be able to fix the current issue mentioned in PR112643.
>
> Also, I fixed some legacy issues in code related to AVX512/AVX10.
>
> Ok for trunk?
Ok
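For context, the -march=native scenario being fixed looks roughly like this
(the exact expansion below is illustrative, not copied from the driver): on an
AVX512-capable machine without AVX10,

  gcc -march=native ...

used to expand to something like

  -march=skylake-avx512 ... -mno-avx10.1-256 -mno-avx10.1-512

and the negated AVX10 options then triggered warnings in every compilation
during a native GCC bootstrap; after the change the -mno-avx10.1-{256,512}
flags are simply never appended.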
>
> Thx,
> Haochen
>
> gcc/ChangeLog:
>
> PR target/112643
> * config/i386/driver-i386.cc (check_avx10_avx512_features):
> Renamed to ...
> (check_avx512_features): this and remove avx10 check.
> (host_detect_local_cpu): Never append -mno-avx10.1-{256,512} to
> avoid emitting warnings when building GCC with native arch.
> * config/i386/i386-builtin.def (BDESC): Add missing AVX512VL for
> 128/256 bit builtin for AVX512VP2INTERSECT.
> * config/i386/i386-options.cc (ix86_option_override_internal):
> Also check whether the AVX512 flags is set when trying to reset.
> * config/i386/i386.h
> (PTA_SKYLAKE_AVX512): Add missing PTA_EVEX512.
> (PTA_ZNVER4): Ditto.
> ---
>  gcc/config/i386/driver-i386.cc   | 19 +--
>  gcc/config/i386/i386-builtin.def |  8 
>  gcc/config/i386/i386-options.cc  |  8 +---
>  gcc/config/i386/i386.h   |  4 ++--
>  4 files changed, 20 insertions(+), 19 deletions(-)
>
> diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc
> index ae67efc49c3..204600e128a 100644
> --- a/gcc/config/i386/driver-i386.cc
> +++ b/gcc/config/i386/driver-i386.cc
> @@ -377,15 +377,10 @@ detect_caches_intel (bool xeon_mp, unsigned max_level,
> enabled and the other disabled.  Add this function to avoid push "-mno-"
> options under this scenario for -march=native.  */
>
> -bool check_avx10_avx512_features (__processor_model _model,
> - unsigned int 
> (_features2)[SIZE_OF_CPU_FEATURES],
> - const enum processor_features feature)
> +bool check_avx512_features (__processor_model _model,
> +   unsigned int 
> (_features2)[SIZE_OF_CPU_FEATURES],
> +   const enum processor_features feature)
>  {
> -  if (has_feature (FEATURE_AVX512F)
> -  && ((feature == FEATURE_AVX10_1_256)
> - || (feature == FEATURE_AVX10_1_512)))
> -return false;
> -
>if (has_feature (FEATURE_AVX10_1_256)
>&& ((feature == FEATURE_AVX512F)
>   || (feature == FEATURE_AVX512CD)
> @@ -900,8 +895,12 @@ const char *host_detect_local_cpu (int argc, const char 
> **argv)
>   options = concat (options, " ",
> isa_names_table[i].option, NULL);
>   }
> -   else if (check_avx10_avx512_features (cpu_model, cpu_features2,
> - isa_names_table[i].feature))
> +   /* Never push -mno-avx10.1-{256,512} under -march=native to
> +  avoid unnecessary warnings when building librarys.  */
> +   else if ((isa_names_table[i].feature != FEATURE_AVX10_1_256)
> +&& (isa_names_table[i].feature != FEATURE_AVX10_1_512)
> +&& check_avx512_features (cpu_model, cpu_features2,
> +  isa_names_table[i].feature))
>   options = concat (options, neg_option,
> isa_names_table[i].option + 2, NULL);
>   }
> diff --git a/gcc/config/i386/i386-builtin.def 
> b/gcc/config/i386/i386-builtin.def
> index 19fa5c107c7..7a5f2676999 100644
> --- a/gcc/config/i386/i386-builtin.def
> +++ b/gcc/config/i386/i386-builtin.def
> @@ -301,10 +301,10 @@ BDESC (OPTION_MASK_ISA_AVX512BW, 
> OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512bw_sto
>  /* AVX512VP2INTERSECT */
>  BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, 
> CODE_FOR_nothing, "__builtin_ia32_2intersectd512", 
> IX86_BUILTIN_2INTERSECTD512, UNKNOWN, (int) VOID_FTYPE_PUHI_PUHI_V16SI_V16SI)
>  BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT | OPTION_MASK_ISA2_EVEX512, 
> CODE_FOR_nothing, "__builtin_ia32_2intersectq512", 
> IX86_BUILTIN_2INTERSECTQ512, UNKNOWN, (int) VOID_FTYPE_PUQI_PUQI_V8DI_V8DI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
> "__builtin_ia32_2intersectd256", IX86_BUILTIN_2INTERSECTD256, UNKNOWN, (int) 
> VOID_FTYPE_PUQI_PUQI_V8SI_V8SI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
> "__builtin_ia32_2intersectq256", IX86_BUILTIN_2INTERSECTQ256, UNKNOWN, (int) 
> VOID_FTYPE_PUQI_PUQI_V4DI_V4DI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
> "__builtin_ia32_2intersectd128", IX86_BUILTIN_2INTERSECTD128, UNKNOWN, (int) 
> VOID_FTYPE_PUQI_PUQI_V4SI_V4SI)
> -BDESC (0, OPTION_MASK_ISA2_AVX512VP2INTERSECT, CODE_FOR_nothing, 
> "__builtin_ia32_2intersectq128", IX86_BUILTIN_2INTERSECTQ128, UNKNOWN, (int) 
> VOID_FTYPE_PUQI_PUQI_V2DI_V2DI)
> +BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512VP2INTERSECT, 
> CODE_FOR_nothing, 

Re: [PATCH] [APX PUSH2POP2] Adjust operand order for PUSH2POP2

2023-11-21 Thread Hongtao Liu
On Wed, Nov 22, 2023 at 11:31 AM Hongyu Wang  wrote:
>
> Hi,
>
> The push2/pop2 operand order does not match the binutils implementation
> for AT&T syntax, which pushes operands[2] first and then operands[1].
> Correct it by reversing the operand order for AT&T syntax.
>
> Bootstrapped/regtested on x86_64-pc-linux-gnu{-m32,}
>
> Ok for master?
Ok.
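For illustration (the register choice is just an example matching the adjusted
test scans), the corrected template prints the same insn as:

  AT&T syntax:   push2  %rbp, %r12
  Intel syntax:  push2  r12, rbp

i.e. the two operands swap places between the two syntaxes, matching what
binutils expects for AT&T.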
>
> gcc/ChangeLog:
>
> * config/i386/i386.md (push2_di): Adjust operand order for AT&T
> syntax.
> (pop2_di): Likewise.
> (push2p_di): Likewise.
> (pop2p_di): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/apx-push2pop2-1.c: Adjust output scan.
> * gcc.target/i386/apx-push2pop2_force_drap-1.c: Likewise.
> ---
>  gcc/config/i386/i386.md   | 8 
>  gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c   | 8 
>  .../gcc.target/i386/apx-push2pop2_force_drap-1.c  | 8 
>  3 files changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 03e4ddd3037..99bb909b244 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -3772,7 +3772,7 @@ (define_insn "push2_di"
> (match_operand:DI 2 "register_operand" "r")]
> UNSPEC_APXPUSH2))]
>"TARGET_APX_PUSH2POP2"
> -  "push2\t%1, %2"
> +  "push2\t{%2, %1|%1, %2}"
>[(set_attr "mode" "TI")
> (set_attr "type" "multi")
> (set_attr "prefix" "evex")])
> @@ -3784,7 +3784,7 @@ (define_insn "pop2_di"
>   (set (match_operand:DI 2 "register_operand" "=r")
>(unspec:DI [(const_int 0)] UNSPEC_APXPOP2_HIGH))])]
>"TARGET_APX_PUSH2POP2"
> -  "pop2\t%0, %2"
> +  "pop2\t{%2, %0|%0, %2}"
>[(set_attr "mode" "TI")
> (set_attr "prefix" "evex")])
>
> @@ -3811,7 +3811,7 @@ (define_insn "push2p_di"
> UNSPEC_APXPUSH2))
> (unspec:DI [(const_int 0)] UNSPEC_APX_PPX)]
>"TARGET_APX_PUSH2POP2 && TARGET_APX_PPX"
> -  "push2p\t%1, %2"
> +  "push2p\t{%2, %1|%1, %2}"
>[(set_attr "mode" "TI")
> (set_attr "type" "multi")
> (set_attr "prefix" "evex")])
> @@ -3824,7 +3824,7 @@ (define_insn "pop2p_di"
>(unspec:DI [(const_int 0)] UNSPEC_APXPOP2_HIGH))
>   (unspec:DI [(const_int 0)] UNSPEC_APX_PPX)])]
>"TARGET_APX_PUSH2POP2 && TARGET_APX_PPX"
> -  "pop2p\t%0, %2"
> +  "pop2p\t{%2, %0|%0, %2}"
>[(set_attr "mode" "TI")
> (set_attr "prefix" "evex")])
>
> diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c 
> b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> index c53112758a5..d78c96d36a3 100644
> --- a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c
> @@ -24,11 +24,11 @@ void foo ()
>  /* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 16" 2 } } */
>  /* { dg-final { scan-assembler-times "pushq\[^\n\r]*%r15(?:\n|\[ \\t\]+#)" 1 
> } } */
>  /* { dg-final { scan-assembler-times ".cfi_offset 15, -16(?:\n|\[ \\t\]+#)" 
> 1 } } */
> -/* { dg-final { scan-assembler-times "push2\[\\t 
> \]*\[^\n\r]*%r14\[^\n\r]*%r13\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "push2\[\\t 
> \]*\[^\n\r]*%r13\[^\n\r]*%r14\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 32" 2 } } */
>  /* { dg-final { scan-assembler-times ".cfi_offset 14, -24(?:\n|\[ \\t\]+#)" 
> 1 } } */
>  /* { dg-final { scan-assembler-times ".cfi_offset 13, -32(?:\n|\[ \\t\]+#)" 
> 1 } } */
> -/* { dg-final { scan-assembler-times "push2\[\\t 
> \]*\[^\n\r]*%r12\[^\n\r]*%rbp\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "push2\[\\t 
> \]*\[^\n\r]*%rbp\[^\n\r]*%r12\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 48" 2 } } */
>  /* { dg-final { scan-assembler-times ".cfi_offset 12, -40(?:\n|\[ \\t\]+#)" 
> 1 } } */
>  /* { dg-final { scan-assembler-times ".cfi_offset 6, -48(?:\n|\[ \\t\]+#)" 1 
> } } */
> @@ -36,10 +36,10 @@ void foo ()
>  /* { dg-final { scan-assembler-times ".cfi_def_cfa_offset 56" 2 } } */
>  /* { dg-final { scan-assembler-times ".cfi_offset 3, -56(?:\n|\[ \\t\]+#)" 1 
> } } */
>  /* { dg-final { scan-assembler-times "popq\[^\n\r]*rbx(?:\n|\[ \\t\]+#)" 1 } 
> } */
> -/* { dg-final { scan-assembler-times "pop2\[\\t 
> \]*\[^\n\r]*%rbp\[^\n\r]*%r12\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pop2\[\\t 
> \]*\[^\n\r]*%r12\[^\n\r]*%rbp\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times ".cfi_restore 12(?:\n|\[ \\t\]+#)" 1 } 
> } */
>  /* { dg-final { scan-assembler-times ".cfi_restore 6(?:\n|\[ \\t\]+#)" 1 } } 
> */
> -/* { dg-final { scan-assembler-times "pop2\[\\t 
> \]*\[^\n\r]*%r13\[^\n\r]*%r14\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "pop2\[\\t 
> \]*\[^\n\r]*%r14\[^\n\r]*%r13\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */
>  

Re: [PATCH] [APX PPX] Support Intel APX PPX

2023-11-20 Thread Hongtao Liu
On Tue, Nov 21, 2023 at 1:45 PM Hongyu Wang  wrote:
>
> > I think you just need to add an extra (unspec [(const_int 0)]
> > UNSPEC_PPX) to origin push_di pattern, then no need for particular
> > handling of dwarf and cfa stuffs.
> > Ditto for popp and push2p and pop2p.
>
> Yes, such change also worked and no cfa adjustment required then,
> thanks for the suggestion.
> Updated patch with just 1 new UNSPEC and removed cfa handling.
LGTM.
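As a rough illustration of the pairing requirement (registers chosen
arbitrarily, not taken from the patch), the PPX-hinted forms emitted in a
prologue/epilogue look like:

  pushp   %rbx          # PPX-hinted push in the prologue
  ...
  popp    %rbx          # matching PPX-hinted pop in the epilogue

and likewise push2p/pop2p for the paired variants; the hint only helps when
each pushp/push2p has a matching popp/pop2p of the same registers, which is
why the patch restricts emission to callee-saved register save/restore.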
>
> Hongtao Liu  于2023年11月20日周一 14:46写道:
> >
> > On Fri, Nov 17, 2023 at 3:26 PM Hongyu Wang  wrote:
> > >
> > > Intel APX PPX feature has been released in [1].
> > >
> > > PPX stands for Push-Pop Acceleration. PUSH/PUSH2 and its corresponding POP
> > > can be marked with a 1-bit hint to indicate that the POP reads the
> > > value written by the PUSH from the stack. The processor tracks these 
> > > marked
> > > instructions internally and fast-forwards register data between
> > > matching PUSH and POP instructions, without going through memory or
> > > through the training loop of the Fast Store Forwarding Predictor (FSFP).
> > > This feature can also be adopted to PUSH2/POP2.
> > >
> > > For GCC, we emit explicit suffix 'p' (paired) to indicate the push/pop
> > > pair is marked with the PPX hint. To separate from the original push/pop, we
> > > use UNSPEC to restrict the PPX related patterns. So for pushp/popp, the
> > > cfi is manually adjusted for the UNSPEC PPX insns.
> > >
> > > In the first implementation we only emit them under prologue/epilogue
> > > when saving/restoring callee-saved registers to make sure push/pop are
> > > paired. So an extra flag was added to check if PPX insns can be emitted
> > > for those register save/restore interfaces.
> > >
> > > The PPX hint is purely a performance hint. If the 'p' suffix is not
> > > emitted for paired push/pop, the PPX optimization will be disabled,
> > > while program semantics will not be affected at all.
> > >
> > > Bootstrapped/regtest on x86-64-pc-linux-gnu{-m32,}.
> > >
> > > Ok for master?
> > >
> > > [1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.ht
> > >
> > > gcc/ChangeLog:
> > >
> > > * config/i386/i386-opts.h (enum apx_features): Add apx_ppx, add
> > > it to apx_all.
> > > * config/i386/i386.cc (ix86_emit_restore_reg_using_pop): Add
> > > ppx_p parameter for function declaration.
> > > (gen_push2): Add ppx_p parameter, emit push2p if ppx_p is true.
> > > (ix86_emit_restore_reg_using_pop2): Likewise for pop2p.
> > > (gen_pushp): New function to emit pushp and adjust cfi.
> > > (ix86_emit_save_regs): Emit pushp/push2p under TARGET_APX_PPX.
> > > (ix86_emit_restore_reg_using_pop): Add ppx_p, emit popp insn
> > > and adjust cfi when ppx_p is true.
> > > (ix86_emit_restore_reg_using_pop2): Add ppx_p and parse to its
> > > callee.
> > > (ix86_emit_restore_regs_using_pop2): Likewise.
> > > (ix86_expand_epilogue): Parse TARGET_APX_PPX to
> > > ix86_emit_restore_reg_using_pop.
> > > * config/i386/i386.h (TARGET_APX_PPX): New.
> > > * config/i386/i386.md (UNSPEC_APXPUSHP): New unspec.
> > > (UNSPEC_APXPOPP): Likewise.
> > > (UNSPEC_APXPUSH2P): Likewise.
> > > (UNSPEC_APXPOP2P_LOW): Likewise.
> > > (UNSPEC_APXPOP2P_HIGH): Likewise.
> > > (pushp_di): New define_insn.
> > > (popp_di): Likewise.
> > > (push2p_di): Likewise.
> > > (pop2p_di): Likewise.
> > > * config/i386/i386.opt: Add apx_ppx enum.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > * gcc.target/i386/apx-interrupt-1.c: Adjust option to restrict 
> > > them
> > > under certain subfeatures.
> > > * gcc.target/i386/apx-push2pop2-1.c: Likewise.
> > > * gcc.target/i386/apx-push2pop2_force_drap-1.c: Likewise.
> > > * gcc.target/i386/apx-push2pop2_interrupt-1.c: Likewise.
> > > * gcc.target/i386/apx-ppx-1.c: New test.
> > > ---
> > >  gcc/config/i386/i386-opts.h   |   3 +-
> > >  gcc/config/i386/i386.cc   | 113 ++
> > >  gcc/config/i386/i386.h|   1 +
> > >  gcc/config

Re: [PATCH] [APX PPX] Support Intel APX PPX

2023-11-19 Thread Hongtao Liu
On Fri, Nov 17, 2023 at 3:26 PM Hongyu Wang  wrote:
>
> Intel APX PPX feature has been released in [1].
>
> PPX stands for Push-Pop Acceleration. PUSH/PUSH2 and its corresponding POP
> can be marked with a 1-bit hint to indicate that the POP reads the
> value written by the PUSH from the stack. The processor tracks these marked
> instructions internally and fast-forwards register data between
> matching PUSH and POP instructions, without going through memory or
> through the training loop of the Fast Store Forwarding Predictor (FSFP).
> This feature can also be adopted to PUSH2/POP2.
>
> For GCC, we emit explicit suffix 'p' (paired) to indicate the push/pop
> pair is marked with the PPX hint. To separate from the original push/pop, we
> use UNSPEC to restrict the PPX related patterns. So for pushp/popp, the
> cfi is manually adjusted for the UNSPEC PPX insns.
>
> In the first implementation we only emit them under prologue/epilogue
> when saving/restoring callee-saved registers to make sure push/pop are
> paired. So an extra flag was added to check if PPX insns can be emitted
> for those register save/restore interfaces.
>
> The PPX hint is purely a performance hint. If the 'p' suffix is not
> emitted for paired push/pop, the PPX optimization will be disabled,
> while program semantics will not be affected at all.
>
> Bootstrapped/regtest on x86-64-pc-linux-gnu{-m32,}.
>
> Ok for master?
>
> [1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.ht
>
> gcc/ChangeLog:
>
> * config/i386/i386-opts.h (enum apx_features): Add apx_ppx, add
> it to apx_all.
> * config/i386/i386.cc (ix86_emit_restore_reg_using_pop): Add
> ppx_p parameter for function declaration.
> (gen_push2): Add ppx_p parameter, emit push2p if ppx_p is true.
> (ix86_emit_restore_reg_using_pop2): Likewise for pop2p.
> (gen_pushp): New function to emit pushp and adjust cfi.
> (ix86_emit_save_regs): Emit pushp/push2p under TARGET_APX_PPX.
> (ix86_emit_restore_reg_using_pop): Add ppx_p, emit popp insn
> and adjust cfi when ppx_p is true.
> (ix86_emit_restore_reg_using_pop2): Add ppx_p and parse to its
> callee.
> (ix86_emit_restore_regs_using_pop2): Likewise.
> (ix86_expand_epilogue): Parse TARGET_APX_PPX to
> ix86_emit_restore_reg_using_pop.
> * config/i386/i386.h (TARGET_APX_PPX): New.
> * config/i386/i386.md (UNSPEC_APXPUSHP): New unspec.
> (UNSPEC_APXPOPP): Likewise.
> (UNSPEC_APXPUSH2P): Likewise.
> (UNSPEC_APXPOP2P_LOW): Likewise.
> (UNSPEC_APXPOP2P_HIGH): Likewise.
> (pushp_di): New define_insn.
> (popp_di): Likewise.
> (push2p_di): Likewise.
> (pop2p_di): Likewise.
> * config/i386/i386.opt: Add apx_ppx enum.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/apx-interrupt-1.c: Adjust option to restrict them
> under certain subfeatures.
> * gcc.target/i386/apx-push2pop2-1.c: Likewise.
> * gcc.target/i386/apx-push2pop2_force_drap-1.c: Likewise.
> * gcc.target/i386/apx-push2pop2_interrupt-1.c: Likewise.
> * gcc.target/i386/apx-ppx-1.c: New test.
> ---
>  gcc/config/i386/i386-opts.h   |   3 +-
>  gcc/config/i386/i386.cc   | 113 ++
>  gcc/config/i386/i386.h|   1 +
>  gcc/config/i386/i386.md   |  47 +++-
>  gcc/config/i386/i386.opt  |   3 +
>  .../gcc.target/i386/apx-interrupt-1.c |   2 +-
>  gcc/testsuite/gcc.target/i386/apx-ppx-1.c |   9 ++
>  .../gcc.target/i386/apx-push2pop2-1.c |   2 +-
>  .../i386/apx-push2pop2_force_drap-1.c |   2 +-
>  .../i386/apx-push2pop2_interrupt-1.c  |   2 +-
>  10 files changed, 158 insertions(+), 26 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/apx-ppx-1.c
>
> diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
> index 2ec76a16bce..4d293edb399 100644
> --- a/gcc/config/i386/i386-opts.h
> +++ b/gcc/config/i386/i386-opts.h
> @@ -139,7 +139,8 @@ enum apx_features {
>apx_egpr = 1 << 0,
>apx_push2pop2 = 1 << 1,
>apx_ndd = 1 << 2,
> -  apx_all = apx_egpr | apx_push2pop2 | apx_ndd,
> +  apx_ppx = 1 << 3,
> +  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx,
>  };
>
>  #endif
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 683ac643bc8..df2fc236c0a 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -105,7 +105,7 @@ along with GCC; see the file COPYING3.  If not see
>  static rtx legitimize_dllimport_symbol (rtx, bool);
>  static rtx legitimize_pe_coff_extern_decl (rtx, bool);
>  static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
> -static void ix86_emit_restore_reg_using_pop (rtx);
> +static void ix86_emit_restore_reg_using_pop (rtx, bool = 

Re: [PATCH] Initial support for AVX10.1

2023-11-19 Thread Hongtao Liu
On Fri, Nov 10, 2023 at 9:42 AM Haochen Jiang  wrote:
>
> gcc/ChangeLog:
>
> * common/config/i386/cpuinfo.h (get_available_features):
> Add avx10_set and version and detect avx10.1.
> (cpu_indicator_init): Handle avx10.1-512.
> * common/config/i386/i386-common.cc
> (OPTION_MASK_ISA2_AVX10_1_256_SET): New.
> (OPTION_MASK_ISA2_AVX10_1_256_SET): Ditto.
> (OPTION_MASK_ISA2_AVX10_1_512_UNSET): Ditto.
> (OPTION_MASK_ISA2_AVX10_1_512_UNSET): Ditto.
> (OPTION_MASK_ISA2_AVX2_UNSET): Modify for AVX10.1.
> (ix86_handle_option): Handle -mavx10.1-256 and -mavx10.1-512.
> Add indicator for explicit no-avx512 and no-avx10.1 options.
> * common/config/i386/i386-cpuinfo.h (enum processor_features):
> Add FEATURE_AVX10_1_256 and FEATURE_AVX10_1_512.
> * common/config/i386/i386-isas.h: Add ISA_NAME_TABLE_ENTRY for
> AVX10_1_256 and AVX10_1_512.
> * config/i386/cpuid.h (bit_AVX10): New.
> (bit_AVX10_256): Ditto.
> (bit_AVX10_512): Ditto.
> * config/i386/driver-i386.cc (check_avx10_avx512_features): New.
> (host_detect_local_cpu): Do not append "-mno-" options under
> specific scenarios to avoid emitting a warning.
> * config/i386/i386-isa.def
> (EVEX512): Add DEF_PTA(EVEX512).
> (AVX10_1_256): Add DEF_PTA(AVX10_1_256).
> (AVX10_1_512): Add DEF_PTA(AVX10_1_512).
> * config/i386/i386-options.cc (isa2_opts): Add -mavx10.1-256 and
> -mavx10.1-512.
> (ix86_function_specific_save): Save explicit no indicator.
> (ix86_function_specific_restore): Restore explicit no indicator.
> (ix86_valid_target_attribute_inner_p): Handle avx10.1, avx10.1-256 and
> avx10.1-512.
> (ix86_valid_target_attribute_tree): Handle avx512 function
> attributes with avx10.1 command line option.
> (ix86_option_override_internal): Handle AVX10.1 options.
> * config/i386/i386.h: Add PTA_EVEX512 for AVX512 target
> machines.
> * config/i386/i386.opt: Add variable ix86_no_avx512_explicit and
> ix86_no_avx10_1_explicit, option -mavx10.1, -mavx10.1-256 and
> -mavx10.1-512.
> * doc/extend.texi: Document avx10.1, avx10.1-256 and avx10.1-512.
> * doc/invoke.texi: Document -mavx10.1, -mavx10.1-256 and 
> -mavx10.1-512.
> * doc/sourcebuild.texi: Document target avx10.1, avx10.1-256
> and avx10.1-512.
The main concern about AVX10 is related to the arch level (x86-64-v4),
which is discussed in another thread [1].
So I'm going to approve this patch.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636285.html
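A short usage sketch of the new options (the source file name is arbitrary;
the option semantics are as described in the ChangeLog and the AVX10 RFC):

  gcc -mavx10.1-256 test.c   # enables the AVX512-based AVX10.1 features,
                             # with 256-bit vectors only
  gcc -mavx10.1-512 test.c   # additionally sets EVEX512, allowing 512-bit
                             # vectors
  gcc -mavx10.1 test.c       # alias of -mavx10.1-256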
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx10_1-1.c: New test.
> * gcc.target/i386/avx10_1-10.c: Ditto.
> * gcc.target/i386/avx10_1-11.c: Ditto.
> * gcc.target/i386/avx10_1-12.c: Ditto.
> * gcc.target/i386/avx10_1-13.c: Ditto.
> * gcc.target/i386/avx10_1-14.c: Ditto.
> * gcc.target/i386/avx10_1-15.c: Ditto.
> * gcc.target/i386/avx10_1-16.c: Ditto.
> * gcc.target/i386/avx10_1-17.c: Ditto.
> * gcc.target/i386/avx10_1-18.c: Ditto.
> * gcc.target/i386/avx10_1-19.c: Ditto.
> * gcc.target/i386/avx10_1-2.c: Ditto.
> * gcc.target/i386/avx10_1-20.c: Ditto.
> * gcc.target/i386/avx10_1-21.c: Ditto.
> * gcc.target/i386/avx10_1-22.c: Ditto.
> * gcc.target/i386/avx10_1-23.c: Ditto.
> * gcc.target/i386/avx10_1-3.c: Ditto.
> * gcc.target/i386/avx10_1-4.c: Ditto.
> * gcc.target/i386/avx10_1-5.c: Ditto.
> * gcc.target/i386/avx10_1-6.c: Ditto.
> * gcc.target/i386/avx10_1-7.c: Ditto.
> * gcc.target/i386/avx10_1-8.c: Ditto.
> * gcc.target/i386/avx10_1-9.c: Ditto.
> ---
>  gcc/common/config/i386/cpuinfo.h   |  33 ++
>  gcc/common/config/i386/i386-common.cc  |  55 -
>  gcc/common/config/i386/i386-cpuinfo.h  |   2 +
>  gcc/common/config/i386/i386-isas.h |   3 +
>  gcc/config/i386/cpuid.h|   5 +
>  gcc/config/i386/driver-i386.cc |  43 ++-
>  gcc/config/i386/i386-isa.def   |   3 +
>  gcc/config/i386/i386-options.cc| 132 +++--
>  gcc/config/i386/i386.h |   2 +-
>  gcc/config/i386/i386.opt   |  30 +
>  gcc/doc/extend.texi|  15 +++
>  gcc/doc/invoke.texi|  17 ++-
>  gcc/doc/sourcebuild.texi   |   9 ++
>  gcc/testsuite/gcc.target/i386/avx10_1-1.c  |  22 
>  gcc/testsuite/gcc.target/i386/avx10_1-10.c |   6 +
>  gcc/testsuite/gcc.target/i386/avx10_1-11.c |   6 +
>  gcc/testsuite/gcc.target/i386/avx10_1-12.c |   6 +
>  gcc/testsuite/gcc.target/i386/avx10_1-13.c |  13 ++
>  gcc/testsuite/gcc.target/i386/avx10_1-14.c |  13 ++
>  

Re: [PATCH] [i386] APX: Fix EGPR usage in several patterns.

2023-11-15 Thread Hongtao Liu
On Wed, Nov 15, 2023 at 5:43 PM Hongyu Wang  wrote:
>
> Hi,
>
> vextract/insert{if}128 cannot use an EGPR in their memory operand, so all
> related patterns should be adjusted to disable EGPR usage for them.
> Also fix a wrong gpr16 attr for insertps.
>
> Bootstrapped/regtested on x86-64-pc-linux-gnu{-m32,}
>
> Ok for master?
Ok.
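A hedged sketch of the kind of code this protects against (the explicit
register binding is just a way to force an APX extended GPR into the address;
it is not taken from the patch, and would need something like -mapxf -mavx2):

  #include <immintrin.h>

  __m128i
  foo (__m256i *p)
  {
    register __m256i *q __asm__ ("r16") = p;  /* r16 is an APX EGPR.  */
    asm ("" : "+r" (q));
    /* vextracti128 only has a VEX encoding, so its memory operand must not
       use r16-r31; the gpr16 addr attribute and "jm" constraints enforce
       that when the insn is matched.  */
    return _mm256_extracti128_si256 (*q, 1);
  }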
>
> gcc/ChangeLog:
>
> * config/i386/sse.md (vec_extract_hi_): Add noavx512vl
> alternative with attr addr gpr16 and "jm" constraint.
> (vec_extract_hi_): Likewise for SF vector modes.
> (@vec_extract_hi_): Likewise.
> (*vec_extractv2ti): Likewise.
> (vec_set_hi_): Likewise.
> * config/i386/mmx.md (@sse4_1_insertps_): Correct gpr16 attr for
> each alternative.
> ---
>  gcc/config/i386/mmx.md |  2 +-
>  gcc/config/i386/sse.md | 32 
>  2 files changed, 21 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index a3d08bb9d3b..355538749d1 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -1215,7 +1215,7 @@ (define_insn "@sse4_1_insertps_"
>  }
>  }
>[(set_attr "isa" "noavx,noavx,avx")
> -   (set_attr "addr" "*,*,gpr16")
> +   (set_attr "addr" "gpr16,gpr16,*")
> (set_attr "type" "sselog")
> (set_attr "prefix_data16" "1,1,*")
> (set_attr "prefix_extra" "1")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index c502582102e..472c2190f89 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -12049,9 +12049,9 @@ (define_insn "vec_extract_hi__mask"
> (set_attr "mode" "")])
>
>  (define_insn "vec_extract_hi_"
> -  [(set (match_operand: 0 "nonimmediate_operand" "=vm")
> +  [(set (match_operand: 0 "nonimmediate_operand" "=xjm,vm")
> (vec_select:
> - (match_operand:VI8F_256 1 "register_operand" "v")
> + (match_operand:VI8F_256 1 "register_operand" "x,v")
>   (parallel [(const_int 2) (const_int 3)])))]
>"TARGET_AVX"
>  {
> @@ -12065,7 +12065,9 @@ (define_insn "vec_extract_hi_"
>else
>  return "vextract\t{$0x1, %1, %0|%0, %1, 0x1}";
>  }
> -  [(set_attr "type" "sselog1")
> +  [(set_attr "isa" "noavx512vl,avx512vl")
> +   (set_attr "addr" "gpr16,*")
> +   (set_attr "type" "sselog1")
> (set_attr "prefix_extra" "1")
> (set_attr "length_immediate" "1")
> (set_attr "prefix" "vex")
> @@ -12132,7 +12134,7 @@ (define_insn "vec_extract_hi__mask"
> (set_attr "mode" "")])
>
>  (define_insn "vec_extract_hi_"
> -  [(set (match_operand: 0 "nonimmediate_operand" "=xm, vm")
> +  [(set (match_operand: 0 "nonimmediate_operand" "=xjm, vm")
> (vec_select:
>   (match_operand:VI4F_256 1 "register_operand" "x, v")
>   (parallel [(const_int 4) (const_int 5)
> @@ -12141,7 +12143,8 @@ (define_insn "vec_extract_hi_"
>"@
>  vextract\t{$0x1, %1, %0|%0, %1, 0x1}
>  vextract32x4\t{$0x1, %1, %0|%0, %1, 0x1}"
> -  [(set_attr "isa" "*, avx512vl")
> +  [(set_attr "isa" "noavx512vl, avx512vl")
> +   (set_attr "addr" "gpr16,*")
> (set_attr "prefix" "vex, evex")
> (set_attr "type" "sselog1")
> (set_attr "length_immediate" "1")
> @@ -1,7 +12225,7 @@ (define_insn_and_split "@vec_extract_lo_"
>"operands[1] = gen_lowpart (mode, operands[1]);")
>
>  (define_insn "@vec_extract_hi_"
> -  [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm")
> +  [(set (match_operand: 0 "nonimmediate_operand" "=xjm,vm")
> (vec_select:
>   (match_operand:V16_256 1 "register_operand" "x,v")
>   (parallel [(const_int 8) (const_int 9)
> @@ -12236,7 +12239,8 @@ (define_insn "@vec_extract_hi_"
>[(set_attr "type" "sselog1")
> (set_attr "prefix_extra" "1")
> (set_attr "length_immediate" "1")
> -   (set_attr "isa" "*,avx512vl")
> +   (set_attr "isa" "noavx512vl,avx512vl")
> +   (set_attr "addr" "gpr16,*")
> (set_attr "prefix" "vex,evex")
> (set_attr "mode" "OI")])
>
> @@ -20465,7 +20469,7 @@ (define_split
>  })
>
>  (define_insn "*vec_extractv2ti"
> -  [(set (match_operand:TI 0 "nonimmediate_operand" "=xm,vm")
> +  [(set (match_operand:TI 0 "nonimmediate_operand" "=xjm,vm")
> (vec_select:TI
>   (match_operand:V2TI 1 "register_operand" "x,v")
>   (parallel
> @@ -20477,6 +20481,8 @@ (define_insn "*vec_extractv2ti"
>[(set_attr "type" "sselog")
> (set_attr "prefix_extra" "1")
> (set_attr "length_immediate" "1")
> +   (set_attr "isa" "noavx512vl,avx512vl")
> +   (set_attr "addr" "gpr16,*")
> (set_attr "prefix" "vex,evex")
> (set_attr "mode" "OI")])
>
> @@ -27556,12 +27562,12 @@ (define_insn "vec_set_lo_"
> (set_attr "mode" "")])
>
>  (define_insn "vec_set_hi_"
> -  [(set (match_operand:VI8F_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI8F_256 0 "register_operand" "=x,v")
> (vec_concat:VI8F_256
>   (vec_select:
> -   (match_operand:VI8F_256 1 "register_operand" "v")
> +   (match_operand:VI8F_256 

Re: [PATCH] x86: Make testcase apx-spill_to_egprs-1.c more robust

2023-11-14 Thread Hongtao Liu
On Tue, Nov 14, 2023 at 5:01 PM Lehua Ding  wrote:
>
> Hi,
>
> This little patch adjusts the assert in the apx-spill_to_egprs-1.c testcase.
> The -mapxf compilation option allows more registers to be used, which in
> turn eliminates the need for local variables to be stored in stack memory.
> Therefore, the assertion is changed to check that no memory is accessed
> through the %rsp register.
Ok, thanks.
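Illustratively (the register names are only an example, not from the test):
a value that previously spilled as

  movl  %eax, 4(%rsp)

can now live in one of the APX extended GPRs, e.g.

  movl  %eax, %r16d

so scanning for the absence of any "(%rsp)" access is more robust than
enumerating one expected move per r16-r31 register, which register allocation
is free to reshuffle.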
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/apx-spill_to_egprs-1.c: Make sure that no local
> variables are stored on the stack.
>
> ---
>  .../gcc.target/i386/apx-spill_to_egprs-1.c| 19 +++
>  1 file changed, 3 insertions(+), 16 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/i386/apx-spill_to_egprs-1.c 
> b/gcc/testsuite/gcc.target/i386/apx-spill_to_egprs-1.c
> index 290863d63a7..d7952b4c550 100644
> --- a/gcc/testsuite/gcc.target/i386/apx-spill_to_egprs-1.c
> +++ b/gcc/testsuite/gcc.target/i386/apx-spill_to_egprs-1.c
> @@ -3,22 +3,9 @@
>
>  #include "spill_to_mask-1.c"
>
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r16d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r17d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r18d" } } */
> -/* { dg-final { scan-assembler "movq\[ \t]+\[^\\n\\r\]*, %r19" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r20d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r21d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r22d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r23d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r24d" } } */
> -/* { dg-final { scan-assembler "addl\[ \t]+\[^\\n\\r\]*, %r25d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r26d" } } */
> -/* { dg-final { scan-assembler "movl\[ \t]+\[^\\n\\r\]*, %r27d" } } */
> -/* { dg-final { scan-assembler "movbel\[ \t]+\[^\\n\\r\]*, %r28d" } } */
> -/* { dg-final { scan-assembler "movbel\[ \t]+\[^\\n\\r\]*, %r29d" } } */
> -/* { dg-final { scan-assembler "movbel\[ \t]+\[^\\n\\r\]*, %r30d" } } */
> -/* { dg-final { scan-assembler "movbel\[ \t]+\[^\\n\\r\]*, %r31d" } } */
> +/* Make sure that no local variables are stored on the stack. */
> +/* { dg-final { scan-assembler-not "\\(%rsp\\)" } } */
> +
>  /* { dg-final { scan-assembler-not "knot" } } */
>  /* { dg-final { scan-assembler-not "kxor" } } */
>  /* { dg-final { scan-assembler-not "kor" } } */
> --
> 2.36.3
>


-- 
BR,
Hongtao


Re: [RFC] Intel AVX10.1 Compiler Design and Support

2023-11-13 Thread Hongtao Liu
On Mon, Nov 13, 2023 at 7:25 PM Richard Biener
 wrote:
>
> On Mon, Nov 13, 2023 at 7:58 AM Hongtao Liu  wrote:
> >
> > On Fri, Nov 10, 2023 at 6:15 PM Richard Biener
> >  wrote:
> > >
> > > On Fri, Nov 10, 2023 at 2:42 AM Haochen Jiang  
> > > wrote:
> > > >
> > > > Hi all,
> > > >
> > > > This RFC patch aims to add AVX10.1 options. After we added 
> > > > -m[no-]evex512
> > > > support, it makes a lot easier to add them comparing to the August 
> > > > version.
> > > > Detail for AVX10 is shown below:
> > > >
> > > > Intel Advanced Vector Extensions 10 (Intel AVX10) Architecture 
> > > > Specification
> > > > It describes the Intel Advanced Vector Extensions 10 Instruction Set
> > > > Architecture.
> > > > https://cdrdv2.intel.com/v1/dl/getContent/784267
> > > >
> > > > The Converged Vector ISA: Intel Advanced Vector Extensions 10 Technical 
> > > > Paper
> > > > It provides introductory information regarding the converged vector 
> > > > ISA: Intel
> > > > Advanced Vector Extensions 10.
> > > > https://cdrdv2.intel.com/v1/dl/getContent/784343
> > > >
> > > > Our proposal is to take AVX10.1-256 and AVX10.1-512 as two "virtual" 
> > > > ISAs in
> > > > the compiler. AVX10.1-512 will imply AVX10.1-256. They will not enable
> > > > anything at first. At the end of the option handling, we will check 
> > > > whether
> > > > the two bits are set. If AVX10.1-256 is set, we will set the AVX512 
> > > > related
> > > > ISA bits. AVX10.1-512 will further set EVEX512 ISA bit.
> > > >
> > > > It means that AVX10 options will be separated from the existing AVX512 
> > > > and the
> > > > newly added -m[no-]evex512 options. AVX10 and AVX512 options will 
> > > > control
> > > > (enable/disable/set vector size) the AVX512 features underneath 
> > > > independently.
> > > > If there’s potential overlap or conflict between AVX10 and AVX512 
> > > > options,
> > > > some rules are provided to define the behavior, which will be described 
> > > > below.
> > > >
> > > > avx10.1 option will be provided as an alias of avx10.1-256.
> > > >
> > > > In the future, the AVX10 options will imply like this:
> > > >
> > > > AVX10.1-256 < AVX10.1-512
> > > >      ^             ^
> > > >      |             |
> > > >
> > > > AVX10.2-256 < AVX10.2-512
> > > >      ^             ^
> > > >      |             |
> > > >
> > > > AVX10.3-256 < AVX10.3-512
> > > >      ^             ^
> > > >      |             |
> > > >
> > > > Each of them will have its own option to enable/disabled corresponding
> > > > features. The alias avx10.x will also be provided.
> > > >
> > > > As mentioned in August version RFC, since we lean towards the adoption 
> > > > of
> > > > AVX10 instead of AVX512 from now on, we don’t recommend users to 
> > > > combine the
> > > > AVX10 and legacy AVX512 options.
> > >
> > > I wonder whether adoption could be made easier by also providing a
> > > -mavx10[.0] level that removes some of the more obscure sub-ISA 
> > > requirements
> > > to cover more existing implementations (I'd not add -mavx10.0-512 here).
> > > I'd require only skylake-AVX512 features here, basically all non-KNL 
> > > AVX512
> > > CPUs should have a "virtual" AVX10 level that allows to use that feature 
> > > set,
> > We have -mno-evex512 to cover those cases, so what you want is like a
> > simple alias of "-march=skylake-avx512 -mno-evex512"?
>
> For the AVX512 enabled sub-isas of skylake-avx512 yes I guess.
>
> > > restricted to 256bits so future AVX10-256 implementations can handle it
> > > as well as all existing (and relevant, which excludes KNL) AVX512
> > > implementations.
> > >
> > > Otherwise AVX10 is really a hard sell (as AVX512 was originally).
> > It's a rebranding of the existing AVX512 to AVX10; AVX10.0 would just
> > complicate things further (considering we already have x86-64-v4, which
> > is different from skylake-avx512).
>
> Well, the cut-off for "AVX512" i

Re: [PATCH] Avoid generate vblendps with ymm16+

2023-11-13 Thread Hongtao Liu
On Mon, Nov 13, 2023 at 4:45 PM Jakub Jelinek  wrote:
>
> On Mon, Nov 13, 2023 at 02:27:35PM +0800, Hongtao Liu wrote:
> > > 1) if it isn't better to use separate alternative instead of
> > >x86_evex_reg_mentioned_p, like in the patch below
> > vblendps doesn't support gpr32, which is checked by x86_evex_reg_mentioned_p.
> > We need to use xjm for operands[1]. (I think we don't need to set the
> > attribute addr to gpr16 for alternative 0, since alternative 1 is
> > always available and recog will match alternative 1 when gpr32 is used.)
>
> Ok, so like this then?  I've incorporated the other two tests into the patch
> as well.
LGTM.
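In effect (the operands below are purely illustrative): with plain %ymm0/%ymm1
the first alternative still allows the short form

  vblendps  $240, %ymm1, %ymm0, %ymm0

while once %ymm16+ (or a gpr32 address register) is involved only the second
alternative matches and the EVEX-encodable

  vshufi32x4  $2, %ymm1, %ymm16, %ymm0

is emitted instead.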
>
> 2023-11-13  Jakub Jelinek  
> Hu, Lin1  
>
> PR target/112435
> * config/i386/sse.md (avx512vl_shuf_32x4_1,
> avx512dq_shuf_64x2_1): Add
> alternative with just x instead of v constraints and xjm instead of
> vm and use vblendps as optimization only with that alternative.
>
> * gcc.target/i386/avx512vl-pr112435-1.c: New test.
> * gcc.target/i386/avx512vl-pr112435-2.c: New test.
> * gcc.target/i386/avx512vl-pr112435-3.c: New test.
>
> --- gcc/config/i386/sse.md.jj   2023-11-11 08:52:20.377845673 +0100
> +++ gcc/config/i386/sse.md  2023-11-13 09:31:08.568935535 +0100
> @@ -19235,11 +19235,11 @@ (define_expand "avx512dq_shuf_  })
>
>  (define_insn "avx512dq_shuf_64x2_1"
> -  [(set (match_operand:VI8F_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI8F_256 0 "register_operand" "=x,v")
> (vec_select:VI8F_256
>   (vec_concat:
> -   (match_operand:VI8F_256 1 "register_operand" "v")
> -   (match_operand:VI8F_256 2 "nonimmediate_operand" "vm"))
> +   (match_operand:VI8F_256 1 "register_operand" "x,v")
> +   (match_operand:VI8F_256 2 "nonimmediate_operand" "xjm,vm"))
>   (parallel [(match_operand 3 "const_0_to_3_operand")
>  (match_operand 4 "const_0_to_3_operand")
>  (match_operand 5 "const_4_to_7_operand")
> @@ -19254,7 +19254,7 @@ (define_insn "avx512dq_shu
>mask = INTVAL (operands[3]) / 2;
>mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
>operands[3] = GEN_INT (mask);
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && ! && which_alternative == 0)
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>return "vshuf64x2\t{%3, %2, %1, 
> %0|%0, %1, %2, %3}";
>  }
> @@ -19386,11 +19386,11 @@ (define_expand "avx512vl_shuf_  })
>
>  (define_insn "avx512vl_shuf_32x4_1"
> -  [(set (match_operand:VI4F_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI4F_256 0 "register_operand" "=x,v")
> (vec_select:VI4F_256
>   (vec_concat:
> -   (match_operand:VI4F_256 1 "register_operand" "v")
> -   (match_operand:VI4F_256 2 "nonimmediate_operand" "vm"))
> +   (match_operand:VI4F_256 1 "register_operand" "x,v")
> +   (match_operand:VI4F_256 2 "nonimmediate_operand" "xjm,vm"))
>   (parallel [(match_operand 3 "const_0_to_7_operand")
>  (match_operand 4 "const_0_to_7_operand")
>  (match_operand 5 "const_0_to_7_operand")
> @@ -19414,7 +19414,7 @@ (define_insn "avx512vl_shuf_mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
>operands[3] = GEN_INT (mask);
>
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && ! && which_alternative == 0)
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>
>return "vshuf32x4\t{%3, %2, %1, 
> %0|%0, %1, %2, %3}";
> --- gcc/testsuite/gcc.target/i386/avx512vl-pr112435-1.c.jj  2023-11-13 
> 09:20:53.330643098 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-pr112435-1.c 2023-11-13 
> 09:20:53.330643098 +0100
> @@ -0,0 +1,13 @@
> +/* PR target/112435 */
> +/* { dg-do assemble { target { avx512vl && { ! ia32 } } } } */
> +/* { dg-options "-mavx512vl -O2" } */
> +
> +#include 
> +
> +__m256i
> +foo (__m256i a, __m256i b)
> +{
> +  register __m256i c __asm__("ymm16") = a;
> +  asm ("" : "+v" (c));
> +  return _mm256_shuffle_i32x4 (c, b, 2);
> +}
> --- gcc/testsuite/gcc.target/i386/avx512vl-pr112435-2.c.jj  2023-11-13 

Re: [V2 PATCH] Handle bitop with INTEGER_CST in analyze_and_compute_bitop_with_inv_effect.

2023-11-12 Thread Hongtao Liu
On Fri, Nov 10, 2023 at 5:12 PM Richard Biener
 wrote:
>
> On Wed, Nov 8, 2023 at 9:22 AM Hongtao Liu  wrote:
> >
> > On Wed, Nov 8, 2023 at 3:53 PM Richard Biener
> >  wrote:
> > >
> > > On Wed, Nov 8, 2023 at 2:18 AM Hongtao Liu  wrote:
> > > >
> > > > On Tue, Nov 7, 2023 at 10:34 PM Richard Biener
> > > >  wrote:
> > > > >
> > > > > On Tue, Nov 7, 2023 at 2:03 PM Hongtao Liu  wrote:
> > > > > >
> > > > > > On Tue, Nov 7, 2023 at 4:10 PM Richard Biener
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Tue, Nov 7, 2023 at 7:08 AM liuhongt  
> > > > > > > wrote:
> > > > > > > >
> > > > > > > > analyze_and_compute_bitop_with_inv_effect assumes the first 
> > > > > > > > operand is
> > > > > > > > loop invariant which is not the case when it's INTEGER_CST.
> > > > > > > >
> > > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > > > > Ok for trunk?
> > > > > > >
> > > > > > > So this addresses a missed optimization, right?  It seems to me 
> > > > > > > that
> > > > > > > even with two SSA names we are only "lucky" when rhs1 is the 
> > > > > > > invariant
> > > > > > > one.  So instead of swapping this way I'd do
> > > > > > Yes, it's a missed optimization.
> > > > > > And I think expr_invariant_in_loop_p (loop, match_op[1]) should be
> > > > > > enough; if match_op[1] is a loop invariant, it must be false for the
> > > > > > below conditions (there couldn't be any header_phi from its
> > > > > > definition).
> > > > >
> > > > > Yes, all I said is that when you now care for op1 being INTEGER_CST
> > > > > it could also be an invariant SSA name and thus only after swapping 
> > > > > op0/op1
> > > > > we could have a successful match, no?
> > > > Sorry, the commit message is a little bit misleading.
> > > > At first, I just wanted to handle the INTEGER_CST case (with TREE_CODE
> > > > (match_op[1]) == INTEGER_CST), but then I realized that this could
> > > > probably be extended to the normal SSA_NAME case as well, so I used
> > > > expr_invariant_in_loop_p, which should theoretically be able to handle
> > > > the SSA_NAME case as well.
> > > >
> > > > if (expr_invariant_in_loop_p (loop, match_op[1])) is true, w/o
> > > > swapping it must return NULL_TREE for below conditions.
> > > > if (expr_invariant_in_loop_p (loop, match_op[1])) is false, w/
> > > > swapping it must return NULL_TREE too.
> > > > So it can cover the both cases you mentioned, no need for a loop to
> > > > iterate 2 match_ops for all conditions.
> > >
> > > Sorry if it appears we're going in circles ;)
> > >
> > > > 3692  if (TREE_CODE (match_op[1]) != SSA_NAME
> > > > 3693  || !expr_invariant_in_loop_p (loop, match_op[0])
> > > > 3694  || !(header_phi = dyn_cast  (SSA_NAME_DEF_STMT 
> > > > (match_op[1])))
> > >
> > > but this only checks match_op[1] (an SSA name at this point) for being 
> > > defined
> > > by the header PHI.  What if expr_invariant_in_loop_p (loop, mach_op[1])
> > > and header_phi = dyn_cast  (SSA_NAME_DEF_STMT (match_op[0]))
> > > which I think can happen when both ops are SSA name?
> > The whole condition is like
> >
> > 3692  if (TREE_CODE (match_op[1]) != SSA_NAME
> > 3693  || !expr_invariant_in_loop_p (loop, match_op[0])
> > 3694  || !(header_phi = dyn_cast  (SSA_NAME_DEF_STMT 
> > (match_op[1])))
> > 3695  || gimple_bb (header_phi) != loop->header  - This would
> > be true if match_op[1] is SSA_NAME and expr_invariant_in_loop_p
>
> But it could be expr_invariant_in_loop_p (match_op[1]) and
> header_phi = dyn_cast  (SSA_NAME_DEF_STMT (match_op[0]))

> > > > > > > > +  if (expr_invariant_in_loop_p (loop, match_op[1]))
> > > > > > > > +std::swap (match_op[0], match_op[1]);
match_op[1] will be swapped to match_op[0], the case is also handled
by my patch [1](the v2 patch)
My point is the upper code already handles 2 SSA names, no need to
iterate with all conditions, expr_invariant_in_loop_p alone is 

Re: [PATCH] Simplify vector ((VCE?(a cmp b ? -1 : 0)) < 0) ? c : d to just VCE:((a cmp b) ? (VCE c) : (VCE d)).

2023-11-12 Thread Hongtao Liu
On Fri, Nov 10, 2023 at 2:14 PM liuhongt  wrote:
>
> While working on PR112443, I noticed some misoptimizations:
> after we fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend
> fails to combine it back to v{,p}blendv{b,ps,pd} since the pattern is
> too complicated, so I think maybe we should handle it at the gimple
> level.
>
> The dump is like
>
>   _1 = c_3(D) >= { 0, 0, 0, 0 };
>   _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
>   _7 = VIEW_CONVERT_EXPR(_2);
>   _8 = VIEW_CONVERT_EXPR(b_6(D));
>   _9 = VIEW_CONVERT_EXPR(a_5(D));
>   _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
>   _11 = VEC_COND_EXPR <_10, _8, _9>;
>
> It can be optimized to
>
>   _1 = c_2(D) >= { 0, 0, 0, 0 };
>   _6 = VEC_COND_EXPR <_1, b_5(D), a_4(D)>;
>
> since _7 is either -1 or 0, the selection of _7 < 0 ? _8 : _9 should
> be equal to _1 ? b : a as long as TYPE_PRECISION of the component type
> of the second VEC_COND_EXPR is less than or equal to that of the first one.
> The patch add a gimple pattern to handle that.

This is the updated patch according to Pinski's comments; I'll reply here.
> It looks like the outer vec_cond isn't actually relevant to the 
> simplification?
>

My original pattern was wrong, as Pinski mentioned; for the new pattern the
outer vec_cond is needed.
> Actually this is an invalid transformation. It is only valid for unsigned types.
> The reason why it is invalid is because the sign bit changes when
> going to a smaller type from a larger one.
> It would be valid for equals but no other type.

>  (lt (view_convert? (vec_cond (cmp @0 @1) integer_all_onesp
> integer_zerop)) integer_zerop)
>
> is the relevant part?  I wonder what canonicalizes the inner vec_cond?
>  Did you ever see
> the (view_convert ... missing?

typedef char v32qi __attribute__((vector_size(16)));

v32qi
foo (v32qi a, v32qi b, v32qi c)
{
v32qi d = ~c < 0 ?
__extension__(v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
: (v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
return d < 0 ? a : b;
}
Looks like ccp1 can handle the non-view_convert case, so I'll remove the "?"
from view_convert.
>
> gcc/ChangeLog:
>
> * match.pd (VCE:(a cmp b ? -1 : 0) < 0) ? c : d ---> VCE:((a
> cmp b) ? (VCE:c) : (VCE:d)): New gimple simplication.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx512vl-blendv-3.c: New test.
> * gcc.target/i386/blendv-3.c: New test.
> ---
>  gcc/match.pd  | 19 
>  .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
>  gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
>  3 files changed, 71 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index dbc811b2b38..4d823882a7c 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -5170,6 +5170,25 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
>(vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
>
> +(for cmp (simple_comparison)
> + (simplify
> +  (vec_cond
> +(lt (view_convert?@5 (vec_cond@6 (cmp@4 @0 @1)
> +integer_all_onesp
> +integer_zerop))
> + integer_zerop) @2 @3)
> +  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
> +   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
> +   && !TYPE_UNSIGNED (TREE_TYPE (@5))
> +   && VECTOR_TYPE_P (TREE_TYPE (@6))
> +   && VECTOR_TYPE_P (type)
> +   && (TYPE_PRECISION (TREE_TYPE (type))
> + <= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@6
> +   && TYPE_SIZE (type) == TYPE_SIZE (TREE_TYPE (@6)))
> +   (with { tree vtype = TREE_TYPE (@6);}
> + (view_convert:type
> +   (vec_cond @4 (view_convert:vtype @2) (view_convert:vtype @3)))
> +
>  /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
>  (simplify
>   (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c 
> b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> new file mode 100644
> index 000..2777e72ab5f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512vl -mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
> +/* { dg-final { scan-assembler-not {vpcmp} } } */
> +
> +#include "blendv-3.c"
> diff --git a/gcc/testsuite/gcc.target/i386/blendv-3.c 
> b/gcc/testsuite/gcc.target/i386/blendv-3.c
> new file mode 100644
> index 000..fa0fb067a73
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/blendv-3.c
> @@ -0,0 +1,46 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -O2" } */
> +/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
> +/* { dg-final { scan-assembler-not {vpcmp} 

Re: [RFC] Intel AVX10.1 Compiler Design and Support

2023-11-12 Thread Hongtao Liu
On Fri, Nov 10, 2023 at 6:15 PM Richard Biener
 wrote:
>
> On Fri, Nov 10, 2023 at 2:42 AM Haochen Jiang  wrote:
> >
> > Hi all,
> >
> > This RFC patch aims to add AVX10.1 options. After we added -m[no-]evex512
> > support, it makes a lot easier to add them comparing to the August version.
> > Detail for AVX10 is shown below:
> >
> > Intel Advanced Vector Extensions 10 (Intel AVX10) Architecture Specification
> > It describes the Intel Advanced Vector Extensions 10 Instruction Set
> > Architecture.
> > https://cdrdv2.intel.com/v1/dl/getContent/784267
> >
> > The Converged Vector ISA: Intel Advanced Vector Extensions 10 Technical 
> > Paper
> > It provides introductory information regarding the converged vector ISA: 
> > Intel
> > Advanced Vector Extensions 10.
> > https://cdrdv2.intel.com/v1/dl/getContent/784343
> >
> > Our proposal is to take AVX10.1-256 and AVX10.1-512 as two "virtual" ISAs in
> > the compiler. AVX10.1-512 will imply AVX10.1-256. They will not enable
> > anything at first. At the end of the option handling, we will check whether
> > the two bits are set. If AVX10.1-256 is set, we will set the AVX512 related
> > ISA bits. AVX10.1-512 will further set EVEX512 ISA bit.
> >
> > It means that AVX10 options will be separated from the existing AVX512 and 
> > the
> > newly added -m[no-]evex512 options. AVX10 and AVX512 options will control
> > (enable/disable/set vector size) the AVX512 features underneath 
> > independently.
> > If there’s potential overlap or conflict between AVX10 and AVX512 options,
> > some rules are provided to define the behavior, which will be described 
> > below.
> >
> > avx10.1 option will be provided as an alias of avx10.1-256.
> >
> > In the future, the AVX10 options will imply like this:
> >
> > AVX10.1-256 < AVX10.1-512
> >      ^             ^
> >      |             |
> >
> > AVX10.2-256 < AVX10.2-512
> >      ^             ^
> >      |             |
> >
> > AVX10.3-256 < AVX10.3-512
> >      ^             ^
> >      |             |
> >
> > Each of them will have its own option to enable/disabled corresponding
> > features. The alias avx10.x will also be provided.
> >
> > As mentioned in August version RFC, since we lean towards the adoption of
> > AVX10 instead of AVX512 from now on, we don’t recommend users to combine the
> > AVX10 and legacy AVX512 options.
>
> I wonder whether adoption could be made easier by also providing a
> -mavx10[.0] level that removes some of the more obscure sub-ISA requirements
> to cover more existing implementations (I'd not add -mavx10.0-512 here).
> I'd require only skylake-AVX512 features here, basically all non-KNL AVX512
> CPUs should have a "virtual" AVX10 level that allows to use that feature set,
We have -mno-evex512 to cover those cases, so what you want is like a
simple alias of "-march=skylake-avx512 -mno-evex512"?
> restricted to 256bits so future AVX10-256 implementations can handle it
> as well as all existing (and relevant, which excludes KNL) AVX512
> implementations.
>
> Otherwise AVX10 is really a hard sell (as AVX512 was originally).
It's a rebranding of the existing AVX512 to AVX10; AVX10.0 would just
complicate things further (considering we already have x86-64-v4, which
is different from skylake-avx512).
>
> > However, we would like to introduce some
> > simple rules for user when it comes to combination.
> >
> > 1. Enabling AVX10 and AVX512 at the same command line with different vector
> > size will lead to a warning message. The behavior of the compiler will be
> > enabling AVX10 with longer, i.e., 512 bit vector size.
> >
> > If the vector sizes are the same (e.g. -mavx10.1-256 -mavx512f -mno-evex512,
> > -mavx10.1-512 -mavx512f), it will be valid with the corresponding vector 
> > size.
> >
> > 2. -mno-avx10.1 option can’t disable any features enabled by AVX512 options 
> > or
> > impact the vector size, and vice versa. The compiler will emit warnings if
> > necessary.
> >
> > For the auto dispatch support including function multi versioning, function
> > attribute usage, the behavior will be identical to compiler options.
> >
> > If you have any questions, feel free to ask in this thread.
> >
> > Thx,
> > Haochen
> >
> >



-- 
BR,
Hongtao


Re: [PATCH] Avoid generate vblendps with ymm16+

2023-11-12 Thread Hongtao Liu
On Sat, Nov 11, 2023 at 4:11 AM Jakub Jelinek  wrote:
>
> On Thu, Nov 09, 2023 at 03:27:11PM +0800, Hongtao Liu wrote:
> > On Thu, Nov 9, 2023 at 3:15 PM Hu, Lin1  wrote:
> > >
> > > This patch aims to avoid generate vblendps with ymm16+, And have
> > > bootstrapped and tested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > > PR target/112435
> > > * config/i386/sse.md: Adding constraints to restrict the 
> > > generation of
> > > vblendps.
> > It should be "Don't output vblendps when evex sse reg or gpr32 is involved."
> > Others LGTM.
>
> I've missed this patch, so wrote my own today, and am wondering
>
> 1) if it isn't better to use separate alternative instead of
>x86_evex_reg_mentioned_p, like in the patch below
vblendps doesn't support gpr32, which is checked by x86_evex_reg_mentioned_p.
We need to use xjm for operands[1]. (I think we don't need to set the
attribute addr to gpr16 for alternative 0, since alternative 1 is
always available and recog will match alternative 1 when gpr32 is used.)

> 2) why do you need the last two hunks in sse.md, both avx2_permv2ti and
>*avx_vperm2f128_nozero insns only use x in constraints, never v,
>so x86_evex_reg_mentioned_p ought to be always false there
true.
>
> Here is the untested patch, of course you have more testcases (though, I
> think it is better to test dg-do assemble with avx512vl target rather than
> dg-do compile and scan the assembler, after all, the problem was that it
> didn't assemble).
>
> 2023-11-10  Jakub Jelinek  
>
> PR target/112435
> * config/i386/sse.md (avx512vl_shuf_32x4_1,
> avx512dq_shuf_64x2_1): Add
> alternative with just x instead of v constraints and use vblendps
> as optimization only with that alternative.
>
> * gcc.target/i386/avx512vl-pr112435.c: New test.
>
> --- gcc/config/i386/sse.md.jj   2023-11-09 09:04:18.616543403 +0100
> +++ gcc/config/i386/sse.md  2023-11-10 15:56:44.138499931 +0100
> @@ -19235,11 +19235,11 @@ (define_expand "avx512dq_shuf_  })
>
>  (define_insn "avx512dq_shuf_64x2_1"
> -  [(set (match_operand:VI8F_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI8F_256 0 "register_operand" "=x,v")
> (vec_select:VI8F_256
>   (vec_concat:
> -   (match_operand:VI8F_256 1 "register_operand" "v")
> -   (match_operand:VI8F_256 2 "nonimmediate_operand" "vm"))
> +   (match_operand:VI8F_256 1 "register_operand" "x,v")
> +   (match_operand:VI8F_256 2 "nonimmediate_operand" "xm,vm"))
>   (parallel [(match_operand 3 "const_0_to_3_operand")
>  (match_operand 4 "const_0_to_3_operand")
>  (match_operand 5 "const_4_to_7_operand")
> @@ -19254,7 +19254,7 @@ (define_insn "avx512dq_shu
>mask = INTVAL (operands[3]) / 2;
>mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
>operands[3] = GEN_INT (mask);
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && ! && which_alternative == 0)
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>return "vshuf64x2\t{%3, %2, %1, 
> %0|%0, %1, %2, %3}";
>  }
> @@ -19386,11 +19386,11 @@ (define_expand "avx512vl_shuf_  })
>
>  (define_insn "avx512vl_shuf_32x4_1"
> -  [(set (match_operand:VI4F_256 0 "register_operand" "=v")
> +  [(set (match_operand:VI4F_256 0 "register_operand" "=x,v")
> (vec_select:VI4F_256
>   (vec_concat:
> -   (match_operand:VI4F_256 1 "register_operand" "v")
> -   (match_operand:VI4F_256 2 "nonimmediate_operand" "vm"))
> +   (match_operand:VI4F_256 1 "register_operand" "x,v")
> +   (match_operand:VI4F_256 2 "nonimmediate_operand" "xm,vm"))
>   (parallel [(match_operand 3 "const_0_to_7_operand")
>  (match_operand 4 "const_0_to_7_operand")
>  (match_operand 5 "const_0_to_7_operand")
> @@ -19414,7 +19414,7 @@ (define_insn "avx512vl_shuf_mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
>operands[3] = GEN_INT (mask);
>
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && ! && which_alternative == 0)
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>
>return "vshuf32x4\t{%3, %2, %1, 
> %0|%0, %1, %2, %3}";
> --- gcc/testsuite/gcc.target/i386/avx512vl-pr112435.c.jj2023-11-10 
> 16:04:21.708046771 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-pr112435.c   2023-11-10 
> 16:03:51.053479094 +0100
> @@ -0,0 +1,13 @@
> +/* PR target/112435 */
> +/* { dg-do assemble { target { avx512vl && { ! ia32 } } } } */
> +/* { dg-options "-mavx512vl -O2" } */
> +
> +#include 
> +
> +__m256i
> +foo (__m256i a, __m256i b)
> +{
> +  register __m256i c __asm__("ymm16") = a;
> +  asm ("" : "+v" (c));
> +  return _mm256_shuffle_i32x4 (c, b, 2);
> +}
>
> Jakub
>


-- 
BR,
Hongtao


Re: [PATCH] Simplify vector ((VCE?(a cmp b ? -1 : 0)) < 0) ? c : d to just (VCE:a cmp VCE:b) ? c : d.

2023-11-09 Thread Hongtao Liu
On Fri, Nov 10, 2023 at 10:11 AM Andrew Pinski  wrote:
>
> On Thu, Nov 9, 2023 at 5:52 PM liuhongt  wrote:
> >
> > While working on PR112443, I noticed some missed optimizations: after we
> > fold _mm{,256}_blendv_epi8/pd/ps into gimple, the backend fails to combine
> > it back to v{,p}blendv{b,ps,pd} since the pattern is too complicated, so I
> > think maybe we should handle it at the gimple level.
> >
> > The dump is like
> >
> >   _1 = c_3(D) >= { 0, 0, 0, 0 };
> >   _2 = VEC_COND_EXPR <_1, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;
> >   _7 = VIEW_CONVERT_EXPR(_2);
> >   _8 = VIEW_CONVERT_EXPR(b_6(D));
> >   _9 = VIEW_CONVERT_EXPR(a_5(D));
> >   _10 = _7 < { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
> >   _11 = VEC_COND_EXPR <_10, _8, _9>;
> >
> >
> > It can be optimized to
> >
> >   _6 = VIEW_CONVERT_EXPR(b_4(D));
> >   _7 = VIEW_CONVERT_EXPR(a_3(D));
> >   _10 = VIEW_CONVERT_EXPR(c_1(D));
> >   _5 = _10 >= { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
> > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
> >   _8 = VEC_COND_EXPR <_5, _6, _7>;
> >   _9 = VIEW_CONVERT_EXPR<__m256i>(_8);
>
> Actually this is an invalid transformation. It is only valid for unsigned types.
> The reason why it is invalid is because the sign bit changes when
> going to a smaller type from a larger one.
> It would be valid for equals but no other type.
Yes, I think we should VIEW_CONVERT_EXPR the true/false data instead
of the comparison operand.
And it should only be valid when the component type of the second
VEC_COND_EXPR is smaller than that of the first VEC_COND_EXPR.
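
To make the sign issue concrete, here is a minimal standalone sketch (not
from the patch; it assumes little endian and uses plain scalar code): a
64-bit element can be non-negative even though some of its bytes, read as
signed chars, are negative, so testing the narrow elements of VCE(c)
directly is not the same as testing c itself.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int
main (void)
{
  /* One 64-bit lane of c: non-negative as int64_t, so "c >= 0" would
     select the same operand for all 8 byte positions of this lane.  */
  int64_t c = 0x00000000ffff0000LL;
  int8_t b[8];
  memcpy (b, &c, sizeof (c));
  /* Testing each byte of VCE(c) instead: on little endian b[2] and b[3]
     are (int8_t) 0xff, i.e. negative, so those byte positions would pick
     the other operand.  */
  for (int i = 0; i < 8; i++)
    printf ("byte %d: %s\n", i, b[i] >= 0 ? "non-negative" : "negative");
  return 0;
}
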
>
> Thanks,
> Andrew
>
> >
> > since _7 is either -1 or 0, _7 < 0 should be equal to _1 = c_3(D) > { 0, 0, 
> > 0, 0 };
> > The patch adds a gimple pattern to handle that.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * match.pd (VCE:(a cmp b ? -1 : 0) < 0) ? c : d ---> (VCE:a cmp
> > VCE:b) ? c : d): New gimple simplification.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/avx512vl-blendv-3.c: New test.
> > * gcc.target/i386/blendv-3.c: New test.
> > ---
> >  gcc/match.pd  | 17 +++
> >  .../gcc.target/i386/avx512vl-blendv-3.c   |  6 +++
> >  gcc/testsuite/gcc.target/i386/blendv-3.c  | 46 +++
> >  3 files changed, 69 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/blendv-3.c
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index dbc811b2b38..e6f9c4fa1fd 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -5170,6 +5170,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (if (optimize_vectors_before_lowering_p () && types_match (@0, @3))
> >(vec_cond (bit_and @0 (bit_not @3)) @2 @1)))
> >
> > +(for cmp (simple_comparison)
> > + (simplify
> > +  (vec_cond
> > +(lt@4 (view_convert?@5 (vec_cond (cmp @0 @1)
> > +integer_all_onesp
> > +integer_zerop))
> > + integer_zerop) @2 @3)
> > +  (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
> > +   && VECTOR_INTEGER_TYPE_P (TREE_TYPE (@5))
> > +   && TYPE_SIGN (TREE_TYPE (@0)) == TYPE_SIGN (TREE_TYPE (@5))
> > +   && VECTOR_TYPE_P (type))
> > +   (with {
> > +  tree itype = TREE_TYPE (@5);
> > +  tree vbtype = TREE_TYPE (@4);}
> > + (vec_cond (cmp:vbtype (view_convert:itype @0)
> > +  (view_convert:itype @1)) @2 @3)
> > +
> >  /* c1 ? c2 ? a : b : b  -->  (c1 & c2) ? a : b  */
> >  (simplify
> >   (vec_cond @0 (vec_cond:s @1 @2 @3) @3)
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c 
> > b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> > new file mode 100644
> > index 000..2777e72ab5f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-blendv-3.c
> > @@ -0,0 +1,6 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512vl -mavx512bw -O2" } */
> > +/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
> > +/* { dg-final { scan-assembler-not {vpcmp} } } */
> > +
> > +#include "blendv-3.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/blendv-3.c 
> > b/gcc/testsuite/gcc.target/i386/blendv-3.c
> > new file mode 100644
> > index 000..fa0fb067a73
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/blendv-3.c
> > @@ -0,0 +1,46 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx2 -O2" } */
> > +/* { dg-final { scan-assembler-times {vp?blendv(?:b|p[sd])[ \t]*} 6 } } */
> > +/* { dg-final { scan-assembler-not {vpcmp} } } */
> > +
> > +#include 
> > +
> > +__m256i
> > +foo (__m256i a, __m256i b, __m256i c)
> > +{
> > +  return _mm256_blendv_epi8 (a, b, ~c < 0);
> > +}
> > +
> > +__m256d
> > +foo1 (__m256d a, __m256d b, __m256i c)
> > +{
> > +  __m256i d = ~c < 0;
> > +  

Re: [PATCH] Avoid generate vblendps with ymm16+

2023-11-08 Thread Hongtao Liu
On Thu, Nov 9, 2023 at 3:15 PM Hu, Lin1  wrote:
>
> This patch aims to avoid generating vblendps with ymm16+. It has been
> bootstrapped and tested on x86_64-pc-linux-gnu{-m32,-m64}. Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/112435
> * config/i386/sse.md: Adding constraints to restrict the generation of
> vblendps.
It should be "Don't output vblendps when evex sse reg or gpr32 is involved."
Others LGTM.
>
> gcc/testsuite/ChangeLog:
>
> PR target/112435
> * gcc.target/i386/pr112435-1.c: New test.
> * gcc.target/i386/pr112435-2.c: Ditto.
> * gcc.target/i386/pr112435-3.c: Ditto.
> ---
>  gcc/config/i386/sse.md | 28 +---
>  gcc/testsuite/gcc.target/i386/pr112435-1.c | 14 
>  gcc/testsuite/gcc.target/i386/pr112435-2.c | 64 ++
>  gcc/testsuite/gcc.target/i386/pr112435-3.c | 79 ++
>  4 files changed, 175 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112435-3.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 33198756bb0..666f931c88d 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -19254,7 +19254,8 @@
>mask = INTVAL (operands[3]) / 2;
>mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
>operands[3] = GEN_INT (mask);
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && !
> +  && !x86_evex_reg_mentioned_p (operands, 3))
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>return "vshuf64x2\t{%3, %2, %1, 
> %0|%0, %1, %2, %3}";
>  }
> @@ -19414,7 +19415,8 @@
>mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
>operands[3] = GEN_INT (mask);
>
> -  if (INTVAL (operands[3]) == 2 && !)
> +  if (INTVAL (operands[3]) == 2 && !
> +  && !x86_evex_reg_mentioned_p (operands, 3))
>  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>
>return "vshuf32x4\t{%3, %2, %1, 
> %0|%0, %1, %2, %3}";
> @@ -26776,10 +26778,13 @@
> else
>   return "vmovaps\t{%2, %0|%0, %2}";
>}
> -if ((mask & 0xbb) == 18)
> -  return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> -if ((mask & 0xbb) == 48)
> -  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +if (!x86_evex_reg_mentioned_p (operands, 3))
> +  {
> +   if ((mask & 0xbb) == 18)
> + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> +   if ((mask & 0xbb) == 48)
> + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +  }
>  return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
>}
>[(set_attr "type" "sselog")
> @@ -27433,10 +27438,13 @@
> && avx_vperm2f128_parallel (operands[3], mode)"
>  {
>int mask = avx_vperm2f128_parallel (operands[3], mode) - 1;
> -  if ((mask & 0xbb) == 0x12)
> -return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> -  if ((mask & 0xbb) == 0x30)
> -return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +  if (!x86_evex_reg_mentioned_p (operands, 3))
> +{
> +  if ((mask & 0xbb) == 0x12)
> +   return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> +  if ((mask & 0xbb) == 0x30)
> +   return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +}
>if ((mask & 0xbb) == 0x20)
>  return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
>operands[3] = GEN_INT (mask);
> diff --git a/gcc/testsuite/gcc.target/i386/pr112435-1.c 
> b/gcc/testsuite/gcc.target/i386/pr112435-1.c
> new file mode 100644
> index 000..ff56523b4e1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr112435-1.c
> @@ -0,0 +1,14 @@
> +/* PR target/112435 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-not "vblendps" } } */
> +
> +#include
> +
> +__m256i
> +f(__m256i a, __m256i  b)
> +{
> +  register __m256i t __asm__("ymm17") = a;
> +  asm("":"+v"(t));
> +  return _mm256_shuffle_i32x4 (t, b, 2);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr112435-2.c 
> b/gcc/testsuite/gcc.target/i386/pr112435-2.c
> new file mode 100644
> index 000..27ba80b1e68
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr112435-2.c
> @@ -0,0 +1,64 @@
> +/* PR target/112435 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-not "vblendps.*ymm17\$" } } */
> +
> +#include
> +
> +/* Vpermi128/Vpermf128 */
> +__m256i
> +perm0 (__m256i a, __m256i b)
> +{
> +  register __m256i t __asm__("ymm17") = a;
> +  asm("":"+v"(t));
> +  return _mm256_permute2x128_si256 (t, b, 50);
> +}
> +
> +__m256i
> +perm1 (__m256i a, __m256i b)
> +{
> +  register __m256i t __asm__("ymm17") = a;
> +  asm("":"+v"(t));
> +  return _mm256_permute2x128_si256 (t, b, 18);
> +}
> +
> +__m256i
> +perm2 (__m256i a, 

Re: [V2 PATCH] Handle bitop with INTEGER_CST in analyze_and_compute_bitop_with_inv_effect.

2023-11-08 Thread Hongtao Liu
On Wed, Nov 8, 2023 at 3:53 PM Richard Biener
 wrote:
>
> On Wed, Nov 8, 2023 at 2:18 AM Hongtao Liu  wrote:
> >
> > On Tue, Nov 7, 2023 at 10:34 PM Richard Biener
> >  wrote:
> > >
> > > On Tue, Nov 7, 2023 at 2:03 PM Hongtao Liu  wrote:
> > > >
> > > > On Tue, Nov 7, 2023 at 4:10 PM Richard Biener
> > > >  wrote:
> > > > >
> > > > > On Tue, Nov 7, 2023 at 7:08 AM liuhongt  wrote:
> > > > > >
> > > > > > analyze_and_compute_bitop_with_inv_effect assumes the first operand 
> > > > > > is
> > > > > > loop invariant which is not the case when it's INTEGER_CST.
> > > > > >
> > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > > > > Ok for trunk?
> > > > >
> > > > > So this addresses a missed optimization, right?  It seems to me that
> > > > > even with two SSA names we are only "lucky" when rhs1 is the invariant
> > > > > one.  So instead of swapping this way I'd do
> > > > Yes, it's a missed optimization.
> > > > And I think expr_invariant_in_loop_p (loop, match_op[1]) should be
> > > > enough: if match_op[1] is a loop invariant, it must be false for the
> > > > below conditions (there couldn't be any header_phi from its
> > > > definition).
> > >
> > > Yes, all I said is that when you now care for op1 being INTEGER_CST
> > > it could also be an invariant SSA name and thus only after swapping 
> > > op0/op1
> > > we could have a successful match, no?
> > Sorry, the commit message is a little bit misleading.
> > At first, I just wanted to handle the INTEGER_CST case (with TREE_CODE
> > (match_op[1]) == INTEGER_CST), but then I realized that this could
> > probably be extended to the normal SSA_NAME case as well, so I used
> > expr_invariant_in_loop_p, which should theoretically be able to handle
> > the SSA_NAME case as well.
> >
> > if (expr_invariant_in_loop_p (loop, match_op[1])) is true, w/o
> > swapping it must return NULL_TREE for below conditions.
> > if (expr_invariant_in_loop_p (loop, match_op[1])) is false, w/
> > swapping it must return NULL_TREE too.
> > So it can cover both cases you mentioned; there is no need for a loop to
> > iterate over the 2 match_ops for all conditions.
>
> Sorry if it appears we're going in circles ;)
>
> > 3692  if (TREE_CODE (match_op[1]) != SSA_NAME
> > 3693  || !expr_invariant_in_loop_p (loop, match_op[0])
> > 3694  || !(header_phi = dyn_cast  (SSA_NAME_DEF_STMT 
> > (match_op[1])))
>
> but this only checks match_op[1] (an SSA name at this point) for being defined
> by the header PHI.  What if expr_invariant_in_loop_p (loop, match_op[1])
> and header_phi = dyn_cast  (SSA_NAME_DEF_STMT (match_op[0])),
> which I think can happen when both ops are SSA names?
The whole condition is like

3692  if (TREE_CODE (match_op[1]) != SSA_NAME
3693  || !expr_invariant_in_loop_p (loop, match_op[0])
3694  || !(header_phi = dyn_cast  (SSA_NAME_DEF_STMT (match_op[1])))
3695  || gimple_bb (header_phi) != loop->header  - this would
be true if match_op[1] is an SSA_NAME and expr_invariant_in_loop_p is true
3696  || gimple_phi_num_args (header_phi) != 2)

If expr_invariant_in_loop_p (loop, match_op[1]) is true and it's an SSA_NAME,
then according to the code in expr_invariant_in_loop_p, the def_bb of the gphi is
either NULL or does not belong to this loop; either case will make
gimple_bb (header_phi) != loop->header true.

1857  if (TREE_CODE (expr) == SSA_NAME)
1858{
1859  def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
1860  if (def_bb
1861  && flow_bb_inside_loop_p (loop, def_bb))  -- def_bb is
NULL or it doesn't belong to the loop
1862return false;
1863
1864  return true;
1865}
1866
1867  if (!EXPR_P (expr))

>
> The only canonicalization we have is that constant operands are put second so
> it would have been more natural to write the matching with the other operand
> order (but likely you'd have been unlucky for the existing testcases then).
>
> > 3695  || gimple_bb (header_phi) != loop->header
> > 3696  || gimple_phi_num_args (header_phi) != 2)
> > 3697return NULL_TREE;
> > 3698
> > 3699  if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != 
> > phidef)
> > 3700return NULL_TREE;
> >
> >
> > >
> > > > >
> > > > >  unsigned i;
> > > > >  for (i = 0; i < 2; ++i)
> > > > >if (TREE_CODE (match_op[i]) == SSA_NAME
> > > &

Re: [PATCH] [i386] APX: Fix ICE due to movti postreload splitter [PR112394]

2023-11-07 Thread Hongtao Liu
On Tue, Nov 7, 2023 at 3:33 PM Hongyu Wang  wrote:
>
> Hi,
>
> When APX EGPR is enabled, the TImode move pattern *movti_internal allows
> moves between gpr and sse reg using the constraint pair ("r","Yd"). Then a
> post-reload splitter transforms such a move to vec_extractv2di, but under
> -msse4.1 -mno-avx EGPR is not allowed for its enabled alternative, which
> caused an ICE because the insn does not match its constraints. To prevent
> such an ICE, we need to adjust the constraint corresponding to "Yd". Add a
> new "jc" constraint to disable EGPR under -mno-avx.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>
> OK for trunk?
LGTM.
>
> gcc/ChangeLog:
>
> PR target/112394
> * config/i386/constraints.md (jc): New constraint that prohibits
> EGPR on -mno-avx.
> * config/i386/i386.md (*movdi_internal): Change the r constraint
> corresponding to Yd to jc.
> (*movti_internal): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR target/112394
> * gcc.target/i386/pr112394.c: New test.
> ---
>  gcc/config/i386/constraints.md   |  3 +++
>  gcc/config/i386/i386.md  |  8 
>  gcc/testsuite/gcc.target/i386/pr112394.c | 24 
>  3 files changed, 31 insertions(+), 4 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr112394.c
>
> diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
> index f6275740eb2..74c2f0f2d32 100644
> --- a/gcc/config/i386/constraints.md
> +++ b/gcc/config/i386/constraints.md
> @@ -434,3 +434,6 @@ (define_address_constraint "jb"
>(and (match_operand 0 "vsib_address_operand")
> (not (and (match_test "TARGET_APX_EGPR")
>  (match_test "x86_extended_rex2reg_mentioned_p (op)")
> +
> +(define_register_constraint  "jc"
> + "TARGET_APX_EGPR && !TARGET_AVX ? GENERAL_GPR16 : GENERAL_REGS")
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index ecc74e9994e..ec39c2dd512 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2454,8 +2454,8 @@ (define_insn "*movoi_internal_avx"
> (set_attr "mode" "OI")])
>
>  (define_insn "*movti_internal"
> -  [(set (match_operand:TI 0 "nonimmediate_operand" "=!r ,o ,v,v ,v 
> ,m,?r,?Yd")
> -   (match_operand:TI 1 "general_operand"  "riFo,re,C,BC,vm,v,Yd,r"))]
> +  [(set (match_operand:TI 0 "nonimmediate_operand" "=!r ,o ,v,v ,v 
> ,m,?jc,?Yd")
> +   (match_operand:TI 1 "general_operand"  
> "riFo,re,C,BC,vm,v,Yd,jc"))]
>"(TARGET_64BIT
>  && !(MEM_P (operands[0]) && MEM_P (operands[1])))
> || (TARGET_SSE
> @@ -2537,9 +2537,9 @@ (define_split
>
>  (define_insn "*movdi_internal"
>[(set (match_operand:DI 0 "nonimmediate_operand"
> -"=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r,?*y,?Yv,?v,?v,m ,m,?r 
> ,?*Yd,?r,?v,?*y,?*x,*k,*k  ,*r,*m,*k")
> +"=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r,?*y,?Yv,?v,?v,m 
> ,m,?jc,?*Yd,?r,?v,?*y,?*x,*k,*k  ,*r,*m,*k")
> (match_operand:DI 1 "general_operand"
> -"riFo,riF,Z,rem,i,re,C ,*y,Bk ,*y,*y,r  ,C  ,?v,Bk,?v,v,*Yd,r   ,?v,r  
> ,*x ,*y ,*r,*kBk,*k,*k,CBC"))]
> +"riFo,riF,Z,rem,i,re,C ,*y,Bk ,*y,*y,r  ,C  ,?v,Bk,?v,v,*Yd,jc  ,?v,r  
> ,*x ,*y ,*r,*kBk,*k,*k,CBC"))]
>"!(MEM_P (operands[0]) && MEM_P (operands[1]))
> && ix86_hardreg_mov_ok (operands[0], operands[1])"
>  {
> diff --git a/gcc/testsuite/gcc.target/i386/pr112394.c 
> b/gcc/testsuite/gcc.target/i386/pr112394.c
> new file mode 100644
> index 000..c582f6ea6bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr112394.c
> @@ -0,0 +1,24 @@
> +/* PR target/112394 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-msse4.1 -m64 -O -mapxf" } */
> +
> +typedef int __attribute__((__vector_size__ (8))) A;
> +typedef int __attribute__((__vector_size__ (16))) B;
> +typedef char __attribute__((__vector_size__ (4))) C;
> +typedef char __attribute__((__vector_size__ (32))) D;
> +typedef _Complex __int128 CU;
> +typedef _Float16 __attribute__((__vector_size__ (8))) F;
> +D d;
> +B b;
> +CU gcu;
> +
> +int
> +foo (char c, int, int, int, int, CU cu, int x)
> +{
> +  d /= c | d;
> +  F f = __builtin_convertvector (b, F);
> +  cu /= gcu;
> +  A a = (A) f;
> +  int i = cu + x;
> +  return ((C) a[0])[1] + i + c;
> +}
> --
> 2.31.1
>


-- 
BR,
Hongtao

